00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00013
00014 #include "gn/gnSourceSpec.h"
00015 #include "gn/gnStringSpec.h"
00016 #include "gn/gnSourceFactory.h"
00017 #include "gn/gnFASSource.h"
00018 #include "gn/gnGBKSource.h"
00019 #include "gn/gnBaseHeader.h"
00020 #include "gn/gnFilter.h"
00021 #include "gn/gnDebug.h"
00022 #include "gn/gnStringTools.h"
00023 #include <string>
00024
00025 gnDNXSource::gnDNXSource()
00026 {
00027 m_DNXSpec = new gnGenomeSpec();
00028 m_pFilter = gnFilter::fullDNASeqFilter();
00029 if(m_pFilter == NULL){
00030 DebugMsg("Error using static sequence filters.");
00031 }
00032 }
00033
00034 gnDNXSource::gnDNXSource( const gnDNXSource& s ) : gnFileSource(s)
00035 {
00036 if(s.m_DNXSpec != NULL)
00037 m_DNXSpec = s.m_DNXSpec->Clone();
00038 }
00039
00040 gnDNXSource::~gnDNXSource()
00041 {
00042 m_ifstream.close();
00043 delete m_DNXSpec;
00044 }
00045 boolean gnDNXSource::HasContig( const string& name ) const
00046 {
00047 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00048 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00049 return true;
00050 }
00051 return false;
00052 }
00053 uint32 gnDNXSource::GetContigID( const string& name ) const
00054 {
00055 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00056 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00057 return contigI;
00058 }
00059 return ALL_CONTIGS;
00060 }
00061 string gnDNXSource::GetContigName( const uint32 i ) const
00062 {
00063 if(i < m_DNXSpec->GetSpecListLength()){
00064 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00065 return gnbs->GetName();
00066 }
00067 return "";
00068 }
00069 gnSeqI gnDNXSource::GetContigSeqLength( const uint32 i ) const
00070 {
00071 if( i == ALL_CONTIGS){
00072 return m_DNXSpec->GetLength();
00073 }else if(i < m_DNXSpec->GetSpecListLength()){
00074 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00075 return gnbs->GetLength();
00076 }
00077 return 0;
00078 }
00079
00080
00081 void gnDNXSource::ValidateName(string& name){
00082 if(name == ""){
00083 name.resize(4);
00084 srand(time(NULL));
00085 for(int i=0; i < 4; i++)
00086 name[i] = (rand() % 26) + 64;
00087 }
00088 }
00089
00090 boolean gnDNXSource::Write(gnGenomeSpec* spec, const string& filename){
00091 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00092 gnSourceFactory* m_sSourceFactory = gnSourceFactory::GetSourceFactory();
00093 if(!m_ofstream.is_open())
00094 return false;
00095 for(uint32 i=0; i < spec->GetSpecListLength(); i++){
00096 gnFragmentSpec* curStatementSpec = spec->GetSpec(i);
00097 string sourceName = spec->GetSourceName();
00098 string statementName = spec->GetName();
00099 if(!m_sSourceFactory->HasSource(sourceName)){
00100 ValidateName(statementName);
00101 statementName += ".seq";
00102 m_ofstream << statementName << "=";
00103 }else
00104 m_ofstream << sourceName << "=";
00105 for(uint32 j=0; j < curStatementSpec->GetSpecListLength(); j++){
00106
00107 gnContigSpec* curSubSpec = curStatementSpec->GetSpec(i);
00108 sourceName = curStatementSpec->GetSourceName();
00109 string contigName = curStatementSpec->GetName();
00110 if(!m_sSourceFactory->HasSource(sourceName)){
00111 ValidateName(contigName);
00112 string writename = contigName+".seq";
00113 gnSequence gns = *curSubSpec;
00114 gnGBKSource::Write(gns, writename);
00115 m_ofstream << writename;
00116 }else
00117 m_ofstream << sourceName;
00118 if( j + 1 < curStatementSpec->GetSpecListLength())
00119 m_ofstream << "+";
00120 }
00121 m_ofstream << ";";
00122 gnBaseHeader *gpbh = spec->GetHeader(0);
00123 string header = "";
00124 if(gpbh != NULL){
00125 header = gpbh->GetHeader();
00126
00127 uint32 newlinepos = header.find_first_of('\n', 0);
00128 if(newlinepos != string::npos)
00129 header = header.substr(0, newlinepos - 1);
00130 }
00131 m_ofstream << header << "\r\n";
00132 }
00133 m_ofstream.close();
00134 return true;
00135 }
00136
00137 gnFileContig* gnDNXSource::GetFileContig( const uint32 contigI ) const{
00138 return NULL;
00139 }
00140
00141
00142 boolean gnDNXSource::ParseStream( istream& fin )
00143 {
00144
00145 uint32 readState = 0;
00146 uint32 sectionStart = 0;
00147 gnFragmentSpec* currentFragSpec = 0;
00148 gnBaseSource *currentSource;
00149 string currentSourceName;
00150 uint32 currentContig = ALL_CONTIGS;
00151 uint32 currentSeqStart = 0;
00152 boolean currentRevComp = false;
00153
00154 uint64 bufReadLen = 0;
00155 uint64 remainingBuffer = 0;
00156 char* buf = new char[BUFFER_SIZE];
00157 string curliteral;
00158
00159
00160 gnSourceFactory *sourceFactory = gnSourceFactory::GetSourceFactory();
00161 sourceFactory->AddPath(getPathString(m_openString));
00162
00163 while( !fin.eof() )
00164 {
00165 if(sectionStart > 0){
00166 remainingBuffer = bufReadLen - sectionStart;
00167 if(readState == 5){
00168 curliteral += string(buf, sectionStart, remainingBuffer);
00169 remainingBuffer = 0;
00170 sectionStart = bufReadLen;
00171 }else
00172 memmove(buf, buf+sectionStart, remainingBuffer);
00173 }
00174
00175 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart));
00176 sectionStart = 0;
00177 bufReadLen = fin.gcount() + remainingBuffer;
00178
00179 for( uint32 i=0 ; i < bufReadLen ; i++ )
00180 {
00181 char ch = buf[i];
00182 switch( readState )
00183 {
00184 case 0:
00185 if(ch == '='){
00186
00187 string contigName(buf+sectionStart, i - sectionStart);
00188 currentFragSpec = new gnFragmentSpec();
00189 currentFragSpec->SetName(contigName);
00190 currentFragSpec->SetSourceName(m_openString);
00191 m_DNXSpec->AddSpec(currentFragSpec);
00192 sectionStart = i+1;
00193 readState = 1;
00194 }
00195 break;
00196 case 1:
00197 if((ch == ' ')||(ch == ' '))
00198 break;
00199 case 2:
00200 if(ch == '"'){
00201 readState = 5;
00202 sectionStart = i+1;
00203 break;
00204 }
00205 readState = 3;
00206 sectionStart = i;
00207 case 3:
00208
00209 if(ch == '\n' && sectionStart == i -1){
00210 if(buf[sectionStart]=='\r'){
00211 sectionStart = i + 1;
00212 break;
00213 }
00214 }
00215 if((ch == '+')||(ch == '>')||(ch == '(')||(ch == '\n')||(ch == ';')){
00216
00217 string seqfile(buf, sectionStart, i - sectionStart);
00218 currentSourceName = seqfile;
00219 currentSource = sourceFactory->AddSource(seqfile, true);
00220 if (currentSource==NULL)
00221 {
00222 delete[] buf;
00223 return false;
00224 }
00225 if((ch == '+')||(ch == '\n')||(ch == ';')){
00226 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource);
00227 tmp_spec->SetSourceName(seqfile);
00228 currentFragSpec->AddSpec(tmp_spec);
00229 readState = 1;
00230 if(ch == '\n'){
00231 readState = 0;
00232 }else if(ch == ';'){
00233 readState = 9;
00234 }
00235 }else if(ch == '>'){
00236 readState = 4;
00237 }else if(ch == '('){
00238 readState = 6;
00239 }
00240 sectionStart = i + 1;
00241 }
00242 break;
00243 case 4:
00244
00245 if((ch == '+')||(ch == '\n')||(ch == ';')||(ch == '(')){
00246
00247 string contigname(buf, sectionStart, i - sectionStart);
00248 currentContig = currentSource->GetContigID(contigname);
00249 if((ch == '+')||(ch == '\n')||(ch == ';')){
00250 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig);
00251 tmp_spec->SetSourceName(currentSourceName);
00252 currentFragSpec->AddSpec(tmp_spec);
00253 readState = 1;
00254 if(ch == '\n'){
00255 readState = 0;
00256 }else if(ch == ';'){
00257 readState = 9;
00258 }
00259 }else if(ch == '('){
00260 readState = 6;
00261 }
00262 sectionStart = i + 1;
00263 }
00264 break;
00265 case 5:
00266
00267 if(ch == '"'){
00268
00269 string literal(buf, sectionStart, i - sectionStart);
00270 if(curliteral.length() > 0){
00271 literal += curliteral;
00272 curliteral = "";
00273 }
00274 gnStringSpec *gpss = new gnStringSpec(literal, currentFragSpec->GetSpecListLength());
00275 currentFragSpec->AddSpec(gpss);
00276 }
00277 case 6:
00278
00279 if((ch == ',') || (ch == '<') || (ch == '>')){
00280 string seqstartstring(buf, sectionStart, i - sectionStart);
00281 if(seqstartstring == "lend"){
00282 currentSeqStart = 0;
00283 }else if (seqstartstring == "rend"){
00284 currentSeqStart = GNSEQI_END;
00285 }else
00286 currentSeqStart = atoi(seqstartstring.c_str()) - 1;
00287 if(ch == '<')
00288 currentRevComp = true;
00289 sectionStart = i + 1;
00290 readState = 7;
00291 }
00292 break;
00293 case 7:
00294
00295 if(ch == ')'){
00296 string seqendstring(buf, sectionStart, i - sectionStart);
00297 uint32 currentSeqEnd = GNSEQI_END;
00298 if(seqendstring == "lend"){
00299 currentSeqEnd = 0;
00300 }else if (seqendstring == "rend"){
00301 currentSeqEnd = GNSEQI_END;
00302 }else
00303 currentSeqEnd = atoi(seqendstring.c_str()) - 1;
00304 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp);
00305 tmp_spec->SetSourceName(currentSourceName);
00306 currentFragSpec->AddSpec(tmp_spec);
00307 currentRevComp = false;
00308 sectionStart = i + 1;
00309 readState = 8;
00310 }
00311 break;
00312 case 8:
00313 if(ch == '+'){
00314 sectionStart = i + 1;
00315 readState = 1;
00316 }
00317 if(ch == '\n'){
00318 sectionStart = i + 1;
00319 readState = 0;
00320 }
00321 if(ch == ';'){
00322 sectionStart = i + 1;
00323 readState = 9;
00324 }
00325 break;
00326 case 9:
00327 if(ch == '\n'){
00328 sectionStart = i + 1;
00329 readState = 0;
00330 }
00331 break;
00332 default:
00333 DebugMsg("ERROR in file\n");
00334 return false;
00335 break;
00336 }
00337 }
00338 }
00339
00340 delete[] buf;
00341 return true;
00342 }