00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "gn/gnFilter.h"
00013 #include "gn/gnFeature.h"
00014 #include "gn/gnGBKSource.h"
00015 #include "gn/gnSourceSpec.h"
00016 #include "gn/gnSourceHeader.h"
00017 #include "gn/gnSourceQualifier.h"
00018 #include "gn/gnLocation.h"
00019 #include "gn/gnStringTools.h"
00020 #include "gn/gnDebug.h"
00021 #include <string>
00022
00023 gnGBKSource::gnGBKSource()
00024 {
00025 m_openString = "";
00026 m_pFilter = gnFilter::fullDNASeqFilter();
00027 if(m_pFilter == NULL){
00028 DebugMsg("Error using static sequence filters.");
00029 }
00030 }
00031 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s)
00032 {
00033 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00034 for( ; iter != s.m_contigList.end(); ++iter )
00035 {
00036 m_contigList.push_back( (*iter)->Clone() );
00037 }
00038 }
00039 gnGBKSource::~gnGBKSource()
00040 {
00041 m_ifstream.close();
00042 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00043 for( ; iter != m_contigList.end(); ++iter )
00044 {
00045 gnFileContig* fg = *iter;
00046 *iter = 0;
00047 delete fg;
00048 }
00049 }
00050 boolean gnGBKSource::HasContig( const string& name ) const
00051 {
00052 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00053 {
00054 if( name == m_contigList[i]->GetName() )
00055 return true;
00056 }
00057 return false;
00058 }
00059 uint32 gnGBKSource::GetContigID( const string& name ) const
00060 {
00061 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00062 {
00063 if( name == m_contigList[i]->GetName() )
00064 return i;
00065 }
00066 return ALL_CONTIGS;
00067 }
00068 string gnGBKSource::GetContigName( const uint32 i ) const
00069 {
00070 if( i < m_contigList.size() )
00071 {
00072 return m_contigList[i]->GetName();
00073 }
00074 return "";
00075 }
00076 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const
00077 {
00078 if( i == ALL_CONTIGS)
00079 return m_spec->GetLength();
00080 if( i < m_contigList.size() )
00081 {
00082 return m_contigList[i]->GetSeqLength();
00083 }
00084 return GNSEQI_ERROR;
00085 }
00086
00087 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ){
00088 uint64 startPos = 0;
00089 uint64 readableBytes = 0;
00090 if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00091 {
00092 bufLen = 0;
00093 return false;
00094 }
00095
00096 if( contigI == ALL_CONTIGS )
00097 {
00098 uint32 curLen = 0;
00099 uint64 bytesRead = 0;
00100 while (curLen < bufLen)
00101 {
00102
00103 if(readableBytes <= 0)
00104 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00105 bufLen = curLen;
00106 return true;
00107 }
00108
00109 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00110 Array<gnSeqC> array_buf( readLen );
00111 gnSeqC* tmpBuf = array_buf.data;
00112
00113
00114 m_ifstream.read(tmpBuf, readLen);
00115 uint64 gc = m_ifstream.gcount();
00116 bytesRead += gc;
00117 readableBytes -= gc;
00118 for(uint32 i=0; i < gc; i++){
00119 if( m_pFilter->IsValid(tmpBuf[i]) ){
00120 buf[curLen] = tmpBuf[i];
00121 curLen++;
00122 }
00123 }
00124 if(m_ifstream.eof()){
00125 m_ifstream.clear();
00126 bufLen = curLen;
00127 return true;
00128 }
00129 }
00130 bufLen = curLen;
00131 }
00132 else if( contigI < m_contigList.size() )
00133 {
00134 uint32 curLen = 0;
00135
00136 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00137 bufLen = bufLen < contigSize ? bufLen : contigSize;
00138 while (curLen < bufLen)
00139 {
00140 uint64 readLen = bufLen - curLen;
00141 Array<gnSeqC> array_buf( readLen );
00142 gnSeqC* tmpBuf = array_buf.data;
00143
00144
00145 m_ifstream.read(tmpBuf, readLen);
00146 uint64 gc = m_ifstream.gcount();
00147
00148
00149 for(uint32 i=0; i < gc; i++){
00150 if( m_pFilter->IsValid(tmpBuf[i]) ){
00151 buf[curLen] = tmpBuf[i];
00152 curLen++;
00153 }
00154 }
00155 if(m_ifstream.eof()){
00156 m_ifstream.clear();
00157 bufLen = curLen;
00158 return true;
00159 }
00160 }
00161 bufLen = curLen;
00162 }
00163 return true;
00164
00165 }
00166
00167
00168
00169
00170 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00171 {
00172 if( contigI == ALL_CONTIGS )
00173 {
00174
00175 gnSeqI curIndex = 0;
00176 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00177 for( ; iter != m_contigList.end(); ++iter )
00178 {
00179 uint64 len = (*iter)->GetSeqLength();
00180 if( (curIndex + len) > start )
00181 break;
00182 curIndex += len;
00183 }
00184 if( iter == m_contigList.end() )
00185 return false;
00186
00187 gnSeqI startIndex = start - curIndex;
00188 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00189 }
00190 else if( contigI < m_contigList.size() )
00191 {
00192 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00193 }
00194 return false;
00195 }
00196
00197 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00198 {
00199 readableBytes = 0;
00200 uint32 curLen = 0;
00201
00202 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00203 m_ifstream.seekg( startPos, ios::beg );
00204 if( m_ifstream.eof() ){
00205 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00206 return false;
00207 }
00208 while( true )
00209 {
00210
00211
00212 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00213 if(tmpbufsize == 0){
00214 ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect.");
00215 return false;
00216 }
00217 uint64 startOffset = start;
00218 if(contig.HasRepeatSeqGap()){
00219 startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1;
00220 startPos+=startOffset;
00221 m_ifstream.seekg(startPos , ios::beg);
00222 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00223 return true;
00224 }
00225
00226
00227 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00228 Array<char> array_buf( tmpbufsize );
00229 char* tmpbuf = array_buf.data;
00230
00231 m_ifstream.read( tmpbuf, tmpbufsize );
00232 if( m_ifstream.eof() ){
00233 ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n");
00234 return false;
00235 }
00236 for( uint32 i=0; i < tmpbufsize; ++i ){
00237 if( m_pFilter->IsValid(tmpbuf[i]) ){
00238 if( curLen >= start ){
00239 startPos += i;
00240 m_ifstream.seekg( startPos, ios::beg );
00241 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00242 return true;
00243 }
00244 ++curLen;
00245 }
00246 }
00247 startPos += tmpbufsize;
00248 }
00249 return true;
00250 }
00251
00252 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){
00253
00254 string::size_type newline_loc = data.find_first_of('\n', 0);
00255 while(newline_loc != string::npos){
00256 if(data[newline_loc-1] == '\r')
00257 newline_loc--;
00258 string::size_type text_loc = newline_loc;
00259 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00260 text_loc++;
00261 if(text_loc+1 == data.length())
00262 break;
00263 }
00264 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00265 newline_loc = data.find_first_of('\n', 0);
00266 }
00267
00268 string output_string = "";
00269 for(uint32 charI = 0; charI < data.length();){
00270
00271 string::size_type base_loc = charI;
00272 string append_string;
00273 while(base_loc - charI <= width){
00274 string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00275 if(space_loc - charI < width)
00276 base_loc = space_loc;
00277 else if(base_loc == charI){
00278
00279 append_string = data.substr(charI, width);
00280 charI+=width;
00281 }else{
00282 append_string = data.substr(charI, base_loc - charI);
00283 charI = base_loc;
00284 }
00285 }
00286 output_string += string(offset, ' ') + append_string;
00287 if(charI + width < data.length())
00288 output_string += "\r\n";
00289 }
00290 data = output_string;
00291 }
00292
00293 void gnGBKSource::WriteHeader(gnMultiSpec* spec, const string& hdr, ofstream& m_ofstream){
00294 gnBaseHeader* gpbh = NULL;
00295 uint32 header_index = 0;
00296 do{
00297 gpbh = spec->GetHeader(hdr, header_index);
00298 if(gpbh != NULL)
00299 m_ofstream << gpbh->GetHeader();
00300 header_index++;
00301 }while(gpbh != NULL);
00302 }
00303
00304 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){
00305 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00306 if(!m_ofstream.is_open())
00307 return false;
00308
00309 string newline = "\r\n";
00310 gnGenomeSpec* spec = seq.GetSpec();
00311
00312
00313 if(spec->GetHeaderListLength() == 1){
00314 gnBaseHeader *gpbh = spec->GetHeader(0);
00315 string name = gpbh->GetHeaderName();
00316
00317 if(string::npos != name.find(".SEQ")){
00318 string header = gpbh->GetHeader();
00319 m_ofstream << header;
00320 }
00321 }
00322
00323 Array<gnSeqC> array_buf( BUFFER_SIZE );
00324 gnSeqC *bases = array_buf.data;
00325
00326 for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){
00327 gnFragmentSpec* subSpec = spec->GetSpec(specI);
00328
00329
00330 m_ofstream << "LOCUS ";
00331
00332 string contigName = subSpec->GetName();
00333 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH)
00334 contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH);
00335 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length();
00336 m_ofstream << contigName << string(filler_size, ' ');
00337
00338 string length_string = uintToString(subSpec->GetLength());
00339 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00340 m_ofstream << string(filler_size, ' ') << length_string << " bp ";
00341
00342 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' ');
00343 uint32 head_look_i = 0;
00344 gnBaseHeader* gpbh = subSpec->GetHeader("LOCUS", head_look_i);
00345 if(gpbh != NULL)
00346 dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH);
00347 m_ofstream << dnatype << string(2, ' ');
00348
00349 string circular = subSpec->IsCircular() ? string("circular ") : string(10, ' ');
00350 m_ofstream << circular;
00351
00352 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' ');
00353 if(gpbh != NULL)
00354 division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH);
00355 m_ofstream << division;
00356
00357 string date = string(SEQ_LOCUS_DATE_LENGTH, ' ');
00358 if(gpbh != NULL)
00359 date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH);
00360 m_ofstream << string(7, ' ') << date << "\r\n";
00361
00362
00363 WriteHeader(subSpec, "DEFINITION", m_ofstream);
00364 WriteHeader(subSpec, "ACCESSION", m_ofstream);
00365 WriteHeader(subSpec, "VERSION", m_ofstream);
00366 WriteHeader(subSpec, "KEYWORDS", m_ofstream);
00367 WriteHeader(subSpec, "SEGMENT", m_ofstream);
00368 WriteHeader(subSpec, "SOURCE", m_ofstream);
00369 WriteHeader(subSpec, "REFERENCE", m_ofstream);
00370 WriteHeader(subSpec, "COMMENT", m_ofstream);
00371
00372
00373 m_ofstream << "FEATURES Location/Qualifiers" << "\r\n";
00374 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){
00375
00376 gnBaseFeature *gpmf = subSpec->GetFeature(featureI);
00377 string featureName = gpmf->GetName();
00378 m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName;
00379 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' ');
00380
00381 uint32 location_count = gpmf->GetLocationListLength();
00382 uint32 line_pos = SEQ_FEATURE_LOC_OFFSET;
00383 uint32 parenthesis_count = 0;
00384 if(location_count > 1){
00385 m_ofstream << "join(";
00386 line_pos += 5;
00387 parenthesis_count++;
00388 }
00389 gnLocation::gnLocationType loc_type = gpmf->GetLocationType();
00390 switch(loc_type){
00391 case gnLocation::LT_Standard:
00392 break;
00393 case gnLocation::LT_Complement:
00394 m_ofstream << "complement(";
00395 line_pos += 11;
00396 parenthesis_count++;
00397 break;
00398 case gnLocation::LT_Order:
00399 m_ofstream << "order(";
00400 line_pos += 6;
00401 parenthesis_count++;
00402 break;
00403 case gnLocation::LT_Group:
00404 m_ofstream << "group(";
00405 parenthesis_count++;
00406 line_pos += 6;
00407 break;
00408 case gnLocation::LT_OneOf:
00409 m_ofstream << "one-of(";
00410 parenthesis_count++;
00411 line_pos += 7;
00412 break;
00413 default:
00414 break;
00415 }
00416
00417 string location;
00418 for(uint32 locationI = 0; locationI < location_count; locationI++){
00419 gnLocation gpl = gpmf->GetLocation(locationI);
00420 if(gpl.IsStartBoundLonger())
00421 location += ">";
00422 if(gpl.IsStartBoundShorter())
00423 location += "<";
00424 location += uintToString(gpl.GetStart());
00425 gnSeqI end_loc = gpl.GetEnd();
00426 if(end_loc != 0){
00427 switch(gpl.GetType()){
00428 case gnLocation::LT_BetweenBases:
00429 location += "^";
00430 break;
00431 case gnLocation::LT_OneOf:
00432 location += ".";
00433 break;
00434 default:
00435 location += "..";
00436 break;
00437 }
00438 if(gpl.IsEndBoundShorter())
00439 location += "<";
00440 if(gpl.IsEndBoundLonger())
00441 location += ">";
00442 location+= uintToString(end_loc);
00443 }
00444 if(locationI +1 < location_count)
00445 location += ",";
00446 else{
00447 for(;parenthesis_count > 0; parenthesis_count--)
00448 location += ")";
00449 }
00450
00451 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){
00452 m_ofstream << location;
00453 line_pos += location.length();
00454 }else{
00455 m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location;
00456 line_pos = SEQ_FEATURE_LOC_OFFSET + location.length();
00457 }
00458 location = "";
00459 }
00460 m_ofstream << "\r\n";
00461
00462
00463
00464 uint32 qualifier_count = gpmf->GetQualifierListLength();
00465 for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00466 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' ');
00467 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI);
00468 m_ofstream << "/" << qualifier->GetName() << "=";
00469
00470 string qually = string(qualifier->GetValue());
00471
00472
00473 m_ofstream << qually << "\r\n";
00474 }
00475 if(gpmf != NULL)
00476 delete gpmf;
00477 }
00478
00479
00480 gnSeqI readOffset = seq.contigStart(specI);
00481 gnSeqI readLength = seq.contigLength(specI);
00482
00483
00484 m_ofstream << "BASE COUNT ";
00485 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00486 gnSeqI countLen = readLength + readOffset;
00487 for(gnSeqI countI = readOffset; countI < countLen;){
00488 gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00489 if(!seq.ToArray(bases, writeLen, countI))
00490 return false;
00491 gnSeqI a, c, g, t, other;
00492 BaseCount(string(bases, writeLen), a, c, g, t, other);
00493 a_count += a;
00494 c_count += c;
00495 g_count += g;
00496 t_count += t;
00497 other_count += other;
00498 countI += writeLen;
00499 }
00500 m_ofstream << uintToString(a_count) << " a ";
00501 m_ofstream << uintToString(c_count) << " c ";
00502 m_ofstream << uintToString(g_count) << " g ";
00503 m_ofstream << uintToString(t_count) << " t ";
00504 m_ofstream << uintToString(other_count) << " others" << "\r\n";
00505
00506 string origin = "ORIGIN\r\n";
00507 head_look_i = 0;
00508 gpbh = subSpec->GetHeader("ORIGIN", head_look_i);
00509 if(gpbh != NULL)
00510 origin = gpbh->GetHeader();
00511 m_ofstream << origin;
00512
00513
00514 gnSeqI contig_bases = 0;
00515 while(readLength > 0){
00516 gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00517 boolean success = seq.ToArray(bases, writeLen, readOffset);
00518 if(!success)
00519 return false;
00520
00521 for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00522 string baseIndexStr = uintToString(contig_bases + curbaseI +1);
00523 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' ');
00524 m_ofstream << baseIndexStr;
00525 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00526 if(writeLen <= curbaseI + base_offset)
00527 break;
00528 int64 print_length = writeLen - (curbaseI + base_offset);
00529 print_length = print_length > 10 ? 10 : print_length;
00530 m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length);
00531 }
00532 m_ofstream << "\r\n";
00533 }
00534 readLength -= writeLen;
00535 readOffset += writeLen;
00536 contig_bases += writeLen;
00537 }
00538 m_ofstream << "//\r\n";
00539 }
00540
00541 m_ofstream.close();
00542 return true;
00543 }
00544
00545 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{
00546 if(m_contigList.size() > contigI)
00547 return m_contigList[contigI];
00548 return NULL;
00549 }
00550
00551
00552 boolean gnGBKSource::ParseStream( istream& fin )
00553 {
00554
00555 uint32 readState = 0;
00556 uint32 lineStart = 0;
00557
00558 uint32 sectionStart = 0;
00559 uint64 streamPos = 0;
00560 uint64 bufReadLen = 0;
00561 uint64 remainingBuffer = 0;
00562 Array<char> array_buf( BUFFER_SIZE );
00563 char* buf = array_buf.data;
00564 gnFragmentSpec* curFrag = 0;
00565 gnSourceSpec* curSpec = 0;
00566 gnSourceHeader *curHeader;
00567 gnFeature* curFeature;
00568 gnFileContig* curContig = 0;
00569 gnLocation::gnLocationType curBaseLocationType;
00570 gnSeqI curLocationStart;
00571 int32 curStartLength = 0;
00572 int32 curEndLength = 0;
00573 string curLocContig = "";
00574 string curQualifierName;
00575 uint64 curQualifierStart;
00576 string curContigName = "";
00577 gnSeqI seqLength = 0;
00578 gnSeqI seqChunk, seqChunkCount, gapChunk;
00579 boolean corruptWarning = false;
00580
00581
00582 DetermineNewlineType();
00583
00584 m_spec = new gnGenomeSpec();
00585 while( !fin.eof() )
00586 {
00587 if(sectionStart > 0){
00588 if(readState == 14)
00589 sectionStart = lineStart;
00590 remainingBuffer = bufReadLen - sectionStart;
00591 memmove(buf, buf+sectionStart, remainingBuffer);
00592 }
00593
00594 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00595 streamPos -= remainingBuffer;
00596 lineStart -= sectionStart;
00597 sectionStart = 0;
00598 bufReadLen = fin.gcount();
00599 bufReadLen += remainingBuffer;
00600
00601 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00602 {
00603 char ch = buf[i];
00604 switch( readState )
00605 {
00606 case 0:
00607
00608 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){
00609 if(curSpec == NULL){
00610 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00611 curFrag = new gnFragmentSpec();
00612 curFrag->AddSpec(curSpec);
00613 curSpec->SetSourceName(m_openString);
00614 m_spec->AddSpec(curFrag);
00615 }
00616 if(lineStart != sectionStart){
00617 uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00618 for(; j > 0; j--)
00619 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' '))
00620 break;
00621 string header_name = string(buf+sectionStart, j+1);
00622 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00623
00624 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00625 m_spec->AddHeader(curHeader);
00626 else
00627 curFrag->AddHeader(curHeader);
00628 sectionStart = lineStart;
00629 }
00630
00631 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00632 sectionStart = i + 1;
00633 readState = 1;
00634 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00635 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00636 curFrag->AddHeader(curHeader);
00637 curContig = new gnFileContig();
00638 curContig->SetName(curContigName);
00639 curContigName = "";
00640 readState = 13;
00641 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00642 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00643 curFrag->SetCircular(true);
00644 uint32 j = SEQ_LOCUS_NAME_LENGTH;
00645 for(; j >= 0; j--)
00646 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00647 break;
00648 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00649 curFrag->SetName(curContigName);
00650 }
00651 }
00652 if(ch == '\n'){
00653 lineStart = i + 1;
00654 }
00655 break;
00656 case 1:
00657 if((ch == ' ')||(ch == ' ')){
00658 break;
00659 }else if(ch == '\n'){
00660 lineStart = i + 1;
00661 sectionStart = i + 1;
00662 break;
00663 }else if(sectionStart == i){
00664 i--;
00665 readState = 0;
00666 sectionStart = i + 1;
00667 break;
00668 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){
00669 sectionStart = i;
00670 readState = 2;
00671 }
00672 case 2:
00673 if((ch == ' ')||(ch == ' ')){
00674 string featureName(buf+sectionStart, i - sectionStart);
00675 curFeature = new gnFeature(featureName);
00676 curFrag->AddFeature(curFeature);
00677 sectionStart = i + 1;
00678 readState = 3;
00679 }
00680 break;
00681 case 3:
00682 if((ch == ' ')||(ch == ' ')){
00683 break;
00684 }else if((ch == '\r')||(ch == '\n')){
00685 lineStart = i+1;
00686 break;
00687 }
00688 sectionStart = i;
00689 readState = 4;
00690
00691
00692
00693 case 4:
00694 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00695 string starter(buf+sectionStart, i - sectionStart);
00696 if(ch == '('){
00697 if(starter == "complement")
00698 curFeature->SetLocationType(gnLocation::LT_Complement);
00699 else if(starter == "order")
00700 curFeature->SetLocationType(gnLocation::LT_Order);
00701 else if(starter == "group")
00702 curFeature->SetLocationType(gnLocation::LT_Group);
00703 else if(starter == "one-of")
00704 curFeature->SetLocationType(gnLocation::LT_OneOf);
00705 sectionStart = i + 1;
00706 break;
00707 }else if(ch == ':'){
00708 curLocContig = starter;
00709 sectionStart = i + 1;
00710 break;
00711 }
00712 curLocationStart = atoi(starter.c_str());
00713 readState = 6;
00714 if(ch == '.'){
00715
00716 readState = 5;
00717 sectionStart = i + 1;
00718 break;
00719 }else if(ch == '^'){
00720 curBaseLocationType = gnLocation::LT_BetweenBases;
00721 }else if((ch == ' ')||(ch == ' ')){
00722
00723 gnLocation curLocation(curLocationStart, curLocationStart);
00724 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00725 readState = 7;
00726 }
00727 sectionStart = i + 1;
00728
00729 }else if(ch == '<'){
00730 curStartLength = -1;
00731 sectionStart = i + 1;
00732 }else if(ch == '>'){
00733 curStartLength = 1;
00734 sectionStart = i + 1;
00735 }
00736 break;
00737 case 5:
00738 if(ch == '.'){
00739 curBaseLocationType = gnLocation::LT_Standard;
00740 readState = 6;
00741 sectionStart = i + 1;
00742 break;
00743 }
00744 curBaseLocationType = gnLocation::LT_OneOf;
00745 case 6:
00746 if(ch == '>'){
00747 curEndLength = 1;
00748 sectionStart = i + 1;
00749 }else if(ch == '<'){
00750 curEndLength = -1;
00751 sectionStart = i + 1;
00752 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){
00753
00754 string ender(buf+sectionStart, i - sectionStart);
00755 gnSeqI curLocationEnd = atoi(ender.c_str());
00756 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00757 curEndLength = 0;
00758 curStartLength = 0;
00759 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00760 readState = ch == ',' ? 3 : 7;
00761 sectionStart = i+1;
00762 }
00763 break;
00764 case 7:
00765 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){
00766 sectionStart = i;
00767 readState = 0;
00768 i--;
00769 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){
00770 sectionStart = i;
00771 readState = 2;
00772 i--;
00773 }else if(ch == ','){
00774 sectionStart = i+1;
00775 readState = 3;
00776 }else if(ch == '/'){
00777 sectionStart = i+1;
00778 readState = 8;
00779 }else if(ch == '\n')
00780 lineStart = i + 1;
00781 break;
00782 case 8:
00783 if(ch == '='){
00784 curQualifierName = string(buf+sectionStart, i - sectionStart);
00785 readState = 9;
00786 sectionStart = i+1;
00787 }
00788 break;
00789 case 9:
00790 if(ch == '"'){
00791 readState = 10;
00792 sectionStart = i;
00793 curQualifierStart = i + streamPos;
00794 }else if(ch == '['){
00795 readState = 11;
00796 sectionStart = i;
00797 }else if((ch == '\r')||(ch == '\n')){
00798 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00799 sectionStart = i+1;
00800 readState = 7;
00801 }
00802 break;
00803 case 10:
00804 if(ch == '"')
00805 readState = 11;
00806 if(ch == '\n'){
00807 lineStart = i + 1;
00808 }
00809 break;
00810 case 11:
00811 if(ch != '"'){
00812 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00813 sectionStart = i+1;
00814 readState = 7;
00815 if(ch == '\n')
00816 lineStart = i + 1;
00817 }else
00818 readState = 10;
00819 break;
00820 case 12:
00821 if(ch == ']'){
00822 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00823 sectionStart = i+1;
00824 readState = 7;
00825 }
00826 break;
00827 case 13:
00828 curContig->SetSectStart(gnContigSequence, i - 1 + streamPos);
00829 curContig->SetRepeatSeqGap(true);
00830 seqChunk = 0;
00831 seqChunkCount = 0;
00832 gapChunk = m_newlineSize + 1;
00833 readState = 14;
00834 break;
00835 case 14:
00836 while(i < bufReadLen){
00837 ch = buf[i];
00838 if((ch == '/')&&(i==lineStart)){
00839 readState = 15;
00840 break;
00841 }else if(m_pFilter->IsValid(ch)){
00842 if(gapChunk > 0){
00843 if((gapChunk > 1 && seqChunkCount > 0) ||
00844 (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){
00845 if( !corruptWarning ){
00846 ErrorMsg("File is corrupt. Proceed with caution.");
00847 corruptWarning = true;
00848 }
00849 curContig->SetRepeatSeqGap(false);
00850 }
00851 gapChunk = 0;
00852 }
00853 seqChunk++;
00854 seqLength++;
00855 }else{
00856 gapChunk++;
00857 if(seqChunk == 10){
00858 seqChunk = 0;
00859 seqChunkCount++;
00860 if(seqChunkCount == 6){
00861
00862 seqChunkCount = 0;
00863 }
00864 }
00865 if(ch == '\n')
00866 lineStart = i + 1;
00867 }
00868 i++;
00869 }
00870 break;
00871 case 15:
00872 if((ch == '\n')&&(buf[lineStart+1] == '/')){
00873 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos);
00874 curContig->SetSeqLength(seqLength);
00875 m_contigList.push_back(curContig);
00876 curContig = 0;
00877 curSpec->SetLength(seqLength);
00878 curSpec = 0;
00879 seqLength = 0;
00880 lineStart = i + 1;
00881 sectionStart = i + 1;
00882 readState = 0;
00883 }
00884 break;
00885 }
00886 }
00887 streamPos += bufReadLen;
00888 }
00889 if(curContig != 0){
00890 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00891 curContig->SetSeqLength(seqLength);
00892 m_contigList.push_back(curContig);
00893 curSpec->SetLength(seqLength);
00894 }
00895 if(curSpec != 0)
00896 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00897 &&(curSpec->GetLength() == 0)){
00898 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00899 delete curFrag;
00900 }
00901 m_ifstream.clear();
00902 return true;
00903 }