00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "gn/gnFilter.h"
00013 #include "gn/gnFeature.h"
00014 #include "gn/gnGBKSource.h"
00015 #include "gn/gnSourceSpec.h"
00016 #include "gn/gnSourceHeader.h"
00017 #include "gn/gnSourceQualifier.h"
00018 #include "gn/gnLocation.h"
00019 #include "gn/gnStringTools.h"
00020 #include "gn/gnDebug.h"
00021 #include "gn/gnStringQualifier.h"
00022 #include <string>
00023
00024 gnGBKSource::gnGBKSource()
00025 {
00026 m_openString = "";
00027 m_pFilter = gnFilter::fullDNASeqFilter();
00028 if(m_pFilter == NULL){
00029 DebugMsg("Error using static sequence filters.");
00030 }
00031 }
00032 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s)
00033 {
00034 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00035 for( ; iter != s.m_contigList.end(); ++iter )
00036 {
00037 m_contigList.push_back( (*iter)->Clone() );
00038 }
00039 }
00040 gnGBKSource::~gnGBKSource()
00041 {
00042 m_ifstream.close();
00043 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00044 for( ; iter != m_contigList.end(); ++iter )
00045 {
00046 gnFileContig* fg = *iter;
00047 *iter = 0;
00048 delete fg;
00049 }
00050 }
00051 boolean gnGBKSource::HasContig( const string& name ) const
00052 {
00053 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00054 {
00055 if( name == m_contigList[i]->GetName() )
00056 return true;
00057 }
00058 return false;
00059 }
00060 uint32 gnGBKSource::GetContigID( const string& name ) const
00061 {
00062 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00063 {
00064 if( name == m_contigList[i]->GetName() )
00065 return i;
00066 }
00067 return ALL_CONTIGS;
00068 }
00069 string gnGBKSource::GetContigName( const uint32 i ) const
00070 {
00071 if( i < m_contigList.size() )
00072 {
00073 return m_contigList[i]->GetName();
00074 }
00075 return "";
00076 }
00077 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const
00078 {
00079 if( i == ALL_CONTIGS)
00080 return m_spec->GetLength();
00081 if( i < m_contigList.size() )
00082 {
00083 return m_contigList[i]->GetSeqLength();
00084 }
00085 return GNSEQI_ERROR;
00086 }
00087
00088 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ){
00089 uint64 startPos = 0;
00090 uint64 readableBytes = 0;
00091 if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00092 {
00093 bufLen = 0;
00094 return false;
00095 }
00096
00097 if( contigI == ALL_CONTIGS )
00098 {
00099 uint32 curLen = 0;
00100 uint64 bytesRead = 0;
00101 while (curLen < bufLen)
00102 {
00103
00104 if(readableBytes <= 0)
00105 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106 bufLen = curLen;
00107 return true;
00108 }
00109
00110 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00111 Array<gnSeqC> array_buf( readLen );
00112 gnSeqC* tmpBuf = array_buf.data;
00113
00114
00115 m_ifstream.read(tmpBuf, readLen);
00116 uint64 gc = m_ifstream.gcount();
00117 bytesRead += gc;
00118 readableBytes -= gc;
00119 for(uint32 i=0; i < gc; i++){
00120 if( m_pFilter->IsValid(tmpBuf[i]) ){
00121 buf[curLen] = tmpBuf[i];
00122 curLen++;
00123 }
00124 }
00125 if(m_ifstream.eof()){
00126 m_ifstream.clear();
00127 bufLen = curLen;
00128 return true;
00129 }
00130 }
00131 bufLen = curLen;
00132 }
00133 else if( contigI < m_contigList.size() )
00134 {
00135 uint32 curLen = 0;
00136
00137 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00138 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139 while (curLen < bufLen)
00140 {
00141 uint64 readLen = bufLen - curLen;
00142 Array<gnSeqC> array_buf( readLen );
00143 gnSeqC* tmpBuf = array_buf.data;
00144
00145
00146 m_ifstream.read(tmpBuf, readLen);
00147 uint64 gc = m_ifstream.gcount();
00148
00149
00150 for(uint32 i=0; i < gc; i++){
00151 if( m_pFilter->IsValid(tmpBuf[i]) ){
00152 buf[curLen] = tmpBuf[i];
00153 curLen++;
00154 }
00155 }
00156 if(m_ifstream.eof()){
00157 m_ifstream.clear();
00158 bufLen = curLen;
00159 return true;
00160 }
00161 }
00162 bufLen = curLen;
00163 }
00164 return true;
00165
00166 }
00167
00168
00169
00170
00171 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00172 {
00173 if( contigI == ALL_CONTIGS )
00174 {
00175
00176 gnSeqI curIndex = 0;
00177 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00178 for( ; iter != m_contigList.end(); ++iter )
00179 {
00180 uint64 len = (*iter)->GetSeqLength();
00181 if( (curIndex + len) > start )
00182 break;
00183 curIndex += len;
00184 }
00185 if( iter == m_contigList.end() )
00186 return false;
00187
00188 gnSeqI startIndex = start - curIndex;
00189 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190 }
00191 else if( contigI < m_contigList.size() )
00192 {
00193 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00194 }
00195 return false;
00196 }
00197
00198 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00199 {
00200 readableBytes = 0;
00201 uint32 curLen = 0;
00202
00203 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00204 m_ifstream.seekg( startPos, ios::beg );
00205 if( m_ifstream.eof() ){
00206 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00207 return false;
00208 }
00209 while( true )
00210 {
00211
00212
00213 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00214 if(tmpbufsize == 0){
00215 ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect.");
00216 return false;
00217 }
00218 uint64 startOffset = start;
00219 if(contig.HasRepeatSeqGap()){
00220 startOffset += (9 + m_newlineSize) * (start / 60 + 1) + start / 10 + 1;
00221 if( m_newlineSize == 2 )
00222 startOffset--;
00223 startPos+=startOffset;
00224 m_ifstream.seekg(startPos , ios::beg);
00225 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00226 return true;
00227 }
00228
00229
00230 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00231 Array<char> array_buf( tmpbufsize );
00232 char* tmpbuf = array_buf.data;
00233
00234 m_ifstream.read( tmpbuf, tmpbufsize );
00235 if( m_ifstream.eof() ){
00236 ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n");
00237 return false;
00238 }
00239 for( uint32 i=0; i < tmpbufsize; ++i ){
00240 if( m_pFilter->IsValid(tmpbuf[i]) ){
00241 if( curLen >= start ){
00242 startPos += i;
00243 m_ifstream.seekg( startPos, ios::beg );
00244 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00245 return true;
00246 }
00247 ++curLen;
00248 }
00249 }
00250 startPos += tmpbufsize;
00251 }
00252 return true;
00253 }
00254
00255 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){
00256
00257 string::size_type newline_loc = data.find_first_of('\n', 0);
00258 while(newline_loc != string::npos){
00259 if(data[newline_loc-1] == '\r')
00260 newline_loc--;
00261 string::size_type text_loc = newline_loc;
00262 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00263 text_loc++;
00264 if(text_loc+1 == data.length())
00265 break;
00266 }
00267 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00268 newline_loc = data.find_first_of('\n', 0);
00269 }
00270
00271 string output_string = "";
00272 for(uint32 charI = 0; charI < data.length();){
00273
00274 string::size_type base_loc = charI;
00275 string append_string;
00276 while(base_loc - charI <= width){
00277 string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00278 if(space_loc - charI < width)
00279 base_loc = space_loc;
00280 else if(base_loc == charI){
00281
00282 append_string = data.substr(charI, width);
00283 charI+=width;
00284 }else{
00285 append_string = data.substr(charI, base_loc - charI);
00286 charI = base_loc;
00287 }
00288 }
00289 output_string += string(offset, ' ') + append_string;
00290 if(charI + width < data.length())
00291 output_string += "\r\n";
00292 }
00293 data = output_string;
00294 }
00295
00296 void gnGBKSource::WriteHeader(gnMultiSpec* spec, const string& hdr, ofstream& m_ofstream){
00297 gnBaseHeader* gpbh = NULL;
00298 uint32 header_index = 0;
00299 do{
00300 gpbh = spec->GetHeader(hdr, header_index);
00301 if(gpbh != NULL)
00302 m_ofstream << gpbh->GetHeader();
00303 header_index++;
00304 }while(gpbh != NULL);
00305 }
00306
00307 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){
00308 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00309 if(!m_ofstream.is_open())
00310 return false;
00311
00312 string newline = "\r\n";
00313 gnGenomeSpec* spec = seq.GetSpec();
00314
00315
00316 if(spec->GetHeaderListLength() == 1){
00317 gnBaseHeader *gpbh = spec->GetHeader(0);
00318 string name = gpbh->GetHeaderName();
00319
00320 if(string::npos != name.find(".SEQ")){
00321 string header = gpbh->GetHeader();
00322 m_ofstream << header;
00323 }
00324 }
00325
00326 Array<gnSeqC> array_buf( BUFFER_SIZE );
00327 gnSeqC *bases = array_buf.data;
00328
00329 for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){
00330 gnFragmentSpec* subSpec = spec->GetSpec(specI);
00331
00332
00333 m_ofstream << "LOCUS ";
00334
00335 string contigName = subSpec->GetName();
00336 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH)
00337 contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH);
00338 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length();
00339 m_ofstream << contigName << string(filler_size, ' ');
00340
00341 string length_string = uintToString(subSpec->GetLength());
00342 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00343 m_ofstream << string(filler_size, ' ') << length_string << " bp ";
00344
00345 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' ');
00346 uint32 head_look_i = 0;
00347 gnBaseHeader* gpbh = subSpec->GetHeader("LOCUS", head_look_i);
00348 if(gpbh != NULL)
00349 dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH);
00350 m_ofstream << dnatype << string(2, ' ');
00351
00352 string circular = subSpec->IsCircular() ? string("circular ") : string(10, ' ');
00353 m_ofstream << circular;
00354
00355 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' ');
00356 if(gpbh != NULL)
00357 division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH);
00358 m_ofstream << division;
00359
00360 string date = string(SEQ_LOCUS_DATE_LENGTH, ' ');
00361 if(gpbh != NULL)
00362 date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH);
00363 m_ofstream << string(7, ' ') << date << "\r\n";
00364
00365
00366 WriteHeader(subSpec, "DEFINITION", m_ofstream);
00367 WriteHeader(subSpec, "ACCESSION", m_ofstream);
00368 WriteHeader(subSpec, "VERSION", m_ofstream);
00369 WriteHeader(subSpec, "KEYWORDS", m_ofstream);
00370 WriteHeader(subSpec, "SEGMENT", m_ofstream);
00371 WriteHeader(subSpec, "SOURCE", m_ofstream);
00372 WriteHeader(subSpec, "REFERENCE", m_ofstream);
00373 WriteHeader(subSpec, "COMMENT", m_ofstream);
00374
00375
00376 m_ofstream << "FEATURES Location/Qualifiers" << "\r\n";
00377 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){
00378
00379 gnBaseFeature *gpmf = subSpec->GetFeature(featureI);
00380 string featureName = gpmf->GetName();
00381 m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName;
00382 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' ');
00383
00384 uint32 location_count = gpmf->GetLocationListLength();
00385 uint32 line_pos = SEQ_FEATURE_LOC_OFFSET;
00386 uint32 parenthesis_count = 0;
00387 if(location_count > 1){
00388 m_ofstream << "join(";
00389 line_pos += 5;
00390 parenthesis_count++;
00391 }
00392 gnLocation::gnLocationType loc_type = gpmf->GetLocationType();
00393 switch(loc_type){
00394 case gnLocation::LT_Standard:
00395 break;
00396 case gnLocation::LT_Complement:
00397 m_ofstream << "complement(";
00398 line_pos += 11;
00399 parenthesis_count++;
00400 break;
00401 case gnLocation::LT_Order:
00402 m_ofstream << "order(";
00403 line_pos += 6;
00404 parenthesis_count++;
00405 break;
00406 case gnLocation::LT_Group:
00407 m_ofstream << "group(";
00408 parenthesis_count++;
00409 line_pos += 6;
00410 break;
00411 case gnLocation::LT_OneOf:
00412 m_ofstream << "one-of(";
00413 parenthesis_count++;
00414 line_pos += 7;
00415 break;
00416 default:
00417 break;
00418 }
00419
00420 string location;
00421 for(uint32 locationI = 0; locationI < location_count; locationI++){
00422 gnLocation gpl = gpmf->GetLocation(locationI);
00423 if(gpl.IsStartBoundLonger())
00424 location += ">";
00425 if(gpl.IsStartBoundShorter())
00426 location += "<";
00427 location += uintToString(gpl.GetStart());
00428 gnSeqI end_loc = gpl.GetEnd();
00429 if(end_loc != 0){
00430 switch(gpl.GetType()){
00431 case gnLocation::LT_BetweenBases:
00432 location += "^";
00433 break;
00434 case gnLocation::LT_OneOf:
00435 location += ".";
00436 break;
00437 default:
00438 location += "..";
00439 break;
00440 }
00441 if(gpl.IsEndBoundShorter())
00442 location += "<";
00443 if(gpl.IsEndBoundLonger())
00444 location += ">";
00445 location+= uintToString(end_loc);
00446 }
00447 if(locationI +1 < location_count)
00448 location += ",";
00449 else{
00450 for(;parenthesis_count > 0; parenthesis_count--)
00451 location += ")";
00452 }
00453
00454 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){
00455 m_ofstream << location;
00456 line_pos += location.length();
00457 }else{
00458 m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location;
00459 line_pos = SEQ_FEATURE_LOC_OFFSET + location.length();
00460 }
00461 location = "";
00462 }
00463 m_ofstream << "\r\n";
00464
00465
00466
00467 uint32 qualifier_count = gpmf->GetQualifierListLength();
00468 for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00469 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' ');
00470 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI);
00471 m_ofstream << "/" << qualifier->GetName() << "=";
00472
00473 string qually = string(qualifier->GetValue());
00474
00475
00476 m_ofstream << qually << "\r\n";
00477 }
00478 if(gpmf != NULL)
00479 delete gpmf;
00480 }
00481
00482
00483 gnSeqI readOffset = seq.contigStart(specI);
00484 gnSeqI readLength = seq.contigLength(specI);
00485
00486
00487 m_ofstream << "BASE COUNT ";
00488 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00489 gnSeqI countLen = readLength + readOffset;
00490 for(gnSeqI countI = readOffset; countI < countLen;){
00491 gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00492 if(!seq.ToArray(bases, writeLen, countI))
00493 return false;
00494 gnSeqI a, c, g, t, other;
00495 BaseCount(string(bases, writeLen), a, c, g, t, other);
00496 a_count += a;
00497 c_count += c;
00498 g_count += g;
00499 t_count += t;
00500 other_count += other;
00501 countI += writeLen;
00502 }
00503 m_ofstream << uintToString(a_count) << " a ";
00504 m_ofstream << uintToString(c_count) << " c ";
00505 m_ofstream << uintToString(g_count) << " g ";
00506 m_ofstream << uintToString(t_count) << " t ";
00507 m_ofstream << uintToString(other_count) << " others" << "\r\n";
00508
00509 string origin = "ORIGIN\r\n";
00510 head_look_i = 0;
00511 gpbh = subSpec->GetHeader("ORIGIN", head_look_i);
00512 if(gpbh != NULL)
00513 origin = gpbh->GetHeader();
00514 m_ofstream << origin;
00515
00516
00517 gnSeqI contig_bases = 0;
00518 while(readLength > 0){
00519 gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00520 boolean success = seq.ToArray(bases, writeLen, readOffset);
00521 if(!success)
00522 return false;
00523
00524 for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00525 string baseIndexStr = uintToString(contig_bases + curbaseI +1);
00526 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' ');
00527 m_ofstream << baseIndexStr;
00528 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00529 if(writeLen <= curbaseI + base_offset)
00530 break;
00531 int64 print_length = writeLen - (curbaseI + base_offset);
00532 print_length = print_length > 10 ? 10 : print_length;
00533 m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length);
00534 }
00535 m_ofstream << "\r\n";
00536 }
00537 readLength -= writeLen;
00538 readOffset += writeLen;
00539 contig_bases += writeLen;
00540 }
00541 m_ofstream << "//\r\n";
00542 }
00543
00544 m_ofstream.close();
00545 return true;
00546 }
00547
00548 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{
00549 if(m_contigList.size() > contigI)
00550 return m_contigList[contigI];
00551 return NULL;
00552 }
00553
00554
00555 boolean gnGBKSource::ParseStream( istream& fin )
00556 {
00557
00558 uint32 readState = 0;
00559 uint32 lineStart = 0;
00560
00561 uint32 sectionStart = 0;
00562 uint64 streamPos = 0;
00563 uint64 bufReadLen = 0;
00564 uint64 remainingBuffer = 0;
00565 Array<char> array_buf( BUFFER_SIZE );
00566 char* buf = array_buf.data;
00567 gnFragmentSpec* curFrag = 0;
00568 gnSourceSpec* curSpec = 0;
00569 gnSourceHeader *curHeader;
00570 gnFeature* curFeature;
00571 gnFileContig* curContig = 0;
00572 gnLocation::gnLocationType curBaseLocationType;
00573 gnSeqI curLocationStart;
00574 int32 curStartLength = 0;
00575 int32 curEndLength = 0;
00576 string curLocContig = "";
00577 string curQualifierName;
00578 uint64 curQualifierStart;
00579 string curContigName = "";
00580 gnSeqI seqLength = 0;
00581 gnSeqI seqChunk, seqChunkCount, gapChunk;
00582 boolean corruptWarning = false;
00583
00584
00585 DetermineNewlineType();
00586
00587 m_spec = new gnGenomeSpec();
00588 while( !fin.eof() )
00589 {
00590 if(sectionStart > 0){
00591 if(readState == 14)
00592 sectionStart = lineStart;
00593 remainingBuffer = bufReadLen - sectionStart;
00594 memmove(buf, buf+sectionStart, remainingBuffer);
00595 }
00596
00597 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00598 streamPos -= remainingBuffer;
00599 lineStart -= sectionStart;
00600 sectionStart = 0;
00601 bufReadLen = fin.gcount();
00602 bufReadLen += remainingBuffer;
00603
00604 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00605 {
00606 char ch = buf[i];
00607 switch( readState )
00608 {
00609 case 0:
00610
00611 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){
00612 if(curSpec == NULL){
00613 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00614 curFrag = new gnFragmentSpec();
00615 curFrag->AddSpec(curSpec);
00616 curSpec->SetSourceName(m_openString);
00617 m_spec->AddSpec(curFrag);
00618 }
00619 if(lineStart != sectionStart){
00620 uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00621 for(; j > 0; j--)
00622 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' '))
00623 break;
00624 string header_name = string(buf+sectionStart, j+1);
00625 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00626
00627 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00628 m_spec->AddHeader(curHeader);
00629 else
00630 curFrag->AddHeader(curHeader);
00631 sectionStart = lineStart;
00632 }
00633
00634 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00635 sectionStart = i + 1;
00636 readState = 1;
00637 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00638 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00639 curFrag->AddHeader(curHeader);
00640 curContig = new gnFileContig();
00641 curContig->SetName(curContigName);
00642 curContigName = "";
00643 readState = 13;
00644 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00645 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00646 curFrag->SetCircular(true);
00647 uint32 j = SEQ_LOCUS_NAME_LENGTH;
00648 for(; j >= 0; j--)
00649 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00650 break;
00651 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00652 curFrag->SetName(curContigName);
00653 }
00654 }
00655 if(ch == '\n'){
00656 lineStart = i + 1;
00657 }
00658 break;
00659 case 1:
00660 if((ch == ' ')||(ch == ' ')){
00661 break;
00662 }else if(ch == '\n'){
00663 lineStart = i + 1;
00664 sectionStart = i + 1;
00665 break;
00666 }else if(sectionStart == i){
00667 i--;
00668 readState = 0;
00669 sectionStart = i + 1;
00670 break;
00671 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){
00672 sectionStart = i;
00673 readState = 2;
00674 }
00675 case 2:
00676 if((ch == ' ')||(ch == ' ')){
00677 string featureName(buf+sectionStart, i - sectionStart);
00678 curFeature = new gnFeature(featureName);
00679 curFrag->AddFeature(curFeature);
00680 sectionStart = i + 1;
00681 readState = 3;
00682 }
00683 break;
00684 case 3:
00685 if((ch == ' ')||(ch == ' ')){
00686 break;
00687 }else if((ch == '\r')||(ch == '\n')){
00688 lineStart = i+1;
00689 break;
00690 }
00691 sectionStart = i;
00692 readState = 4;
00693
00694
00695
00696 case 4:
00697 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00698 string starter(buf+sectionStart, i - sectionStart);
00699 if(ch == '('){
00700 if(starter == "complement")
00701 curFeature->SetLocationType(gnLocation::LT_Complement);
00702 else if(starter == "order")
00703 curFeature->SetLocationType(gnLocation::LT_Order);
00704 else if(starter == "group")
00705 curFeature->SetLocationType(gnLocation::LT_Group);
00706 else if(starter == "one-of")
00707 curFeature->SetLocationType(gnLocation::LT_OneOf);
00708 sectionStart = i + 1;
00709 break;
00710 }else if(ch == ':'){
00711 curLocContig = starter;
00712 sectionStart = i + 1;
00713 break;
00714 }
00715 curLocationStart = atoi(starter.c_str());
00716 readState = 6;
00717 if(ch == '.'){
00718
00719 readState = 5;
00720 sectionStart = i + 1;
00721 break;
00722 }else if(ch == '^'){
00723 curBaseLocationType = gnLocation::LT_BetweenBases;
00724 }else if((ch == ' ')||(ch == ' ')){
00725
00726 gnLocation curLocation(curLocationStart, curLocationStart);
00727 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00728 readState = 7;
00729 }
00730 sectionStart = i + 1;
00731
00732 }else if(ch == '<'){
00733 curStartLength = -1;
00734 sectionStart = i + 1;
00735 }else if(ch == '>'){
00736 curStartLength = 1;
00737 sectionStart = i + 1;
00738 }
00739 break;
00740 case 5:
00741 if(ch == '.'){
00742 curBaseLocationType = gnLocation::LT_Standard;
00743 readState = 6;
00744 sectionStart = i + 1;
00745 break;
00746 }
00747 curBaseLocationType = gnLocation::LT_OneOf;
00748 case 6:
00749 if(ch == '>'){
00750 curEndLength = 1;
00751 sectionStart = i + 1;
00752 }else if(ch == '<'){
00753 curEndLength = -1;
00754 sectionStart = i + 1;
00755 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){
00756
00757 string ender(buf+sectionStart, i - sectionStart);
00758 gnSeqI curLocationEnd = atoi(ender.c_str());
00759 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00760 curEndLength = 0;
00761 curStartLength = 0;
00762 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00763 readState = ch == ',' ? 3 : 7;
00764 sectionStart = i+1;
00765 }
00766 break;
00767 case 7:
00768 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){
00769 sectionStart = i;
00770 readState = 0;
00771 i--;
00772 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){
00773 sectionStart = i;
00774 readState = 2;
00775 i--;
00776 }else if(ch == ','){
00777 sectionStart = i+1;
00778 readState = 3;
00779 }else if(ch == '/'){
00780 sectionStart = i+1;
00781 readState = 8;
00782 }else if(ch == '\n')
00783 lineStart = i + 1;
00784 break;
00785 case 8:
00786 if(ch == '='){
00787 curQualifierName = string(buf+sectionStart, i - sectionStart);
00788 readState = 9;
00789 sectionStart = i+1;
00790 }else if( ch == '\r' || ch == '\n' ){
00791
00792 curQualifierName = string(buf+sectionStart, i - sectionStart);
00793 curFeature->AddQualifier( new gnStringQualifier( curQualifierName, "" ));
00794 readState = 7;
00795 sectionStart = i+1;
00796 }
00797 break;
00798 case 9:
00799 if(ch == '"'){
00800 readState = 10;
00801 sectionStart = i;
00802 curQualifierStart = i + streamPos;
00803 }else if(ch == '['){
00804 readState = 11;
00805 sectionStart = i;
00806 }else if((ch == '\r')||(ch == '\n')){
00807 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00808 sectionStart = i+1;
00809 readState = 7;
00810 }
00811 break;
00812 case 10:
00813 if(ch == '"')
00814 readState = 11;
00815 if(ch == '\n'){
00816 lineStart = i + 1;
00817 }
00818 break;
00819 case 11:
00820 if(ch != '"'){
00821 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00822 sectionStart = i+1;
00823 readState = 7;
00824 if(ch == '\n')
00825 lineStart = i + 1;
00826 }else
00827 readState = 10;
00828 break;
00829 case 12:
00830 if(ch == ']'){
00831 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00832 sectionStart = i+1;
00833 readState = 7;
00834 }
00835 break;
00836 case 13:
00837 curContig->SetSectStart(gnContigSequence, i - 1 + streamPos);
00838 curContig->SetRepeatSeqGap(true);
00839 seqChunk = 0;
00840 seqChunkCount = 0;
00841 gapChunk = m_newlineSize + 1;
00842 readState = 14;
00843 break;
00844 case 14:
00845 while(i < bufReadLen){
00846 ch = buf[i];
00847 if((ch == '/')&&(i==lineStart)){
00848 readState = 15;
00849 break;
00850 }else if(m_pFilter->IsValid(ch)){
00851 if(gapChunk > 0){
00852 if((gapChunk > 1 && seqChunkCount > 0) ||
00853 (gapChunk != 10 + m_newlineSize && seqChunkCount == 0)){
00854 if( !corruptWarning ){
00855 ErrorMsg("File is corrupt. Proceed with caution.");
00856 corruptWarning = true;
00857 }
00858 curContig->SetRepeatSeqGap(false);
00859 }
00860 gapChunk = 0;
00861 }
00862 seqChunk++;
00863 seqLength++;
00864 }else{
00865 gapChunk++;
00866 if(seqChunk == 10){
00867 seqChunk = 0;
00868 seqChunkCount++;
00869 if(seqChunkCount == 6){
00870
00871 seqChunkCount = 0;
00872 }
00873 }
00874 if(ch == '\n')
00875 lineStart = i + 1;
00876 }
00877 i++;
00878 }
00879 break;
00880 case 15:
00881 if((ch == '\n')&&(buf[lineStart+1] == '/')){
00882 curContig->SetSectEnd(gnContigSequence, lineStart - m_newlineSize + streamPos);
00883 curContig->SetSeqLength(seqLength);
00884 m_contigList.push_back(curContig);
00885 curContig = 0;
00886 curSpec->SetLength(seqLength);
00887 curSpec = 0;
00888 seqLength = 0;
00889 lineStart = i + 1;
00890 sectionStart = i + 1;
00891 readState = 0;
00892 }
00893 break;
00894 }
00895 }
00896 streamPos += bufReadLen;
00897 }
00898 if(curContig != 0){
00899 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00900 curContig->SetSeqLength(seqLength);
00901 m_contigList.push_back(curContig);
00902 curSpec->SetLength(seqLength);
00903 }
00904 if(curSpec != 0)
00905 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00906 &&(curSpec->GetLength() == 0)){
00907 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00908 delete curFrag;
00909 }
00910 m_ifstream.clear();
00911 return true;
00912 }