00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00013
00014 #include "gn/gnFeature.h"
00015 #include "gn/gnGBKSource.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnSourceQualifier.h"
00019 #include "gn/gnLocation.h"
00020 #include "gn/gnStringTools.h"
00021 #include "gn/gnDebug.h"
00022 #include <string>
00023
00024 gnGBKSource::gnGBKSource()
00025 {
00026 m_openString = "";
00027 m_pFilter = gnFilter::fullDNASeqFilter();
00028 if(m_pFilter == NULL){
00029 DebugMsg("Error using static sequence filters.");
00030 }
00031 }
00032 gnGBKSource::gnGBKSource( const gnGBKSource& s ) : gnFileSource(s)
00033 {
00034 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00035 for( ; iter != s.m_contigList.end(); ++iter )
00036 {
00037 m_contigList.push_back( (*iter)->Clone() );
00038 }
00039 }
00040 gnGBKSource::~gnGBKSource()
00041 {
00042 m_ifstream.close();
00043 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00044 for( ; iter != m_contigList.end(); ++iter )
00045 {
00046 gnFileContig* fg = *iter;
00047 *iter = 0;
00048 delete fg;
00049 }
00050 }
00051 boolean gnGBKSource::HasContig( const string& name ) const
00052 {
00053 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00054 {
00055 if( name == m_contigList[i]->GetName() )
00056 return true;
00057 }
00058 return false;
00059 }
00060 uint32 gnGBKSource::GetContigID( const string& name ) const
00061 {
00062 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00063 {
00064 if( name == m_contigList[i]->GetName() )
00065 return i;
00066 }
00067 return ALL_CONTIGS;
00068 }
00069 string gnGBKSource::GetContigName( const uint32 i ) const
00070 {
00071 if( i < m_contigList.size() )
00072 {
00073 return m_contigList[i]->GetName();
00074 }
00075 return "";
00076 }
00077 gnSeqI gnGBKSource::GetContigSeqLength( const uint32 i ) const
00078 {
00079 if( i == ALL_CONTIGS)
00080 return m_spec->GetLength();
00081 if( i < m_contigList.size() )
00082 {
00083 return m_contigList[i]->GetSeqLength();
00084 }
00085 return GNSEQI_ERROR;
00086 }
00087
00088 boolean gnGBKSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI ){
00089 uint64 startPos = 0;
00090 uint64 readableBytes = 0;
00091 if( !SeqSeek( start, contigI, startPos, readableBytes ) )
00092 {
00093 bufLen = 0;
00094 return false;
00095 }
00096
00097 if( contigI == ALL_CONTIGS )
00098 {
00099 uint32 curLen = 0;
00100 uint64 bytesRead = 0;
00101 while (curLen < bufLen)
00102 {
00103
00104 if(readableBytes <= 0)
00105 if( !SeqSeek( start + curLen, contigI, startPos, readableBytes ) ){
00106 bufLen = curLen;
00107 return true;
00108 }
00109
00110 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00111 gnSeqC* tmpBuf = new gnSeqC[readLen];
00112
00113
00114 m_ifstream.read(tmpBuf, readLen);
00115 uint64 gc = m_ifstream.gcount();
00116 bytesRead += gc;
00117 readableBytes -= gc;
00118 for(uint32 i=0; i < gc; i++){
00119 if( m_pFilter->IsValid(tmpBuf[i]) ){
00120 buf[curLen] = tmpBuf[i];
00121 curLen++;
00122 }
00123 }
00124 delete[] tmpBuf;
00125 if(m_ifstream.eof()){
00126 m_ifstream.clear();
00127 bufLen = curLen;
00128 return true;
00129 }
00130 }
00131 bufLen = curLen;
00132 }
00133 else if( contigI < m_contigList.size() )
00134 {
00135 uint32 curLen = 0;
00136
00137 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00138 bufLen = bufLen < contigSize ? bufLen : contigSize;
00139 while (curLen < bufLen)
00140 {
00141 uint64 readLen = bufLen - curLen;
00142 gnSeqC* tmpBuf = new gnSeqC[readLen];
00143
00144
00145 m_ifstream.read(tmpBuf, readLen);
00146 uint64 gc = m_ifstream.gcount();
00147
00148
00149 for(uint32 i=0; i < gc; i++){
00150 if( m_pFilter->IsValid(tmpBuf[i]) ){
00151 buf[curLen] = tmpBuf[i];
00152 curLen++;
00153 }
00154 }
00155 if(m_ifstream.eof()){
00156 m_ifstream.clear();
00157 bufLen = curLen;
00158 return true;
00159 }
00160 delete[] tmpBuf;
00161 }
00162 bufLen = curLen;
00163 }
00164 return true;
00165
00166 }
00167
00168
00169
00170
00171 boolean gnGBKSource::SeqSeek( const gnSeqI start, const uint32& contigI, uint64& startPos, uint64& readableBytes )
00172 {
00173 if( contigI == ALL_CONTIGS )
00174 {
00175
00176 gnSeqI curIndex = 0;
00177 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00178 for( ; iter != m_contigList.end(); ++iter )
00179 {
00180 uint64 len = (*iter)->GetSeqLength();
00181 if( (curIndex + len) > start )
00182 break;
00183 curIndex += len;
00184 }
00185 if( iter == m_contigList.end() )
00186 return false;
00187
00188 gnSeqI startIndex = start - curIndex;
00189 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00190 }
00191 else if( contigI < m_contigList.size() )
00192 {
00193 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00194 }
00195 return false;
00196 }
00197
00198 boolean gnGBKSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes )
00199 {
00200 readableBytes = 0;
00201 uint32 curLen = 0;
00202
00203 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00204 m_ifstream.seekg( startPos, ios::beg );
00205 if( m_ifstream.eof() ){
00206 ErrorMsg("ERROR in gnGBKSource::Incorrect contig start position, End of file reached!\n");
00207 return false;
00208 }
00209 while( true )
00210 {
00211
00212
00213 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00214 if(tmpbufsize == 0){
00215 ErrorMsg("ERROR in gnGBKSource: stored contig size is incorrect.");
00216 return false;
00217 }
00218 uint64 startOffset = start;
00219 if(contig.HasRepeatSeqGap()){
00220 startOffset += 10 * (start / 60) + start / 10 + 11;
00221 startPos+=startOffset;
00222 m_ifstream.seekg(startPos , ios::beg);
00223 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00224 return true;
00225 }
00226
00227
00228 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00229 char *tmpbuf = new char[tmpbufsize];
00230 m_ifstream.read( tmpbuf, tmpbufsize );
00231 if( m_ifstream.eof() ){
00232 ErrorMsg("ERROR in gnGBKSource::Read End of file reached!\n");
00233 delete[] tmpbuf;
00234 return false;
00235 }
00236 for( uint32 i=0; i < tmpbufsize; ++i ){
00237 if( m_pFilter->IsValid(tmpbuf[i]) ){
00238 if( curLen >= start ){
00239 startPos += i;
00240 m_ifstream.seekg( startPos, ios::beg );
00241 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00242 delete[] tmpbuf;
00243 return true;
00244 }
00245 ++curLen;
00246 }
00247 }
00248 startPos += tmpbufsize;
00249 delete[] tmpbuf;
00250 }
00251 return true;
00252 }
00253
00254 void gnGBKSource::FormatString(string& data, uint32 offset, uint32 width){
00255
00256 string::size_type newline_loc = data.find_first_of('\n', 0);
00257 while(newline_loc != string::npos){
00258 if(data[newline_loc-1] == '\r')
00259 newline_loc--;
00260 string::size_type text_loc = newline_loc;
00261 while((data[text_loc] == ' ') ||(data[text_loc] == ' ')||(data[text_loc] == '\n')||(data[text_loc] == '\r')){
00262 text_loc++;
00263 if(text_loc+1 == data.length())
00264 break;
00265 }
00266 data = (data.substr(0, newline_loc) + " " + data.substr(text_loc));
00267 newline_loc = data.find_first_of('\n', 0);
00268 }
00269
00270 string output_string = "";
00271 for(uint32 charI = 0; charI < data.length();){
00272
00273 string::size_type base_loc = charI;
00274 string append_string;
00275 while(base_loc - charI <= width){
00276 string::size_type space_loc = data.find_first_of(' ', base_loc+1);
00277 if(space_loc - charI < width)
00278 base_loc = space_loc;
00279 else if(base_loc == charI){
00280
00281 append_string = data.substr(charI, width);
00282 charI+=width;
00283 }else{
00284 append_string = data.substr(charI, base_loc - charI);
00285 charI = base_loc;
00286 }
00287 }
00288 output_string += string(offset, ' ') + append_string;
00289 if(charI + width < data.length())
00290 output_string += "\r\n";
00291 }
00292 data = output_string;
00293 }
00294
00295 void gnGBKSource::WriteHeader(gnMultiSpec* spec, const string& hdr, ofstream& m_ofstream){
00296 gnBaseHeader* gpbh = NULL;
00297 uint32 header_index = 0;
00298 do{
00299 gpbh = spec->GetHeader(hdr, header_index);
00300 if(gpbh != NULL)
00301 m_ofstream << gpbh->GetHeader();
00302 header_index++;
00303 }while(gpbh != NULL);
00304 }
00305
00306 boolean gnGBKSource::Write(gnSequence& seq, const string& filename){
00307 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00308 if(!m_ofstream.is_open())
00309 return false;
00310
00311 string newline = "\r\n";
00312 gnGenomeSpec* spec = seq.GetSpec();
00313
00314
00315 if(spec->GetHeaderListLength() == 1){
00316 gnBaseHeader *gpbh = spec->GetHeader(0);
00317 string name = gpbh->GetHeaderName();
00318
00319 if(string::npos != name.find(".SEQ")){
00320 string header = gpbh->GetHeader();
00321 m_ofstream << header;
00322 }
00323 }
00324
00325 gnSeqC *bases = new gnSeqC[BUFFER_SIZE];
00326
00327 for(uint32 specI = 0; specI < spec->GetSpecListLength(); specI++){
00328 gnFragmentSpec* subSpec = spec->GetSpec(specI);
00329
00330
00331 m_ofstream << "LOCUS ";
00332
00333 string contigName = subSpec->GetName();
00334 if(contigName.length() > SEQ_LOCUS_NAME_LENGTH)
00335 contigName = contigName.substr(0, SEQ_LOCUS_NAME_LENGTH);
00336 uint32 filler_size = SEQ_LOCUS_NAME_LENGTH - contigName.length();
00337 m_ofstream << contigName << string(filler_size, ' ');
00338
00339 string length_string = uintToString(subSpec->GetLength());
00340 filler_size = SEQ_LOCUS_SIZE_LENGTH - length_string.size();
00341 m_ofstream << string(filler_size, ' ') << length_string << " bp ";
00342
00343 string dnatype = string(SEQ_LOCUS_DNATYPE_LENGTH, ' ');
00344 uint32 head_look_i = 0;
00345 gnBaseHeader* gpbh = subSpec->GetHeader("LOCUS", head_look_i);
00346 if(gpbh != NULL)
00347 dnatype = gpbh->GetHeader().substr(SEQ_LOCUS_DNATYPE_OFFSET, SEQ_LOCUS_DNATYPE_LENGTH);
00348 m_ofstream << dnatype << string(2, ' ');
00349
00350 string circular = subSpec->IsCircular() ? string("circular ") : string(10, ' ');
00351 m_ofstream << circular;
00352
00353 string division = string(SEQ_LOCUS_DIVCODE_LENGTH, ' ');
00354 if(gpbh != NULL)
00355 division = gpbh->GetHeader().substr(SEQ_LOCUS_DIVCODE_OFFSET, SEQ_LOCUS_DIVCODE_LENGTH);
00356 m_ofstream << division;
00357
00358 string date = string(SEQ_LOCUS_DATE_LENGTH, ' ');
00359 if(gpbh != NULL)
00360 date = gpbh->GetHeader().substr(SEQ_LOCUS_DATE_OFFSET, SEQ_LOCUS_DATE_LENGTH);
00361 m_ofstream << string(7, ' ') << date << "\r\n";
00362
00363
00364 WriteHeader(subSpec, "DEFINITION", m_ofstream);
00365 WriteHeader(subSpec, "ACCESSION", m_ofstream);
00366 WriteHeader(subSpec, "VERSION", m_ofstream);
00367 WriteHeader(subSpec, "KEYWORDS", m_ofstream);
00368 WriteHeader(subSpec, "SEGMENT", m_ofstream);
00369 WriteHeader(subSpec, "SOURCE", m_ofstream);
00370 WriteHeader(subSpec, "REFERENCE", m_ofstream);
00371 WriteHeader(subSpec, "COMMENT", m_ofstream);
00372
00373
00374 m_ofstream << "FEATURES Location/Qualifiers" << "\r\n";
00375 for(uint32 featureI = 0; featureI < subSpec->GetFeatureListLength(); featureI++){
00376
00377 gnBaseFeature *gpmf = subSpec->GetFeature(featureI);
00378 string featureName = gpmf->GetName();
00379 m_ofstream << string(SEQ_SUBTAG_COLUMN, ' ') << featureName;
00380 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET - featureName.length() - SEQ_SUBTAG_COLUMN, ' ');
00381
00382 uint32 location_count = gpmf->GetLocationListLength();
00383 uint32 line_pos = SEQ_FEATURE_LOC_OFFSET;
00384 uint32 parenthesis_count = 0;
00385 if(location_count > 1){
00386 m_ofstream << "join(";
00387 line_pos += 5;
00388 parenthesis_count++;
00389 }
00390 gnLocation::gnLocationType loc_type = gpmf->GetLocationType();
00391 switch(loc_type){
00392 case gnLocation::LT_Standard:
00393 break;
00394 case gnLocation::LT_Complement:
00395 m_ofstream << "complement(";
00396 line_pos += 11;
00397 parenthesis_count++;
00398 break;
00399 case gnLocation::LT_Order:
00400 m_ofstream << "order(";
00401 line_pos += 6;
00402 parenthesis_count++;
00403 break;
00404 case gnLocation::LT_Group:
00405 m_ofstream << "group(";
00406 parenthesis_count++;
00407 line_pos += 6;
00408 break;
00409 case gnLocation::LT_OneOf:
00410 m_ofstream << "one-of(";
00411 parenthesis_count++;
00412 line_pos += 7;
00413 break;
00414 default:
00415 break;
00416 }
00417
00418 string location;
00419 for(uint32 locationI = 0; locationI < location_count; locationI++){
00420 gnLocation gpl = gpmf->GetLocation(locationI);
00421 if(gpl.IsStartBoundLonger())
00422 location += ">";
00423 if(gpl.IsStartBoundShorter())
00424 location += "<";
00425 location += uintToString(gpl.GetStart());
00426 gnSeqI end_loc = gpl.GetEnd();
00427 if(end_loc != 0){
00428 switch(gpl.GetType()){
00429 case gnLocation::LT_BetweenBases:
00430 location += "^";
00431 break;
00432 case gnLocation::LT_OneOf:
00433 location += ".";
00434 break;
00435 default:
00436 location += "..";
00437 break;
00438 }
00439 if(gpl.IsEndBoundShorter())
00440 location += "<";
00441 if(gpl.IsEndBoundLonger())
00442 location += ">";
00443 location+= uintToString(end_loc);
00444 }
00445 if(locationI +1 < location_count)
00446 location += ",";
00447 else{
00448 for(;parenthesis_count > 0; parenthesis_count--)
00449 location += ")";
00450 }
00451
00452 if(line_pos + location.length() < SEQ_COLUMN_WIDTH){
00453 m_ofstream << location;
00454 line_pos += location.length();
00455 }else{
00456 m_ofstream << "\r\n" << string(SEQ_FEATURE_LOC_OFFSET, ' ') << location;
00457 line_pos = SEQ_FEATURE_LOC_OFFSET + location.length();
00458 }
00459 location = "";
00460 }
00461 m_ofstream << "\r\n";
00462
00463
00464
00465 uint32 qualifier_count = gpmf->GetQualifierListLength();
00466 for(uint32 qualifierI = 0; qualifierI < qualifier_count; qualifierI++){
00467 m_ofstream << string(SEQ_FEATURE_LOC_OFFSET, ' ');
00468 gnBaseQualifier* qualifier = gpmf->GetQualifier(qualifierI);
00469 m_ofstream << "/" << qualifier->GetName() << "=";
00470
00471 string qually = string(qualifier->GetValue());
00472
00473
00474 m_ofstream << qually << "\r\n";
00475 }
00476 if(gpmf != NULL)
00477 delete gpmf;
00478 }
00479
00480
00481 gnSeqI readOffset = seq.contigStart(specI);
00482 gnSeqI readLength = seq.contigLength(specI);
00483
00484
00485 m_ofstream << "BASE COUNT ";
00486 gnSeqI a_count=0, c_count=0, g_count=0, t_count=0, other_count=0;
00487 gnSeqI countLen = readLength + readOffset;
00488 for(gnSeqI countI = readOffset; countI < countLen;){
00489 gnSeqI writeLen = countLen - countI < BUFFER_SIZE ? countLen - countI : BUFFER_SIZE;
00490 if(!seq.ToArray(bases, writeLen, countI))
00491 return false;
00492 gnSeqI a, c, g, t, other;
00493 BaseCount(string(bases, writeLen), a, c, g, t, other);
00494 a_count += a;
00495 c_count += c;
00496 g_count += g;
00497 t_count += t;
00498 other_count += other;
00499 countI += writeLen;
00500 }
00501 m_ofstream << uintToString(a_count) << " a ";
00502 m_ofstream << uintToString(c_count) << " c ";
00503 m_ofstream << uintToString(g_count) << " g ";
00504 m_ofstream << uintToString(t_count) << " t ";
00505 m_ofstream << uintToString(other_count) << " others" << "\r\n";
00506
00507 string origin = "ORIGIN\r\n";
00508 head_look_i = 0;
00509 gpbh = subSpec->GetHeader("ORIGIN", head_look_i);
00510 if(gpbh != NULL)
00511 origin = gpbh->GetHeader();
00512 m_ofstream << origin;
00513
00514
00515 gnSeqI contig_bases = 0;
00516 while(readLength > 0){
00517 gnSeqI writeLen = readLength < BUFFER_SIZE + 20 ? readLength : BUFFER_SIZE + 20;
00518 boolean success = seq.ToArray(bases, writeLen, readOffset);
00519 if(!success)
00520 return false;
00521
00522 for(gnSeqI curbaseI = 0; curbaseI < writeLen; curbaseI += 60){
00523 string baseIndexStr = uintToString(contig_bases + curbaseI +1);
00524 m_ofstream << string(SEQ_BASES_INDEX_END - baseIndexStr.length(), ' ');
00525 m_ofstream << baseIndexStr;
00526 for(gnSeqI base_offset = 0; base_offset <= 50; base_offset+=10){
00527 if(writeLen <= curbaseI + base_offset)
00528 break;
00529 int64 print_length = writeLen - (curbaseI + base_offset);
00530 print_length = print_length > 10 ? 10 : print_length;
00531 m_ofstream << ' ' << string(bases + curbaseI + base_offset, print_length);
00532 }
00533 m_ofstream << "\r\n";
00534 }
00535 readLength -= writeLen;
00536 readOffset += writeLen;
00537 contig_bases += writeLen;
00538 }
00539 m_ofstream << "//\r\n";
00540 }
00541 delete[] bases;
00542
00543 m_ofstream.close();
00544 return true;
00545 }
00546
00547 gnFileContig* gnGBKSource::GetFileContig( const uint32 contigI ) const{
00548 if(m_contigList.size() > contigI)
00549 return m_contigList[contigI];
00550 return NULL;
00551 }
00552
00553
00554 boolean gnGBKSource::ParseStream( istream& fin )
00555 {
00556
00557 uint32 readState = 0;
00558 uint32 lineStart = 0;
00559
00560 uint32 sectionStart = 0;
00561 uint64 streamPos = 0;
00562 uint64 bufReadLen = 0;
00563 uint64 remainingBuffer = 0;
00564 char* buf = new char[BUFFER_SIZE];
00565 gnFragmentSpec* curFrag = 0;
00566 gnSourceSpec* curSpec = 0;
00567 gnSourceHeader *curHeader;
00568 gnFeature* curFeature;
00569 gnFileContig* curContig = 0;
00570 gnLocation::gnLocationType curBaseLocationType;
00571 gnSeqI curLocationStart;
00572 int32 curStartLength = 0;
00573 int32 curEndLength = 0;
00574 string curLocContig = "";
00575 string curQualifierName;
00576 uint64 curQualifierStart;
00577 string curContigName = "";
00578 gnSeqI seqLength = 0;
00579 gnSeqI seqChunk, seqChunkCount, gapChunk;
00580 uint32 curNewlineSize = 0;
00581
00582 m_spec = new gnGenomeSpec();
00583 while( !fin.eof() )
00584 {
00585 if(sectionStart > 0){
00586 if(readState == 14)
00587 sectionStart = lineStart;
00588 remainingBuffer = bufReadLen - sectionStart;
00589 memmove(buf, buf+sectionStart, remainingBuffer);
00590 }
00591
00592 fin.read( buf + remainingBuffer, BUFFER_SIZE - remainingBuffer);
00593 streamPos -= remainingBuffer;
00594 lineStart -= sectionStart;
00595 sectionStart = 0;
00596 bufReadLen = fin.gcount();
00597 bufReadLen += remainingBuffer;
00598
00599 for( uint32 i=remainingBuffer ; i < bufReadLen ; i++ )
00600 {
00601 char ch = buf[i];
00602 switch( readState )
00603 {
00604 case 0:
00605
00606 if((ch == '\n')&&(buf[lineStart] != ' ')&&(buf[lineStart] != ' ')){
00607 if(curSpec == NULL){
00608 curSpec = new gnSourceSpec(this, m_spec->GetSpecListLength());
00609 curFrag = new gnFragmentSpec();
00610 curFrag->AddSpec(curSpec);
00611 curSpec->SetSourceName(m_openString);
00612 m_spec->AddSpec(curFrag);
00613 }
00614 if(lineStart != sectionStart){
00615 uint32 j = SEQ_HEADER_NAME_LENGTH-1;
00616 for(; j > 0; j--)
00617 if((buf[sectionStart+j] != ' ')&&(buf[sectionStart+j] != ' '))
00618 break;
00619 string header_name = string(buf+sectionStart, j+1);
00620 curHeader = new gnSourceHeader(this, header_name, sectionStart + streamPos, lineStart - sectionStart);
00621
00622 if(strncmp(&buf[lineStart], "LOCUS", 5) == 0)
00623 m_spec->AddHeader(curHeader);
00624 else
00625 curFrag->AddHeader(curHeader);
00626 sectionStart = lineStart;
00627 }
00628
00629 if(strncmp(&buf[lineStart], "FEATURES", 8) == 0){
00630 sectionStart = i + 1;
00631 readState = 1;
00632 }else if(strncmp(&buf[lineStart], "ORIGIN", 6) == 0){
00633 curHeader = new gnSourceHeader(this, string("ORIGIN"), sectionStart + streamPos, i - sectionStart + 1);
00634 curFrag->AddHeader(curHeader);
00635 curContig = new gnFileContig();
00636 curContig->SetName(curContigName);
00637 curContigName = "";
00638 readState = 13;
00639 }else if(strncmp(&buf[lineStart], "LOCUS", 5) == 0){
00640 if(strncmp(&buf[lineStart+SEQ_LOCUS_CIRCULAR_COLUMN-1], "circular", 8) == 0)
00641 curFrag->SetCircular(true);
00642 uint32 j = SEQ_LOCUS_NAME_LENGTH;
00643 for(; j >= 0; j--)
00644 if((buf[lineStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' ')&&(buf[sectionStart+SEQ_LOCUS_NAME_COLUMN+j-1] != ' '))
00645 break;
00646 curContigName = string(buf+lineStart+SEQ_LOCUS_NAME_COLUMN-1, j+1);
00647 curFrag->SetName(curContigName);
00648 }
00649 }
00650 if(ch == '\n'){
00651 if(curNewlineSize == 0){
00652
00653 if(buf[i-1] == '\r'){
00654
00655 m_newlineType = gnNewlineWindows;
00656 curNewlineSize = 2;
00657 }else{
00658 if(buf[i] == '\r')
00659 m_newlineType = gnNewlineMac;
00660 else
00661 m_newlineType = gnNewlineUnix;
00662
00663 curNewlineSize = 1;
00664 }
00665 }
00666 lineStart = i + 1;
00667 }
00668 break;
00669 case 1:
00670 if((ch == ' ')||(ch == ' ')){
00671 break;
00672 }else if(ch == '\n'){
00673 lineStart = i + 1;
00674 sectionStart = i + 1;
00675 break;
00676 }else if(sectionStart == i){
00677 i--;
00678 readState = 0;
00679 sectionStart = i + 1;
00680 break;
00681 }else if((i - lineStart == SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1))){
00682 sectionStart = i;
00683 readState = 2;
00684 }
00685 case 2:
00686 if((ch == ' ')||(ch == ' ')){
00687 string featureName(buf+sectionStart, i - sectionStart);
00688 curFeature = new gnFeature(featureName);
00689 curFrag->AddFeature(curFeature);
00690 sectionStart = i + 1;
00691 readState = 3;
00692 }
00693 break;
00694 case 3:
00695 if((ch == ' ')||(ch == ' ')){
00696 break;
00697 }else if((ch == '\r')||(ch == '\n')){
00698 lineStart = i+1;
00699 break;
00700 }
00701 sectionStart = i;
00702 readState = 4;
00703 case 4:
00704 if((ch == ' ')||(ch == ' ')||(ch == '(')||(ch == '.')||(ch=='^')||(ch==':')){
00705 string starter(buf+sectionStart, i - sectionStart);
00706 if(ch == '('){
00707 if(starter == "complement")
00708 curFeature->SetLocationType(gnLocation::LT_Complement);
00709 else if(starter == "order")
00710 curFeature->SetLocationType(gnLocation::LT_Order);
00711 else if(starter == "group")
00712 curFeature->SetLocationType(gnLocation::LT_Group);
00713 else if(starter == "one-of")
00714 curFeature->SetLocationType(gnLocation::LT_OneOf);
00715 sectionStart = i + 1;
00716 break;
00717 }else if(ch == ':'){
00718 curLocContig = starter;
00719 sectionStart = i + 1;
00720 break;
00721 }
00722 curLocationStart = atoi(starter.c_str());
00723 readState = 6;
00724 if(ch == '.'){
00725
00726 readState = 5;
00727 sectionStart = i + 1;
00728 break;
00729 }else if(ch == '^'){
00730 curBaseLocationType = gnLocation::LT_BetweenBases;
00731 }else if((ch == ' ')||(ch == ' ')){
00732
00733 gnLocation curLocation(curLocationStart, curLocationStart);
00734 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00735 readState = 7;
00736 }
00737 sectionStart = i + 1;
00738
00739 }else if(ch == '<'){
00740 curStartLength = -1;
00741 sectionStart = i + 1;
00742 }else if(ch == '>'){
00743 curStartLength = 1;
00744 sectionStart = i + 1;
00745 }
00746 break;
00747 case 5:
00748 if(ch == '.'){
00749 curBaseLocationType = gnLocation::LT_Standard;
00750 readState = 6;
00751 sectionStart = i + 1;
00752 break;
00753 }
00754 curBaseLocationType = gnLocation::LT_OneOf;
00755 case 6:
00756 if(ch == '>'){
00757 curEndLength = 1;
00758 sectionStart = i + 1;
00759 }else if(ch == '<'){
00760 curEndLength = -1;
00761 sectionStart = i + 1;
00762 }else if((ch == ' ')||(ch == ' ')||(ch == ',')){
00763
00764 string ender(buf+sectionStart, i - sectionStart);
00765 gnSeqI curLocationEnd = atoi(ender.c_str());
00766 gnLocation curLocation(curLocationStart, curStartLength, curLocationEnd, curEndLength, curBaseLocationType);
00767 curEndLength = 0;
00768 curStartLength = 0;
00769 curFeature->AddLocation(curLocation, curFeature->GetLocationListLength());
00770 readState = ch == ',' ? 3 : 7;
00771 sectionStart = i+1;
00772 }
00773 break;
00774 case 7:
00775 if((ch != ' ')&&(ch != ' ')&&(lineStart == i)){
00776 sectionStart = i;
00777 readState = 0;
00778 i--;
00779 }else if((ch != ' ')&&(ch != ' ')&&((lineStart == i - SEQ_SUBTAG_COLUMN)||((buf[lineStart]==' ')&&(i==lineStart+1)))){
00780 sectionStart = i;
00781 readState = 2;
00782 i--;
00783 }else if(ch == ','){
00784 sectionStart = i+1;
00785 readState = 3;
00786 }else if(ch == '/'){
00787 sectionStart = i+1;
00788 readState = 8;
00789 }else if(ch == '\n')
00790 lineStart = i + 1;
00791 break;
00792 case 8:
00793 if(ch == '='){
00794 curQualifierName = string(buf+sectionStart, i - sectionStart);
00795 readState = 9;
00796 sectionStart = i+1;
00797 }
00798 break;
00799 case 9:
00800 if(ch == '"'){
00801 readState = 10;
00802 sectionStart = i;
00803 curQualifierStart = i + streamPos;
00804 }else if(ch == '['){
00805 readState = 11;
00806 sectionStart = i;
00807 }else if((ch == '\r')||(ch == '\n')){
00808 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00809 sectionStart = i+1;
00810 readState = 7;
00811 }
00812 break;
00813 case 10:
00814 if(ch == '"')
00815 readState = 11;
00816 if(ch == '\n'){
00817 lineStart = i + 1;
00818 }
00819 break;
00820 case 11:
00821 if(ch != '"'){
00822 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, curQualifierStart, i - sectionStart));
00823 sectionStart = i+1;
00824 readState = 7;
00825 if(ch == '\n')
00826 lineStart = i + 1;
00827 }else
00828 readState = 10;
00829 break;
00830 case 12:
00831 if(ch == ']'){
00832 curFeature->AddQualifier(new gnSourceQualifier(this, curQualifierName, sectionStart + streamPos, i - sectionStart));
00833 sectionStart = i+1;
00834 readState = 7;
00835 }
00836 break;
00837 case 13:
00838 curContig->SetSectStart(gnContigSequence, i - 1 + streamPos);
00839 curContig->SetRepeatSeqGap(true);
00840 seqChunk = 0;
00841 seqChunkCount = 0;
00842 gapChunk = curNewlineSize + 1;
00843 readState = 14;
00844 break;
00845 case 14:
00846 while(i < bufReadLen){
00847 ch = buf[i];
00848 if((ch == '/')&&(i==lineStart)){
00849 readState = 15;
00850 break;
00851 }else if(m_pFilter->IsValid(ch)){
00852 if(gapChunk > 0){
00853 if((gapChunk > 1 && seqChunkCount > 0) ||
00854 (gapChunk != 10 + curNewlineSize && seqChunkCount == 0)){
00855
00856 ErrorMsg("File is corrupt. Proceed with caution.");
00857 curContig->SetRepeatSeqGap(false);
00858 }
00859 gapChunk = 0;
00860 }
00861 seqChunk++;
00862 seqLength++;
00863 }else{
00864 gapChunk++;
00865 if(seqChunk == 10){
00866 seqChunk = 0;
00867 seqChunkCount++;
00868 if(seqChunkCount == 6){
00869
00870 seqChunkCount = 0;
00871 }
00872 }
00873 if(ch == '\n')
00874 lineStart = i + 1;
00875 }
00876 i++;
00877 }
00878 break;
00879 case 15:
00880 if((ch == '\n')&&(buf[lineStart+1] == '/')){
00881 curContig->SetSectEnd(gnContigSequence, lineStart - 2 + streamPos);
00882 curContig->SetSeqLength(seqLength);
00883 m_contigList.push_back(curContig);
00884 curContig = 0;
00885 curSpec->SetLength(seqLength);
00886 curSpec = 0;
00887 seqLength = 0;
00888 lineStart = i + 1;
00889 sectionStart = i + 1;
00890 readState = 0;
00891 }
00892 break;
00893 }
00894 }
00895 streamPos += bufReadLen;
00896 }
00897 if(curContig != 0){
00898 curContig->SetSectEnd(gnContigSequence, streamPos - 1);
00899 curContig->SetSeqLength(seqLength);
00900 m_contigList.push_back(curContig);
00901 curSpec->SetLength(seqLength);
00902 }
00903 if(curSpec != 0)
00904 if((curFrag->GetFeatureListLength() == 0) && (curFrag->GetHeaderListLength() == 0)
00905 &&(curSpec->GetLength() == 0)){
00906 m_spec->RemoveSpec(m_spec->GetSpecListLength() - 1);
00907 delete curFrag;
00908 }
00909 m_ifstream.clear();
00910 delete[] buf;
00911 return true;
00912 }