00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00013
00014 #include "gn/gnBaseSpec.h"
00015 #include "gn/gnFilter.h"
00016 #include "gn/gnStringTools.h"
00017 #include "gn/gnSourceSpec.h"
00018 #include "gn/gnSourceHeader.h"
00019 #include "gn/gnDebug.h"
00020
00021 gnFASSource::gnFASSource()
00022 {
00023 m_openString = "";
00024 m_pFilter = gnFilter::fullDNASeqFilter();
00025 if(m_pFilter == NULL){
00026 DebugMsg("Error using static sequence filters.");
00027 }
00028 }
00029 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00030 {
00031 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00032 for( ; iter != s.m_contigList.end(); ++iter )
00033 {
00034 m_contigList.push_back( (*iter)->Clone() );
00035 }
00036 }
00037 gnFASSource::~gnFASSource()
00038 {
00039 m_ifstream.close();
00040 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00041 for( ; iter != m_contigList.end(); ++iter )
00042 {
00043 gnFileContig* fg = *iter;
00044 *iter = 0;
00045 delete fg;
00046 }
00047 }
00048 boolean gnFASSource::HasContig( const string& nameStr ) const
00049 {
00050 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00051 {
00052 if( nameStr == m_contigList[i]->GetName() )
00053 return true;
00054 }
00055 return false;
00056 }
00057 uint32 gnFASSource::GetContigID( const string& name ) const
00058 {
00059 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00060 {
00061 if( name == m_contigList[i]->GetName() )
00062 return i;
00063 }
00064 return ALL_CONTIGS;
00065 }
00066 string gnFASSource::GetContigName( const uint32 i ) const
00067 {
00068 if( i < m_contigList.size() ){
00069 return m_contigList[i]->GetName();
00070 }
00071 return "";
00072 }
00073 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00074 {
00075 if( i < m_contigList.size() ){
00076 return m_contigList[i]->GetSeqLength();
00077 }else if( i == ALL_CONTIGS){
00078 gnSeqI seqlen = 0;
00079 for(uint32 j=0; j < m_contigList.size(); j++)
00080 seqlen += m_contigList[j]->GetSeqLength();
00081 return seqlen;
00082 }
00083 return GNSEQI_ERROR;
00084 }
00085 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00086 {
00087 if( i < m_contigList.size() ){
00088 return m_contigList[i];
00089 }
00090 return 0;
00091 }
00092
00093 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, uint32& bufLen, const uint32 contigI )
00094 {
00095 m_ifstream.clear();
00096 uint32 contigIndex = contigI;
00097 uint64 startPos = 0;
00098 uint64 readableBytes = 0;
00099 if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00100 bufLen = 0;
00101 return false;
00102 }
00103
00104 if( contigI == ALL_CONTIGS ){
00105 uint32 curLen = 0;
00106 uint64 bytesRead = 0;
00107 while (curLen < bufLen){
00108
00109 if(readableBytes <= 0)
00110 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00111 bufLen = curLen;
00112 return true;
00113 }
00114
00115 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00116 gnSeqC* tmpBuf = new gnSeqC[readLen];
00117
00118
00119 m_ifstream.read(tmpBuf, readLen);
00120 uint64 gc = m_ifstream.gcount();
00121 bytesRead += gc;
00122 readableBytes -= gc;
00123
00124
00125 for(uint32 i=0; i < gc; i++){
00126 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127 buf[curLen] = tmpBuf[i];
00128 curLen++;
00129 }
00130 }
00131 delete[] tmpBuf;
00132 if(m_ifstream.eof()){
00133 m_ifstream.clear();
00134 bufLen = curLen;
00135 return true;
00136 }
00137 }
00138 bufLen = curLen;
00139 }
00140 else if( contigI < m_contigList.size() )
00141 {
00142 uint32 curLen = 0;
00143
00144 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00145 bufLen = bufLen < contigSize ? bufLen : contigSize;
00146 while (curLen < bufLen)
00147 {
00148 uint64 readLen = bufLen - curLen;
00149 gnSeqC* tmpBuf = new gnSeqC[readLen];
00150
00151
00152 m_ifstream.read(tmpBuf, readLen);
00153 uint64 gc = m_ifstream.gcount();
00154
00155
00156 for(uint32 i=0; i < gc; i++){
00157 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158 buf[curLen] = tmpBuf[i];
00159 curLen++;
00160 }
00161 }
00162 if(m_ifstream.eof()){
00163 m_ifstream.clear();
00164 bufLen = curLen;
00165 return true;
00166 }
00167 }
00168 bufLen = curLen;
00169 }
00170 return true;
00171 }
00172
00173
00174
00175
00176
00177
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180 if( contigI == ALL_CONTIGS ){
00181
00182 gnSeqI curIndex = 0;
00183 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184 for( ; iter != m_contigList.end(); ++iter )
00185 {
00186 uint64 len = (*iter)->GetSeqLength();
00187 if( (curIndex + len) > start )
00188 break;
00189 curIndex += len;
00190 }
00191 if( iter == m_contigList.end() )
00192 return false;
00193
00194 gnSeqI startIndex = start - curIndex;
00195 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196 }
00197 else if( contigI < m_contigList.size() )
00198 {
00199 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200 }
00201 return false;
00202 }
00203
00204
00205
00206 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00207 readableBytes = 0;
00208 uint32 curLen = 0;
00209
00210 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00211
00212 if(contig.HasRepeatSeqGap())
00213 if(contig.GetRepeatSeqGapSize().first > 0)
00214 if(contig.GetRepeatSeqGapSize().second > 0){
00215
00216 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00217 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00218 m_ifstream.seekg( startPos, ios::beg );
00219 return true;
00220 }
00221 m_ifstream.seekg( startPos, ios::beg );
00222 if( m_ifstream.eof() ){
00223 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00224 return false;
00225 }
00226 while( true ){
00227
00228
00229 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00230 if(tmpbufsize == 0){
00231 ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00232 return false;
00233 }
00234 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00235 char *tmpbuf = new char[tmpbufsize];
00236 m_ifstream.read( tmpbuf, tmpbufsize );
00237 if( m_ifstream.eof() ){
00238 ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00239 delete[] tmpbuf;
00240 return false;
00241 }
00242 for( uint32 i=0; i < tmpbufsize; ++i ){
00243 if( m_pFilter->IsValid(tmpbuf[i]) ){
00244 if( curLen >= start ){
00245 startPos += i;
00246 m_ifstream.seekg( startPos, ios::beg );
00247 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00248 delete[] tmpbuf;
00249 return true;
00250 }
00251 ++curLen;
00252 }
00253 }
00254 startPos += tmpbufsize;
00255 delete[] tmpbuf;
00256 }
00257 return true;
00258 }
00259
00260
00261 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00262 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00263 if(!m_ofstream.is_open())
00264 return false;
00265 uint32 contigCount = source->GetContigListLength();
00266 for(uint32 contigI = 0; contigI < contigCount; contigI++){
00267 string contigName = source->GetContigName(contigI);
00268 m_ofstream << ">" << contigName << ";\n";
00269 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00270 while(seqLength > 0){
00271 gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;
00272 gnSeqC *bases = new gnSeqC[writeLen+1];
00273 boolean success = source->SeqRead(0, bases, writeLen, contigI);
00274 if(!success)
00275 return false;
00276 bases[writeLen] = '\0';
00277 m_ofstream << bases << "\n";
00278 delete[] bases;
00279 seqLength -= writeLen;
00280 }
00281 }
00282 m_ofstream.close();
00283 return true;
00284 }
00285
00286
00287 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00288 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00289 if(!m_ofstream.is_open())
00290 Throw_gnEx(FileNotOpened());
00291 Write(seq, m_ofstream, write_coords, enforce_unique_names);
00292 m_ofstream.close();
00293 }
00294
00295 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){
00296 vector<string> contigNameList;
00297 gnSeqC *bases = new gnSeqC[BUFFER_SIZE];
00298 gnGenomeSpec* spec = seq.GetSpec();
00299
00300
00301 string newline = "\r\n";
00302 gnSeqI readOffset = 1;
00303
00304 for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00305
00306 string contigName = seq.contigName(fragI);
00307
00308 if(enforce_unique_names){
00309 uint32 name_count = 0;
00310 for(uint32 i=0; i < contigNameList.size(); i++)
00311 if(contigNameList[i] == contigName)
00312 name_count++;
00313 contigNameList.push_back(contigName);
00314 if(name_count > 0)
00315 contigName += "_" + uintToString(name_count);
00316 }
00317
00318 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00319 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00320 string header = "";
00321 if(gpbh != NULL){
00322 header = gpbh->GetHeader();
00323
00324 uint32 newlinepos = header.find_first_of('\n', 0);
00325 if(newlinepos != string::npos){
00326 if(header[newlinepos-1] == '\r')
00327 newlinepos--;
00328 header = header.substr(0, newlinepos);
00329 }
00330 }
00331
00332
00333 gnSeqI readLength = seq.contigLength(fragI);
00334 m_ostream << ">" << contigName;
00335 if(write_coords)
00336 m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00337 m_ostream << "; " << header << newline;
00338
00339 gnSeqI linePos = 0;
00340 while(readLength > 0){
00341 gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00342 gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00343
00344 if(!seq.ToArray(bases, writeLen, readOffset))
00345 return;
00346 for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00347 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00348 m_ostream << string(bases + curbase, writeout_size) << newline;
00349 }
00350 readLength -= writeLen;
00351 readOffset += writeLen;
00352 }
00353 if(linePos != 0)
00354 m_ostream << newline;
00355 }
00356
00357 delete[] bases;
00358 }
00359
00360 gnGenomeSpec *gnFASSource::GetSpec() const{
00361 gnGenomeSpec *spec = new gnGenomeSpec();
00362 for(uint32 i=0; i < m_contigList.size(); i++){
00363
00364 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00365 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00366
00367 spec->AddSpec(fragmentSpec, i);
00368 fragmentSpec->AddSpec(contigSpec);
00369
00370 fragmentSpec->SetName(m_contigList[i]->GetName());
00371 fragmentSpec->SetSourceName(m_openString);
00372 contigSpec->SetName(m_contigList[i]->GetName());
00373 contigSpec->SetSourceName(m_openString);
00374
00375 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00376 if(headsect.first != headsect.second){
00377 gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00378 fragmentSpec->AddHeader(gpsh, 0);
00379 }
00380 }
00381 return spec;
00382 }
00383
00384 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00385 if(m_contigList.size() > contigI)
00386 return m_contigList[contigI];
00387 return NULL;
00388 }
00389
00390 boolean gnFASSource::ParseStream( istream& fin )
00391 {
00392
00393 uint32 readState = 0;
00394 gnFileContig* currentContig = 0;
00395 string nameFStr;
00396 uint64 seqLength = 0, gapLength = 0;
00397
00398 uint64 streamPos = 0;
00399 uint64 bufReadLen = 0;
00400 char* buf = new char[BUFFER_SIZE];
00401 boolean paren_hit = false;
00402 uint32 curNewlineSize = 0;
00403 uint32 repeatSeqSize = 0;
00404
00405
00406
00407 fin.getline( buf, BUFFER_SIZE);
00408 fin.seekg(-2, ios::cur);
00409 fin.read( buf, 2);
00410 fin.seekg(0);
00411 if(buf[1] == '\n'){
00412 if(buf[0] == '\r'){
00413 m_newlineType = gnNewlineWindows;
00414 curNewlineSize = 2;
00415 }else{
00416 curNewlineSize = 1;
00417 if(buf[1] == '\r')
00418 m_newlineType = gnNewlineMac;
00419 else
00420 m_newlineType = gnNewlineUnix;
00421 }
00422 }
00423
00424 while( !fin.eof() )
00425 {
00426
00427 fin.read( buf, BUFFER_SIZE);
00428 streamPos += bufReadLen;
00429 bufReadLen = fin.gcount();
00430 for( uint32 i=0 ; i < bufReadLen ; i++ )
00431 {
00432 char ch = buf[i];
00433 switch( readState )
00434 {
00435 case 0:
00436 if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00437 {
00438 delete[] buf;
00439 return false;
00440 }
00441 readState = 1;
00442 case 1:
00443 if( ch == '>' )
00444 {
00445 seqLength = 0; gapLength = 0;
00446 currentContig = new gnFileContig();
00447 currentContig->SetFileStart( streamPos + i );
00448 currentContig->SetRepeatSeqGap(true);
00449 currentContig->SetRepeatSeqSize( repeatSeqSize );
00450 currentContig->SetRepeatGapSize( curNewlineSize );
00451 readState = 2;
00452 paren_hit = false;
00453 nameFStr = "";
00454 }
00455 else
00456 ++gapLength;
00457 break;
00458 case 2:
00459 if( isNewLine(ch) || ch == ';')
00460 {
00461 currentContig->SetName( nameFStr );
00462 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00463 if( ch == ';' )
00464 readState = 3;
00465 else{
00466 readState = 4;
00467 if(ch == '\r')
00468 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00469 }
00470 }
00471 else if( ch == '(' ){
00472 if(isSpace(buf[i-1])){
00473
00474 nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00475 }
00476 paren_hit = true;
00477 }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00478 nameFStr += ch;
00479 break;
00480 case 3:
00481 if( isNewLine(ch) )
00482 readState = 4;
00483 break;
00484 case 4:
00485 if( ch == '>' )
00486 {
00487 readState = 3;
00488 }
00489 else if( m_pFilter->IsValid(ch) )
00490 {
00491 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00492 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00493 seqLength = 1; gapLength = 0;
00494 readState = 5;
00495 }
00496 break;
00497 case 5:
00498 while(i < bufReadLen){
00499 ch = buf[i];
00500 if( m_pFilter->IsValid(ch) ){
00501 if(gapLength > 0){
00502 if(seqLength != repeatSeqSize)
00503 currentContig->SetRepeatSeqGap(false);
00504 if(gapLength != curNewlineSize)
00505
00506 currentContig->SetRepeatSeqGap(false);
00507 currentContig->AddToSeqLength( seqLength );
00508 seqLength = 0;
00509 gapLength = 0;
00510 }
00511 seqLength++;
00512 }else if( ch == '>'){
00513 currentContig->AddToSeqLength( seqLength );
00514 currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00515 currentContig->SetFileEnd( streamPos + i - 1 );
00516 m_contigList.push_back(currentContig);
00517 readState = 1;
00518 i--;
00519 break;
00520 }
00521 else if( isNewLine(ch) ){
00522 if( repeatSeqSize == 0 ){
00523 repeatSeqSize = seqLength;
00524 currentContig->SetRepeatSeqSize( repeatSeqSize );
00525 }
00526 gapLength++;
00527 }
00528 else{
00529
00530 currentContig->SetRepeatSeqGap(false);
00531 }
00532 i++;
00533 }
00534 break;
00535 default:
00536 ErrorMsg("ERROR");
00537 delete[] buf;
00538 return false;
00539 break;
00540 }
00541 }
00542 }
00543
00544 if( currentContig != 0 )
00545 {
00546 if( readState == 2 )
00547 {
00548 currentContig->SetName( nameFStr );
00549 }
00550 if( (readState >= 2) && (readState < 5) )
00551 {
00552 currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00553 }
00554 else if( readState == 5 )
00555 {
00556 currentContig->AddToSeqLength( seqLength );
00557 currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00558 }
00559 currentContig->SetFileEnd( streamPos + bufReadLen );
00560 m_contigList.push_back(currentContig);
00561 }
00562 m_ifstream.clear();
00563 delete[] buf;
00564 return true;
00565 }