00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "gn/gnFASSource.h"
00013 #include "gn/gnBaseSpec.h"
00014 #include "gn/gnFilter.h"
00015 #include "gn/gnStringTools.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnDebug.h"
00019
00020 gnFASSource::gnFASSource()
00021 {
00022 m_openString = "";
00023 m_pFilter = gnFilter::fullDNASeqFilter();
00024 if(m_pFilter == NULL){
00025 DebugMsg("Error using static sequence filters.");
00026 }
00027 }
00028 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00029 {
00030 vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00031 for( ; iter != s.m_contigList.end(); ++iter )
00032 {
00033 m_contigList.push_back( (*iter)->Clone() );
00034 }
00035 }
00036 gnFASSource::~gnFASSource()
00037 {
00038 m_ifstream.close();
00039 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00040 for( ; iter != m_contigList.end(); ++iter )
00041 {
00042 gnFileContig* fg = *iter;
00043 *iter = 0;
00044 delete fg;
00045 }
00046 }
00047 boolean gnFASSource::HasContig( const string& nameStr ) const
00048 {
00049 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00050 {
00051 if( nameStr == m_contigList[i]->GetName() )
00052 return true;
00053 }
00054 return false;
00055 }
00056 uint32 gnFASSource::GetContigID( const string& name ) const
00057 {
00058 for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00059 {
00060 if( name == m_contigList[i]->GetName() )
00061 return i;
00062 }
00063 return ALL_CONTIGS;
00064 }
00065 string gnFASSource::GetContigName( const uint32 i ) const
00066 {
00067 if( i < m_contigList.size() ){
00068 return m_contigList[i]->GetName();
00069 }
00070 return "";
00071 }
00072 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00073 {
00074 if( i < m_contigList.size() ){
00075 return m_contigList[i]->GetSeqLength();
00076 }else if( i == ALL_CONTIGS){
00077 gnSeqI seqlen = 0;
00078 for(uint32 j=0; j < m_contigList.size(); j++)
00079 seqlen += m_contigList[j]->GetSeqLength();
00080 return seqlen;
00081 }
00082 return GNSEQI_ERROR;
00083 }
00084 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00085 {
00086 if( i < m_contigList.size() ){
00087 return m_contigList[i];
00088 }
00089 return 0;
00090 }
00091
00092 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI )
00093 {
00094 m_ifstream.clear();
00095 uint32 contigIndex = contigI;
00096 uint64 startPos = 0;
00097 uint64 readableBytes = 0;
00098 if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00099 bufLen = 0;
00100 return false;
00101 }
00102
00103 if( contigI == ALL_CONTIGS ){
00104 uint32 curLen = 0;
00105 uint64 bytesRead = 0;
00106 while (curLen < bufLen){
00107
00108 if(readableBytes <= 0)
00109 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00110 bufLen = curLen;
00111 return true;
00112 }
00113
00114 uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00115 Array<gnSeqC> array_buf( readLen );
00116 gnSeqC* tmpBuf = array_buf.data;
00117
00118
00119 m_ifstream.read(tmpBuf, readLen);
00120 uint64 gc = m_ifstream.gcount();
00121 bytesRead += gc;
00122 readableBytes -= gc;
00123
00124
00125 for(uint32 i=0; i < gc; i++){
00126 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127 buf[curLen] = tmpBuf[i];
00128 curLen++;
00129 }
00130 }
00131 if(m_ifstream.eof()){
00132 m_ifstream.clear();
00133 bufLen = curLen;
00134 return true;
00135 }
00136 }
00137 bufLen = curLen;
00138 }
00139 else if( contigI < m_contigList.size() )
00140 {
00141 uint32 curLen = 0;
00142
00143 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00144 bufLen = bufLen < contigSize ? bufLen : contigSize;
00145 while (curLen < bufLen)
00146 {
00147 uint64 readLen = bufLen - curLen;
00148 Array<gnSeqC> array_buf( readLen );
00149 gnSeqC* tmpBuf = array_buf.data;
00150
00151
00152 m_ifstream.read(tmpBuf, readLen);
00153 uint64 gc = m_ifstream.gcount();
00154
00155
00156 for(uint32 i=0; i < gc; i++){
00157 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158 buf[curLen] = tmpBuf[i];
00159 curLen++;
00160 }
00161 }
00162 if(m_ifstream.eof()){
00163 m_ifstream.clear();
00164 bufLen = curLen;
00165 return true;
00166 }
00167 }
00168 bufLen = curLen;
00169 }
00170 return true;
00171 }
00172
00173
00174
00175
00176
00177
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180 if( contigI == ALL_CONTIGS ){
00181
00182 gnSeqI curIndex = 0;
00183 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184 for( ; iter != m_contigList.end(); ++iter )
00185 {
00186 uint64 len = (*iter)->GetSeqLength();
00187 if( (curIndex + len) > start )
00188 break;
00189 curIndex += len;
00190 }
00191 if( iter == m_contigList.end() )
00192 return false;
00193
00194 gnSeqI startIndex = start - curIndex;
00195 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196 }
00197 else if( contigI < m_contigList.size() )
00198 {
00199 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200 }
00201 return false;
00202 }
00203
00204
00205
00206 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00207 readableBytes = 0;
00208 uint32 curLen = 0;
00209
00210 startPos = contig.GetSectStartEnd(gnContigSequence).first;
00211
00212 if(contig.HasRepeatSeqGap())
00213 if(contig.GetRepeatSeqGapSize().first > 0)
00214 if(contig.GetRepeatSeqGapSize().second > 0){
00215
00216 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00217 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00218 m_ifstream.seekg( startPos, ios::beg );
00219 return true;
00220 }
00221 m_ifstream.seekg( startPos, ios::beg );
00222 if( m_ifstream.eof() ){
00223 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00224 return false;
00225 }
00226 while( true ){
00227
00228
00229 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00230 if(tmpbufsize == 0){
00231 ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00232 return false;
00233 }
00234 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;
00235 Array<char> array_buf( tmpbufsize );
00236 char *tmpbuf = array_buf.data;
00237 m_ifstream.read( tmpbuf, tmpbufsize );
00238 if( m_ifstream.eof() ){
00239 ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00240 return false;
00241 }
00242 for( uint32 i=0; i < tmpbufsize; ++i ){
00243 if( m_pFilter->IsValid(tmpbuf[i]) ){
00244 if( curLen >= start ){
00245 startPos += i;
00246 m_ifstream.seekg( startPos, ios::beg );
00247 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00248 return true;
00249 }
00250 ++curLen;
00251 }
00252 }
00253 startPos += tmpbufsize;
00254 }
00255 return true;
00256 }
00257
00258
00259 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00260 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00261 if(!m_ofstream.is_open())
00262 return false;
00263 uint32 contigCount = source->GetContigListLength();
00264 for(uint32 contigI = 0; contigI < contigCount; contigI++){
00265 string contigName = source->GetContigName(contigI);
00266 m_ofstream << ">" << contigName << ";\n";
00267 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00268 while(seqLength > 0){
00269 gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;
00270 Array<gnSeqC> array_buf( writeLen+1 );
00271 gnSeqC *bases = array_buf.data;
00272 boolean success = source->SeqRead(0, bases, writeLen, contigI);
00273 if(!success)
00274 return false;
00275 bases[writeLen] = '\0';
00276 m_ofstream << bases << "\n";
00277 seqLength -= writeLen;
00278 }
00279 }
00280 m_ofstream.close();
00281 return true;
00282 }
00283
00284
00285 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00286 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00287 if(!m_ofstream.is_open())
00288 Throw_gnEx(FileNotOpened());
00289 Write(seq, m_ofstream, write_coords, enforce_unique_names);
00290 m_ofstream.close();
00291 }
00292
00293 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){
00294 vector<string> contigNameList;
00295 Array<gnSeqC> array_buf( BUFFER_SIZE );
00296 gnSeqC *bases = array_buf.data;
00297 gnGenomeSpec* spec = seq.GetSpec();
00298
00299
00300 string newline = "\r\n";
00301 gnSeqI readOffset = 1;
00302
00303 for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00304
00305 string contigName = seq.contigName(fragI);
00306
00307 if(enforce_unique_names){
00308 uint32 name_count = 0;
00309 for(uint32 i=0; i < contigNameList.size(); i++)
00310 if(contigNameList[i] == contigName)
00311 name_count++;
00312 contigNameList.push_back(contigName);
00313 if(name_count > 0)
00314 contigName += "_" + uintToString(name_count);
00315 }
00316
00317 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00318 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00319 string header = "";
00320 if(gpbh != NULL){
00321 header = gpbh->GetHeader();
00322
00323 uint32 newlinepos = header.find_first_of('\n', 0);
00324 if(newlinepos != string::npos){
00325 if(header[newlinepos-1] == '\r')
00326 newlinepos--;
00327 header = header.substr(0, newlinepos);
00328 }
00329 }
00330
00331
00332 gnSeqI readLength = seq.contigLength(fragI);
00333 m_ostream << ">" << contigName;
00334 if(write_coords)
00335 m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00336 m_ostream << "; " << header << newline;
00337
00338 gnSeqI linePos = 0;
00339 while(readLength > 0){
00340 gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00341 gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00342
00343 if(!seq.ToArray(bases, writeLen, readOffset))
00344 return;
00345 for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00346 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00347 m_ostream << string(bases + curbase, writeout_size) << newline;
00348 }
00349 readLength -= writeLen;
00350 readOffset += writeLen;
00351 }
00352 if(linePos != 0)
00353 m_ostream << newline;
00354 }
00355
00356 }
00357
00358 gnGenomeSpec *gnFASSource::GetSpec() const{
00359 gnGenomeSpec *spec = new gnGenomeSpec();
00360 for(uint32 i=0; i < m_contigList.size(); i++){
00361
00362 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00363 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00364
00365 spec->AddSpec(fragmentSpec, i);
00366 fragmentSpec->AddSpec(contigSpec);
00367
00368 fragmentSpec->SetName(m_contigList[i]->GetName());
00369 fragmentSpec->SetSourceName(m_openString);
00370 contigSpec->SetName(m_contigList[i]->GetName());
00371 contigSpec->SetSourceName(m_openString);
00372
00373 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00374 if(headsect.first != headsect.second){
00375 gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00376 fragmentSpec->AddHeader(gpsh, 0);
00377 }
00378 }
00379 return spec;
00380 }
00381
00382 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00383 if(m_contigList.size() > contigI)
00384 return m_contigList[contigI];
00385 return NULL;
00386 }
00387
00388 boolean gnFASSource::ParseStream( istream& fin )
00389 {
00390
00391 uint32 readState = 0;
00392 gnFileContig* currentContig = 0;
00393 string nameFStr;
00394 uint64 seqLength = 0, gapLength = 0;
00395
00396 uint64 streamPos = 0;
00397 uint64 bufReadLen = 0;
00398 Array<gnSeqC> array_buf( BUFFER_SIZE );
00399 char* buf = array_buf.data;
00400 boolean paren_hit = false;
00401 uint32 repeatSeqSize = 0;
00402
00403
00404 DetermineNewlineType();
00405
00406
00407 while( !fin.eof() )
00408 {
00409
00410 fin.read( buf, BUFFER_SIZE);
00411 streamPos += bufReadLen;
00412 bufReadLen = fin.gcount();
00413 for( uint32 i=0 ; i < bufReadLen ; i++ )
00414 {
00415 char ch = buf[i];
00416 switch( readState )
00417 {
00418 case 0:
00419 if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00420 {
00421 return false;
00422 }
00423 readState = 1;
00424 case 1:
00425 if( ch == '>' )
00426 {
00427 seqLength = 0; gapLength = 0;
00428 currentContig = new gnFileContig();
00429 currentContig->SetFileStart( streamPos + i );
00430 currentContig->SetRepeatSeqGap(true);
00431 currentContig->SetRepeatSeqSize( repeatSeqSize );
00432 currentContig->SetRepeatGapSize( m_newlineSize );
00433 readState = 2;
00434 paren_hit = false;
00435 nameFStr = "";
00436 }
00437 else
00438 ++gapLength;
00439 break;
00440 case 2:
00441 if( isNewLine(ch) || ch == ';')
00442 {
00443 currentContig->SetName( nameFStr );
00444 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00445 if( ch == ';' )
00446 readState = 3;
00447 else{
00448 readState = 4;
00449 if(ch == '\r')
00450 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00451 }
00452 }
00453 else if( ch == '(' ){
00454 if(isSpace(buf[i-1])){
00455
00456 nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00457 }
00458 paren_hit = true;
00459 }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00460 nameFStr += ch;
00461 break;
00462 case 3:
00463 if( isNewLine(ch) )
00464 readState = 4;
00465 break;
00466 case 4:
00467 if( ch == '>' )
00468 {
00469 readState = 3;
00470 }
00471 else if( m_pFilter->IsValid(ch) )
00472 {
00473 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00474 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00475 seqLength = 1; gapLength = 0;
00476 readState = 5;
00477 }
00478 break;
00479 case 5:
00480 while(i < bufReadLen){
00481 ch = buf[i];
00482 if( m_pFilter->IsValid(ch) ){
00483 if(gapLength > 0){
00484 if(seqLength != repeatSeqSize)
00485 currentContig->SetRepeatSeqGap(false);
00486 if(gapLength != m_newlineSize)
00487
00488 currentContig->SetRepeatSeqGap(false);
00489 currentContig->AddToSeqLength( seqLength );
00490 seqLength = 0;
00491 gapLength = 0;
00492 }
00493 seqLength++;
00494 }else if( ch == '>'){
00495 currentContig->AddToSeqLength( seqLength );
00496 currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00497 currentContig->SetFileEnd( streamPos + i - 1 );
00498 m_contigList.push_back(currentContig);
00499 readState = 1;
00500 i--;
00501 break;
00502 }
00503 else if( isNewLine(ch) ){
00504 if( repeatSeqSize == 0 ){
00505 repeatSeqSize = seqLength;
00506 currentContig->SetRepeatSeqSize( repeatSeqSize );
00507 }
00508 gapLength++;
00509 }
00510 else{
00511
00512 currentContig->SetRepeatSeqGap(false);
00513 }
00514 i++;
00515 }
00516 break;
00517 default:
00518 ErrorMsg("ERROR");
00519 return false;
00520 break;
00521 }
00522 }
00523 }
00524
00525 if( currentContig != 0 )
00526 {
00527 if( readState == 2 )
00528 {
00529 currentContig->SetName( nameFStr );
00530 }
00531 if( (readState >= 2) && (readState < 5) )
00532 {
00533 currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00534 }
00535 else if( readState == 5 )
00536 {
00537 currentContig->AddToSeqLength( seqLength );
00538 currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00539 }
00540 currentContig->SetFileEnd( streamPos + bufReadLen );
00541 m_contigList.push_back(currentContig);
00542 }
00543 m_ifstream.clear();
00544 return true;
00545 }