Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

gnFASSource.cpp

Go to the documentation of this file.
00001 /////////////////////////////////////////////////////////////////////////////
00002 // File:            gnFASSource.h
00003 // Purpose:         Implements gnBaseSource for .FAS files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00011 /////////////////////////////////////////////////////////////////////////////
00012 #include "gn/gnFASSource.h"
00013 #include "gn/gnBaseSpec.h"
00014 #include "gn/gnFilter.h"
00015 #include "gn/gnStringTools.h"
00016 #include "gn/gnSourceSpec.h"
00017 #include "gn/gnSourceHeader.h"
00018 #include "gn/gnDebug.h"
00019 
00020 gnFASSource::gnFASSource()
00021 {
00022         m_openString = "";
00023         m_pFilter = gnFilter::fullDNASeqFilter();
00024         if(m_pFilter == NULL){
00025                 DebugMsg("Error using static sequence filters.");
00026         }
00027 }
00028 gnFASSource::gnFASSource( const gnFASSource& s ) : gnFileSource(s)
00029 {
00030         vector< gnFileContig* >::const_iterator iter = s.m_contigList.begin();
00031         for( ; iter != s.m_contigList.end(); ++iter )
00032         {
00033                 m_contigList.push_back( (*iter)->Clone() );
00034         }
00035 }
00036 gnFASSource::~gnFASSource()
00037 {
00038         m_ifstream.close();
00039         vector< gnFileContig* >::iterator iter = m_contigList.begin();
00040         for( ; iter != m_contigList.end(); ++iter )
00041         {
00042                 gnFileContig* fg = *iter;
00043                 *iter = 0;
00044                 delete fg;
00045         }
00046 }
00047 boolean gnFASSource::HasContig( const string& nameStr ) const
00048 {
00049         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00050         {
00051                 if( nameStr == m_contigList[i]->GetName() )
00052                         return true;
00053         }
00054         return false;
00055 }
00056 uint32 gnFASSource::GetContigID( const string& name ) const
00057 {
00058         for(uint32 i = 0 ; i <= m_contigList.size(); i++ )
00059         {
00060                 if( name == m_contigList[i]->GetName() )
00061                         return i;
00062         }
00063         return ALL_CONTIGS;
00064 }
00065 string gnFASSource::GetContigName( const uint32 i ) const
00066 {
00067         if( i < m_contigList.size() ){
00068                 return m_contigList[i]->GetName();
00069         }
00070         return "";
00071 }
00072 gnSeqI gnFASSource::GetContigSeqLength( const uint32 i ) const
00073 {
00074         if( i < m_contigList.size() ){
00075                 return m_contigList[i]->GetSeqLength();
00076         }else if( i == ALL_CONTIGS){
00077                 gnSeqI seqlen = 0;
00078                 for(uint32 j=0; j < m_contigList.size(); j++)
00079                         seqlen += m_contigList[j]->GetSeqLength();
00080                 return seqlen;
00081         }
00082         return GNSEQI_ERROR;
00083 }
00084 gnFileContig* gnFASSource::GetContig( const uint32 i ) const
00085 {
00086         if( i < m_contigList.size() ){
00087                 return m_contigList[i];
00088         }
00089         return 0;
00090 }
00091 
00092 boolean gnFASSource::SeqRead( const gnSeqI start, char* buf, gnSeqI& bufLen, const uint32 contigI ) 
00093 {
00094         m_ifstream.clear();
00095         uint32 contigIndex = contigI;
00096         uint64 startPos = 0;
00097         uint64 readableBytes = 0;
00098         if( !SeqSeek( start, contigIndex, startPos, readableBytes ) ){
00099                 bufLen = 0;
00100                 return false;
00101         }
00102         
00103         if( contigI == ALL_CONTIGS ){
00104                 uint32 curLen = 0;
00105                 uint64 bytesRead = 0;
00106                 while (curLen < bufLen){
00107 //SeqSeek to start, Figure out how much can be read before SeqSeeking again.
00108                         if(readableBytes <= 0)  //Look out for zero length contigs!  IMPLEMENT ME
00109                                 if( !SeqSeek( start + curLen, contigIndex, startPos, readableBytes ) ){
00110                                         bufLen = curLen;
00111                                         return true;
00112                                 }
00113                         //readLen is the amount to read on this pass
00114                         uint64 readLen = (bufLen - curLen) < readableBytes ? (bufLen - curLen) : readableBytes;
00115                         Array<gnSeqC> array_buf( readLen );
00116                         gnSeqC* tmpBuf = array_buf.data;        //read into tmpBuf, then filter tmpBuf into curBuf
00117 
00118                         // read chars and filter
00119                         m_ifstream.read(tmpBuf, readLen);
00120                         uint64 gc = m_ifstream.gcount();
00121                         bytesRead += gc;
00122                         readableBytes -= gc;
00123 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00124 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00125                         for(uint32 i=0; i < gc; i++){
00126                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00127                                         buf[curLen] = tmpBuf[i];
00128                                         curLen++;
00129                                 }
00130                         }
00131                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00132                                 m_ifstream.clear();
00133                                 bufLen = curLen;
00134                                 return true;
00135                         }
00136                 }
00137                 bufLen = curLen;
00138         }
00139         else if( contigI < m_contigList.size() )
00140         {
00141                 uint32 curLen = 0;
00142                 //check to see if the buffer is bigger than the contig.  if so truncate it.
00143                 gnSeqI contigSize = m_contigList[contigI]->GetSeqLength();
00144                 bufLen = bufLen < contigSize ? bufLen : contigSize;
00145                 while (curLen < bufLen)
00146                 {
00147                         uint64 readLen = bufLen - curLen;       //the amount to read on this pass
00148                         Array<gnSeqC> array_buf( readLen );     // use Array template to ensure proper deallocation
00149                         gnSeqC* tmpBuf = array_buf.data;        //read into tmpBuf, then filter tmpBuf into curBuf
00150 
00151                         // read chars and filter
00152                         m_ifstream.read(tmpBuf, readLen);
00153                         uint64 gc = m_ifstream.gcount();
00154 //                      cout << "Read " << gc << " chars from " << m_openString << "\n";
00155 //                      cout << "Checking character validity on: " << tmpBuf << "\n";
00156                         for(uint32 i=0; i < gc; i++){
00157                                 if( m_pFilter->IsValid(tmpBuf[i]) ){
00158                                         buf[curLen] = tmpBuf[i];
00159                                         curLen++;
00160                                 }
00161                         }
00162                         if(m_ifstream.eof()){   //we hit the end of the file.  bail out.
00163                                 m_ifstream.clear();
00164                                 bufLen = curLen;
00165                                 return true;
00166                         }
00167                 }
00168                 bufLen = curLen;
00169         }
00170         return true;
00171 }
00172 
00173 
00174 // private:
00175 // figures out which contig the sequence starts at then calls SeqStartPos to get the offset within that contig
00176 // returns startPos, the file offset where the sequence starts
00177 // returns true if successful, false otherwise
00178 boolean gnFASSource::SeqSeek( const gnSeqI start, const uint32 contigI, uint64& startPos, uint64& readableBytes )
00179 {
00180         if( contigI == ALL_CONTIGS ){
00181                 // find first contig
00182                 gnSeqI curIndex = 0;
00183                 vector< gnFileContig* >::iterator iter = m_contigList.begin();
00184                 for( ; iter != m_contigList.end(); ++iter )
00185                 {
00186                         uint64 len = (*iter)->GetSeqLength();
00187                         if( (curIndex + len) > start )
00188                                 break;
00189                         curIndex += len;
00190                 }
00191                 if( iter == m_contigList.end() )
00192                         return false;
00193                 // seek to start
00194                 gnSeqI startIndex = start - curIndex;  //startIndex is starting pos. within the contig
00195                 return SeqStartPos( startIndex, *(*iter), startPos, readableBytes );
00196         }
00197         else if( contigI < m_contigList.size() )
00198         {
00199                 return SeqStartPos( start, *(m_contigList[contigI]), startPos, readableBytes );
00200         }
00201         return false;
00202 }
00203 
00204 //Returns startPos, the file offset where the sequence starts.
00205 //IMPLEMENT ME! utilize HasRepeatSeqGap to skip slow character checking process (data isn't noisy)
00206 boolean gnFASSource::SeqStartPos( const gnSeqI start, gnFileContig& contig, uint64& startPos, uint64& readableBytes ){
00207         readableBytes = 0;
00208         uint32 curLen = 0;
00209         //seek to the file offset where the contig starts
00210         startPos = contig.GetSectStartEnd(gnContigSequence).first;      //set startPos to start where the contig starts
00211 
00212         if(contig.HasRepeatSeqGap())
00213                 if(contig.GetRepeatSeqGapSize().first > 0)
00214                         if(contig.GetRepeatSeqGapSize().second > 0){
00215 //check this    
00216                                 startPos += start + (start/contig.GetRepeatSeqGapSize().first)*contig.GetRepeatSeqGapSize().second;
00217                                 readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00218                                 m_ifstream.seekg( startPos, ios::beg );
00219                                 return true;
00220                         }
00221         m_ifstream.seekg( startPos, ios::beg );
00222         if( m_ifstream.eof() ){
00223                 ErrorMsg("ERROR in gnFASSource::Incorrect contig start position, End of file reached!\n");
00224                 return false;
00225         }
00226         while( true ){
00227                   // READ the rest of the contig skipping over invalid characters until we get to the starting base pair.
00228                   // startPos will contain the file offset with the starting base pair
00229                 uint32 tmpbufsize = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00230                 if(tmpbufsize == 0){
00231                         ErrorMsg("ERROR in gnFASSource: stored contig size is incorrect.\n");
00232                         return false;
00233                 }
00234                 tmpbufsize = tmpbufsize < BUFFER_SIZE ? tmpbufsize : BUFFER_SIZE;  //read in the smaller of the two.
00235                 Array<char> array_buf( tmpbufsize );    // use Array template to ensure proper deallocation
00236                 char *tmpbuf = array_buf.data;
00237                 m_ifstream.read( tmpbuf, tmpbufsize );
00238                 if( m_ifstream.eof() ){
00239                         ErrorMsg("ERROR in gnFASSource::Read End of file reached!\n");
00240                         return false;
00241                 }
00242                 for( uint32 i=0; i < tmpbufsize; ++i ){
00243                         if( m_pFilter->IsValid(tmpbuf[i]) ){
00244                                 if( curLen >= start ){ //stop when we reach the starting offset within this contig
00245                                         startPos += i;
00246                                         m_ifstream.seekg( startPos, ios::beg );  //seek to startPos
00247                                         readableBytes = contig.GetSectStartEnd(gnContigSequence).second - startPos;
00248                                         return true;
00249                                 }
00250                                 ++curLen;  //each time we read a valid b.p., increment the sequence length
00251                         }
00252                 }
00253                 startPos += tmpbufsize;
00254         }
00255         return true;
00256 }
00257 
00258 //write out the given source in this file format.
00259 boolean gnFASSource::Write(gnBaseSource *source, const string& filename){
00260         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00261         if(!m_ofstream.is_open())
00262                 return false;
00263         uint32 contigCount = source->GetContigListLength();
00264         for(uint32 contigI = 0; contigI < contigCount; contigI++){
00265                 string contigName = source->GetContigName(contigI);
00266                 m_ofstream << ">" << contigName << ";\n";
00267                 gnSeqI seqLength = source->GetContigSeqLength(contigI);
00268                 while(seqLength > 0){
00269                         gnSeqI writeLen = seqLength < BUFFER_SIZE ? seqLength : BUFFER_SIZE;            
00270                         Array<gnSeqC> array_buf( writeLen+1 );  // use Array template to ensure proper deallocation
00271                         gnSeqC *bases = array_buf.data;
00272                         boolean success = source->SeqRead(0, bases, writeLen, contigI);
00273                         if(!success)
00274                                 return false;
00275                         bases[writeLen] = '\0';
00276                         m_ofstream << bases << "\n";
00277                         seqLength -= writeLen;
00278                 }
00279         }
00280         m_ofstream.close();
00281         return true;
00282 }
00283 
00284 //write out the given sequence in this file format.
00285 void gnFASSource::Write(gnSequence& seq, const string& filename, boolean write_coords, boolean enforce_unique_names){
00286         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00287         if(!m_ofstream.is_open())
00288                 Throw_gnEx(FileNotOpened());
00289         Write(seq, m_ofstream, write_coords, enforce_unique_names);
00290         m_ofstream.close();
00291 }
00292 
00293 void gnFASSource::Write(gnSequence& seq, ostream& m_ostream, boolean write_coords, boolean enforce_unique_names){
00294         vector<string> contigNameList;
00295         Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation
00296         gnSeqC *bases = array_buf.data;
00297         gnGenomeSpec* spec = seq.GetSpec();
00298         
00299         //set the newline type.
00300         string newline = "\r\n";
00301         gnSeqI readOffset = 1;
00302 
00303         for(uint32 fragI = 0; fragI < seq.contigListLength(); fragI++){
00304                 //write out contig name and header
00305                 string contigName = seq.contigName(fragI);
00306 
00307                 if(enforce_unique_names){
00308                         uint32 name_count = 0;
00309                         for(uint32 i=0; i < contigNameList.size(); i++)
00310                                 if(contigNameList[i] == contigName)
00311                                         name_count++;
00312                         contigNameList.push_back(contigName);
00313                         if(name_count > 0)
00314                                 contigName += "_" + uintToString(name_count);
00315                 }
00316 
00317                 gnFragmentSpec* subSpec = spec->GetSpec(fragI);
00318                 gnBaseHeader *gpbh = subSpec->GetHeader(0);
00319                 string header = "";
00320                 if(gpbh != NULL){
00321                         header = gpbh->GetHeader();
00322                         //delete everything after the first newline.
00323                         uint32 newlinepos = header.find_first_of('\n', 0);
00324                         if(newlinepos != string::npos){
00325                                 if(header[newlinepos-1] == '\r')
00326                                         newlinepos--;
00327                                 header = header.substr(0, newlinepos);
00328                         }
00329                 }       //IMPLEMENT ME!! Does header line need to be < 80 chars?
00330 
00331                 //write out the sequence
00332                 gnSeqI readLength = seq.contigLength(fragI);
00333                 m_ostream << ">" << contigName;
00334                 if(write_coords)
00335                         m_ostream << " " << "(" << readOffset << ", " << readOffset + readLength - 1 << ")";
00336                 m_ostream << "; " << header << newline;
00337 
00338                 gnSeqI linePos = 0;
00339                 while(readLength > 0){  //buffer the read/writes
00340                         gnSeqI read_chunk_size = (BUFFER_SIZE / FAS_LINE_WIDTH) * FAS_LINE_WIDTH;
00341                         gnSeqI writeLen = readLength < read_chunk_size ? readLength : read_chunk_size;
00342                         
00343                         if(!seq.ToArray(bases, writeLen, readOffset))
00344                                 return;
00345                         for(gnSeqI curbase = 0; curbase < writeLen; curbase += FAS_LINE_WIDTH){
00346                                 gnSeqI writeout_size = writeLen - curbase < FAS_LINE_WIDTH ? writeLen - curbase : FAS_LINE_WIDTH;
00347                                 m_ostream << string(bases + curbase, writeout_size) << newline;
00348                         }
00349                         readLength -= writeLen;
00350                         readOffset += writeLen;
00351                 }
00352                 if(linePos != 0)
00353                         m_ostream << newline;
00354         }
00355 
00356 }
00357 
00358 gnGenomeSpec *gnFASSource::GetSpec() const{
00359         gnGenomeSpec *spec = new gnGenomeSpec();
00360         for(uint32 i=0; i < m_contigList.size(); i++){
00361                 //create specs for the fragment and contig levels
00362                 gnFragmentSpec *fragmentSpec = new gnFragmentSpec();
00363                 gnSourceSpec *contigSpec = new gnSourceSpec((gnBaseSource*)this, i);
00364                 //set up the spec tree structure
00365                 spec->AddSpec(fragmentSpec, i);
00366                 fragmentSpec->AddSpec(contigSpec);
00367 
00368                 fragmentSpec->SetName(m_contigList[i]->GetName());
00369                 fragmentSpec->SetSourceName(m_openString);
00370                 contigSpec->SetName(m_contigList[i]->GetName());
00371                 contigSpec->SetSourceName(m_openString);
00372                 
00373                 pair<uint32, uint32> headsect = m_contigList[i]->GetSectStartEnd(gnContigHeader);
00374                 if(headsect.first != headsect.second){
00375                         gnSourceHeader *gpsh = new gnSourceHeader((gnBaseSource *)this, string(""), headsect.first, headsect.second - headsect.first);
00376                         fragmentSpec->AddHeader(gpsh, 0);
00377                 }
00378         }
00379         return spec;
00380 }
00381 
00382 gnFileContig* gnFASSource::GetFileContig( const uint32 contigI ) const{
00383         if(m_contigList.size() > contigI)
00384                 return m_contigList[contigI];
00385         return NULL;
00386 }
00387 
00388 boolean gnFASSource::ParseStream( istream& fin )
00389 {
00390         // INIT temp varables
00391         uint32 readState = 0;
00392         gnFileContig* currentContig = 0;
00393         string nameFStr;
00394         uint64 seqLength = 0, gapLength = 0;
00395         // INIT buffer
00396         uint64 streamPos = 0;
00397         uint64 bufReadLen = 0;
00398         Array<gnSeqC> array_buf( BUFFER_SIZE ); // use Array template to ensure proper deallocation
00399         char* buf = array_buf.data;
00400         boolean paren_hit = false;
00401         uint32 repeatSeqSize = 0;
00402 
00403         //decide what type of newlines we have
00404         DetermineNewlineType();
00405 
00406         
00407         while( !fin.eof() )
00408         {
00409                   // read chars
00410                 fin.read( buf, BUFFER_SIZE);
00411                 streamPos += bufReadLen;
00412                 bufReadLen = fin.gcount();
00413                 for( uint32 i=0 ; i < bufReadLen ; i++ )
00414                 {
00415                         char ch = buf[i];
00416                         switch( readState )
00417                         {
00418                                 case 0: // CHECK file validity
00419                                         if( !((buf[0] == '>') || !m_pFilter->IsValid(buf[0])) )
00420                                         {
00421                                                 return false; // NOT a .fas file
00422                                         }
00423                                         readState = 1;
00424                                 case 1: // BRANCH
00425                                         if( ch == '>' )
00426                                         {
00427                                                 seqLength = 0; gapLength = 0; // reset count
00428                                                 currentContig = new gnFileContig();
00429                                                 currentContig->SetFileStart( streamPos + i );
00430                                                 currentContig->SetRepeatSeqGap(true);
00431                                                 currentContig->SetRepeatSeqSize( repeatSeqSize );
00432                                                 currentContig->SetRepeatGapSize( m_newlineSize );
00433                                                 readState = 2;
00434                                                 paren_hit = false;
00435                                                 nameFStr = "";
00436                                         }
00437                                         else
00438                                                 ++gapLength;
00439                                         break;
00440                                 case 2: // > CONTIG NAME
00441                                         if( isNewLine(ch) || ch == ';')
00442                                         {
00443                                                 currentContig->SetName( nameFStr );
00444                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 1 );
00445                                                 if( ch == ';' )
00446                                                         readState = 3;
00447                                                 else{
00448                                                         readState = 4;
00449                                                         if(ch == '\r')
00450                                                                 currentContig->SetSectStart( gnContigHeader, streamPos + i + 2 );
00451                                                 }
00452                                         }
00453                                         else if( ch == '(' ){
00454                                                 if(isSpace(buf[i-1])){
00455                                                         //delete the last char in nameFStr
00456                                                         nameFStr = nameFStr.substr(0, nameFStr.length()-1);
00457                                                 }
00458                                                 paren_hit = true;
00459                                         }else if((!isSpace(ch) || nameFStr.length()) && !paren_hit )
00460                                                 nameFStr += ch;
00461                                         break;
00462                                 case 3: // >... ; REMARK
00463                                         if( isNewLine(ch) )
00464                                                 readState = 4;
00465                                         break;
00466                                 case 4: // >... ; /n NEWLINE BRANCH
00467                                         if( ch == '>' )
00468                                         {
00469                                                 readState = 3;
00470                                         }
00471                                         else if( m_pFilter->IsValid(ch) )
00472                                         {
00473                                                 currentContig->SetSectEnd( gnContigHeader, streamPos + i );
00474                                                 currentContig->SetSectStart( gnContigSequence, streamPos + i);
00475                                                 seqLength = 1; gapLength = 0;
00476                                                 readState = 5;
00477                                         }
00478                                         break;
00479                                 case 5: // SEQUENCE
00480                                         while(i < bufReadLen){
00481                                                 ch = buf[i];
00482                                                 if( m_pFilter->IsValid(ch) ){
00483                                                         if(gapLength > 0){
00484                                                                 if(seqLength != repeatSeqSize)
00485                                                                         currentContig->SetRepeatSeqGap(false);
00486                                                                 if(gapLength != m_newlineSize)
00487                                                                         //file is corrupt, can't do jumps.
00488                                                                         currentContig->SetRepeatSeqGap(false);
00489                                                                 currentContig->AddToSeqLength( seqLength );
00490                                                                 seqLength = 0;
00491                                                                 gapLength = 0;
00492                                                         }
00493                                                         seqLength++;
00494                                                 }else if( ch == '>'){
00495                                                         currentContig->AddToSeqLength( seqLength );
00496                                                         currentContig->SetSectEnd( gnContigSequence, streamPos + i - 1);
00497                                                         currentContig->SetFileEnd( streamPos + i - 1 );
00498                                                         m_contigList.push_back(currentContig);
00499                                                         readState = 1;
00500                                                         i--;
00501                                                         break;
00502                                                 }
00503                                                 else if( isNewLine(ch) ){
00504                                                         if( repeatSeqSize == 0 ){
00505                                                                 repeatSeqSize = seqLength;
00506                                                                 currentContig->SetRepeatSeqSize( repeatSeqSize );
00507                                                         }
00508                                                         gapLength++;
00509                                                 }
00510                                                 else{
00511                                                         // Error cannot do nice jumps
00512                                                         currentContig->SetRepeatSeqGap(false);
00513                                                 }
00514                                                 i++;
00515                                         }
00516                                         break;
00517                                 default:
00518                                         ErrorMsg("ERROR");
00519                                         return false;
00520                                         break;
00521                         }
00522                 }// for all buf
00523         }// while !eof
00524         // CLEAN UP
00525         if( currentContig != 0 )
00526         {
00527                 if( readState == 2 )
00528                 {
00529                         currentContig->SetName( nameFStr );
00530                 }
00531                 if( (readState >= 2) && (readState < 5) )
00532                 {
00533                         currentContig->SetSectEnd( gnContigHeader, streamPos + bufReadLen );
00534                 }
00535                 else if( readState ==  5 )
00536                 {
00537                         currentContig->AddToSeqLength( seqLength );
00538                         currentContig->SetSectEnd( gnContigSequence, streamPos + bufReadLen );
00539                 }                       
00540                 currentContig->SetFileEnd( streamPos + bufReadLen );
00541                 m_contigList.push_back(currentContig);
00542         }
00543         m_ifstream.clear();
00544         return true;
00545 }

Generated on Fri May 9 12:57:49 2003 for libGenome by doxygen1.3-rc3