Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

gnDNXSource.cpp

Go to the documentation of this file.
00001 /////////////////////////////////////////////////////////////////////////////
00002 // File:            gnDNXSource.cpp
00003 // Purpose:         Implements gnBaseSource for .DNX files
00004 // Description:     
00005 // Changes:        
00006 // Version:         libGenome 0.5.1 
00007 // Author:          Aaron Darling 
00008 // Modified by:     
00009 // Copyright:       (c) Aaron Darling 
00010 // Licenses:        See COPYING file for details
00011 /////////////////////////////////////////////////////////////////////////////
00012 #include "gn/gnDNXSource.h"
00013 #include "gn/gnSourceSpec.h"
00014 #include "gn/gnStringSpec.h"
00015 #include "gn/gnSourceFactory.h"
00016 #include "gn/gnFASSource.h"
00017 #include "gn/gnGBKSource.h"
00018 #include "gn/gnBaseHeader.h"
00019 #include "gn/gnFilter.h"
00020 #include "gn/gnDebug.h"
00021 #include "gn/gnStringTools.h"
00022 #include <string>
00023 
00024 gnDNXSource::gnDNXSource()
00025 {
00026         m_DNXSpec = new gnGenomeSpec();
00027         m_pFilter = gnFilter::fullDNASeqFilter();
00028         if(m_pFilter == NULL){
00029                 DebugMsg("Error using static sequence filters.");
00030         }
00031 }
00032 
00033 gnDNXSource::gnDNXSource( const gnDNXSource& s ) : gnFileSource(s)
00034 {
00035         if(s.m_DNXSpec != NULL)
00036                 m_DNXSpec = s.m_DNXSpec->Clone();
00037 }
00038 
00039 gnDNXSource::~gnDNXSource()
00040 {
00041         m_ifstream.close();
00042         delete m_DNXSpec;
00043 }
00044 boolean gnDNXSource::HasContig( const string& name ) const
00045 {
00046         for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00047                 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00048                         return true;
00049         }
00050         return false;
00051 }
00052 uint32 gnDNXSource::GetContigID( const string& name ) const
00053 {
00054         for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00055                 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00056                         return contigI;
00057         }
00058         return ALL_CONTIGS;
00059 }
00060 string gnDNXSource::GetContigName( const uint32 i ) const
00061 {
00062         if(i < m_DNXSpec->GetSpecListLength()){
00063                 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00064                 return gnbs->GetName();
00065         }
00066         return "";
00067 }
00068 gnSeqI gnDNXSource::GetContigSeqLength( const uint32 i ) const
00069 {
00070         if( i == ALL_CONTIGS){
00071                 return m_DNXSpec->GetLength();
00072         }else if(i < m_DNXSpec->GetSpecListLength()){
00073                 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00074                 return gnbs->GetLength();
00075         }
00076         return 0;
00077 }
00078 //read raw data from the file
00079 
00080 void gnDNXSource::ValidateName(string& name){
00081         if(name == ""){ //make a random one.
00082                 name.resize(4);
00083                 srand(time(NULL));
00084                 for(int i=0; i < 4; i++)        
00085                         name[i] = (rand() % 26) + 64;
00086         }
00087 }
00088 
00089 boolean gnDNXSource::Write(gnGenomeSpec* spec, const string& filename){
00090         ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00091         gnSourceFactory* m_sSourceFactory = gnSourceFactory::GetSourceFactory();
00092         if(!m_ofstream.is_open())
00093                 return false;
00094         for(uint32 i=0; i < spec->GetSpecListLength(); i++){    //each of these will be dnx statements
00095                 gnFragmentSpec* curStatementSpec = spec->GetSpec(i);
00096                 string sourceName = spec->GetSourceName();
00097                 string statementName = spec->GetName();
00098                 if(!m_sSourceFactory->HasSource(sourceName)){
00099                         ValidateName(statementName);
00100                         statementName += ".seq";
00101                         m_ofstream << statementName << "=";
00102                 }else
00103                         m_ofstream << sourceName << "=";
00104                 for(uint32 j=0; j < curStatementSpec->GetSpecListLength(); j++){        //each of these will be the files
00105                                                                                                                         //referred to by the dnx statement
00106                         gnContigSpec* curSubSpec = curStatementSpec->GetSpec(i);
00107                         sourceName = curStatementSpec->GetSourceName();
00108                         string contigName = curStatementSpec->GetName();
00109                         if(!m_sSourceFactory->HasSource(sourceName)){
00110                                 ValidateName(contigName);
00111                                 string writename = contigName+".seq";
00112                                 gnSequence gns = *curSubSpec;
00113                                 gnGBKSource::Write(gns, writename);
00114                                 m_ofstream << writename;
00115                         }else
00116                                 m_ofstream << sourceName;
00117                         if( j + 1 < curStatementSpec->GetSpecListLength())
00118                                 m_ofstream << "+";
00119                 }
00120                 m_ofstream << ";";
00121                 gnBaseHeader *gpbh = spec->GetHeader(0);
00122                 string header = "";
00123                 if(gpbh != NULL){
00124                         header = gpbh->GetHeader();
00125                         //delete everything after the first newline.
00126                         uint32 newlinepos = header.find_first_of('\n', 0);
00127                         if(newlinepos != string::npos)
00128                                 header = header.substr(0, newlinepos - 1);
00129                 }
00130                 m_ofstream << header << "\r\n";
00131         }
00132         m_ofstream.close();
00133         return true;
00134 }
00135 
00136 gnFileContig* gnDNXSource::GetFileContig( const uint32 contigI ) const{
00137         return NULL;    //returning NULL
00138 }
00139 
00140 //reads an inputstream and creates fills the spec vector appropriately
00141 boolean gnDNXSource::ParseStream( istream& fin )
00142 {
00143         // INIT temp varables
00144         uint32 readState = 0;  //10 - currently inside a comment
00145         uint32 sectionStart = 0;
00146         gnFragmentSpec* currentFragSpec = 0;
00147         gnBaseSource *currentSource;
00148         string currentSourceName;
00149         uint32 currentContig = ALL_CONTIGS;
00150         uint32 currentSeqStart = 0;
00151         boolean currentRevComp = false;
00152         // INIT buffer
00153         uint64 bufReadLen = 0;
00154         uint64 remainingBuffer = 0;
00155         Array<char> array_buf( BUFFER_SIZE );
00156         char* buf = array_buf.data;
00157         string curliteral;
00158         
00159         //Get the source factory and add the current dnx path to it.
00160         gnSourceFactory *sourceFactory = gnSourceFactory::GetSourceFactory();
00161         sourceFactory->AddPath(getPathString(m_openString));
00162 
00163         while( !fin.eof() )
00164         {
00165                 if(sectionStart > 0){
00166                         remainingBuffer = bufReadLen - sectionStart;
00167                         if(readState == 5){     //add literal
00168                                 curliteral += string(buf, sectionStart, remainingBuffer);
00169                                 remainingBuffer = 0;
00170                                 sectionStart = bufReadLen;
00171                         }else
00172                                 memmove(buf, buf+sectionStart, remainingBuffer);
00173                 }
00174                   // read chars
00175                 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart));
00176                 sectionStart = 0;
00177                 bufReadLen = fin.gcount() + remainingBuffer;
00178                 
00179                 for( uint32 i=0 ; i < bufReadLen ; i++ )
00180                 {
00181                         char ch = buf[i];
00182                         switch( readState )
00183                         {
00184                                 case 0: // Get name of genome
00185                                         if(ch == '='){
00186                                                 //genome name is from sectionStart to i
00187                                                 string contigName(buf+sectionStart, i - sectionStart);
00188                                                 currentFragSpec = new gnFragmentSpec();
00189                                                 currentFragSpec->SetName(contigName);
00190                                                 currentFragSpec->SetSourceName(m_openString);
00191                                                 m_DNXSpec->AddSpec(currentFragSpec);
00192                                                 sectionStart = i+1;
00193                                                 readState = 1;
00194                                         }
00195                                         break;
00196                                 case 1: // Ignore whitespace before filename or literal
00197                                         if((ch == ' ')||(ch == '        '))
00198                                                 break;
00199                                 case 2: // Are we getting a new source file name or a literal?
00200                                         if(ch == '"'){ //getting a literal
00201                                                 readState = 5;
00202                                                 sectionStart = i+1;
00203                                                 break;
00204                                         }
00205                                         readState = 3;
00206                                         sectionStart = i;
00207                                 case 3: // Get a new source file name
00208                                         //stop on >, (, +, and \n
00209                                         if(ch == '\n' && sectionStart == i -1){
00210                                                 if(buf[sectionStart]=='\r'){
00211                                                         sectionStart = i + 1;
00212                                                         break;
00213                                                 }
00214                                         }
00215                                         if((ch == '+')||(ch == '>')||(ch == '(')||(ch == '\n')||(ch == ';')){
00216                                                 //use the entire source file
00217                                                 string seqfile(buf, sectionStart, i - sectionStart);
00218                                                 currentSourceName = seqfile;
00219                                                 currentSource = sourceFactory->AddSource(seqfile, true);
00220                                                 if (currentSource==NULL)
00221                                                 {
00222                                                         return false;
00223                                                 }
00224                                                 if((ch == '+')||(ch == '\n')||(ch == ';')){
00225                                                         gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource);
00226                                                         tmp_spec->SetSourceName(seqfile);
00227                                                         currentFragSpec->AddSpec(tmp_spec);
00228                                                         readState = 1;
00229                                                         if(ch == '\n'){ //reached the end of the statement.  parse another.
00230                                                                 readState = 0;
00231                                                         }else if(ch == ';'){    //hit a comment.
00232                                                                 readState = 9;
00233                                                         }
00234                                                 }else if(ch == '>'){  //select a contig to use
00235                                                         readState = 4;
00236                                                 }else if(ch == '('){  // use a specified section of the entire file
00237                                                         readState = 6;
00238                                                 }
00239                                                 sectionStart = i + 1;
00240                                         }
00241                                         break;
00242                                 case 4: // Get a specific contig to use
00243                                         //stop on (, +, ;, and \n
00244                                         if((ch == '+')||(ch == '\n')||(ch == ';')||(ch == '(')){
00245                                                 //use the entire contig
00246                                                 string contigname(buf, sectionStart, i - sectionStart);
00247                                                 currentContig = currentSource->GetContigID(contigname);
00248                                                 if((ch == '+')||(ch == '\n')||(ch == ';')){
00249                                                         gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig);
00250                                                         tmp_spec->SetSourceName(currentSourceName);
00251                                                         currentFragSpec->AddSpec(tmp_spec);
00252                                                         readState = 1;
00253                                                         if(ch == '\n'){ //reached the end of the statement.  parse another.
00254                                                                 readState = 0;
00255                                                         }else if(ch == ';'){    //hit a comment.
00256                                                                 readState = 9;
00257                                                         }
00258                                                 }else if(ch == '('){  //use the specified section
00259                                                         readState = 6;
00260                                                 }
00261                                                 sectionStart = i + 1;
00262                                         }
00263                                         break;
00264                                 case 5: // read in a literal
00265                                         // stop on "
00266                                         if(ch == '"'){
00267                                                 //now create a string spec from sectionStart to i-1
00268                                                 string literal(buf, sectionStart, i - sectionStart);
00269                                                 if(curliteral.length() > 0){
00270                                                         literal += curliteral;
00271                                                         curliteral = "";
00272                                                 }
00273                                                 gnStringSpec *gpss = new gnStringSpec(literal, currentFragSpec->GetSpecListLength());
00274                                                 currentFragSpec->AddSpec(gpss);
00275                                         }
00276                                 case 6: // read in a specified section
00277                                         //stop on , or < or >
00278                                         if((ch == ',') || (ch == '<') || (ch == '>')){
00279                                                 string seqstartstring(buf, sectionStart, i - sectionStart);
00280                                                 if(seqstartstring == "lend"){
00281                                                         currentSeqStart = 0;
00282                                                 }else if (seqstartstring == "rend"){
00283                                                         currentSeqStart = GNSEQI_END;
00284                                                 }else
00285                                                         currentSeqStart = atoi(seqstartstring.c_str()) - 1;
00286                                                 if(ch == '<')
00287                                                         currentRevComp = true;
00288                                                 sectionStart = i + 1;
00289                                                 readState = 7;
00290                                         }
00291                                         break;
00292                                 case 7: // read in the second half of a specified section
00293                                         //stop on )
00294                                         if(ch == ')'){
00295                                                 string seqendstring(buf, sectionStart, i - sectionStart);
00296                                                 uint32 currentSeqEnd = GNSEQI_END;
00297                                                 if(seqendstring == "lend"){
00298                                                         currentSeqEnd = 0;
00299                                                 }else if (seqendstring == "rend"){
00300                                                         currentSeqEnd = GNSEQI_END;
00301                                                 }else
00302                                                         currentSeqEnd = atoi(seqendstring.c_str()) - 1;
00303                                                 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp);
00304                                                 tmp_spec->SetSourceName(currentSourceName);
00305                                                 currentFragSpec->AddSpec(tmp_spec);
00306                                                 currentRevComp = false; //set it back to its default value.
00307                                                 sectionStart = i + 1;
00308                                                 readState = 8; //look for connective operator
00309                                         }
00310                                         break;
00311                                 case 8: //skip whitespace until a connective or terminating operator is reached.
00312                                         if(ch == '+'){
00313                                                 sectionStart = i + 1;
00314                                                 readState = 1; //start over
00315                                         }
00316                                         if(ch == '\n'){
00317                                                 sectionStart = i + 1;
00318                                                 readState = 0;
00319                                         }
00320                                         if(ch == ';'){
00321                                                 sectionStart = i + 1;
00322                                                 readState = 9;
00323                                         }
00324                                         break;
00325                                 case 9: //skip comment until newline.
00326                                         if(ch == '\n'){
00327                                                 sectionStart = i + 1;
00328                                                 readState = 0;
00329                                         }
00330                                         break;
00331                                 default:
00332                                         DebugMsg("ERROR in file\n");
00333                                         return false;
00334                                         break;
00335                         }
00336                 }// for all buf
00337         }// while !eof
00338         // CLEAN UP
00339         return true;
00340 }

Generated on Mon Feb 3 02:34:39 2003 for libGenome by doxygen1.3-rc3