00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "gn/gnDNXSource.h"
00013 #include "gn/gnSourceSpec.h"
00014 #include "gn/gnStringSpec.h"
00015 #include "gn/gnSourceFactory.h"
00016 #include "gn/gnFASSource.h"
00017 #include "gn/gnGBKSource.h"
00018 #include "gn/gnBaseHeader.h"
00019 #include "gn/gnFilter.h"
00020 #include "gn/gnDebug.h"
00021 #include "gn/gnStringTools.h"
00022 #include <string>
00023
00024 gnDNXSource::gnDNXSource()
00025 {
00026 m_DNXSpec = new gnGenomeSpec();
00027 m_pFilter = gnFilter::fullDNASeqFilter();
00028 if(m_pFilter == NULL){
00029 DebugMsg("Error using static sequence filters.");
00030 }
00031 }
00032
00033 gnDNXSource::gnDNXSource( const gnDNXSource& s ) : gnFileSource(s)
00034 {
00035 if(s.m_DNXSpec != NULL)
00036 m_DNXSpec = s.m_DNXSpec->Clone();
00037 }
00038
00039 gnDNXSource::~gnDNXSource()
00040 {
00041 m_ifstream.close();
00042 delete m_DNXSpec;
00043 }
00044 boolean gnDNXSource::HasContig( const string& name ) const
00045 {
00046 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00047 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00048 return true;
00049 }
00050 return false;
00051 }
00052 uint32 gnDNXSource::GetContigID( const string& name ) const
00053 {
00054 for(uint32 contigI = 0; contigI < m_DNXSpec->GetSpecListLength(); contigI++){
00055 if(m_DNXSpec->GetSpec(contigI)->GetName() == name)
00056 return contigI;
00057 }
00058 return ALL_CONTIGS;
00059 }
00060 string gnDNXSource::GetContigName( const uint32 i ) const
00061 {
00062 if(i < m_DNXSpec->GetSpecListLength()){
00063 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00064 return gnbs->GetName();
00065 }
00066 return "";
00067 }
00068 gnSeqI gnDNXSource::GetContigSeqLength( const uint32 i ) const
00069 {
00070 if( i == ALL_CONTIGS){
00071 return m_DNXSpec->GetLength();
00072 }else if(i < m_DNXSpec->GetSpecListLength()){
00073 gnBaseSpec *gnbs = m_DNXSpec->GetSpec(i);
00074 return gnbs->GetLength();
00075 }
00076 return 0;
00077 }
00078
00079
00080 void gnDNXSource::ValidateName(string& name){
00081 if(name == ""){
00082 name.resize(4);
00083 srand(time(NULL));
00084 for(int i=0; i < 4; i++)
00085 name[i] = (rand() % 26) + 64;
00086 }
00087 }
00088
00089 boolean gnDNXSource::Write(gnGenomeSpec* spec, const string& filename){
00090 ofstream m_ofstream(filename.c_str(), ios::out | ios::binary);
00091 gnSourceFactory* m_sSourceFactory = gnSourceFactory::GetSourceFactory();
00092 if(!m_ofstream.is_open())
00093 return false;
00094 for(uint32 i=0; i < spec->GetSpecListLength(); i++){
00095 gnFragmentSpec* curStatementSpec = spec->GetSpec(i);
00096 string sourceName = spec->GetSourceName();
00097 string statementName = spec->GetName();
00098 if(!m_sSourceFactory->HasSource(sourceName)){
00099 ValidateName(statementName);
00100 statementName += ".seq";
00101 m_ofstream << statementName << "=";
00102 }else
00103 m_ofstream << sourceName << "=";
00104 for(uint32 j=0; j < curStatementSpec->GetSpecListLength(); j++){
00105
00106 gnContigSpec* curSubSpec = curStatementSpec->GetSpec(i);
00107 sourceName = curStatementSpec->GetSourceName();
00108 string contigName = curStatementSpec->GetName();
00109 if(!m_sSourceFactory->HasSource(sourceName)){
00110 ValidateName(contigName);
00111 string writename = contigName+".seq";
00112 gnSequence gns = *curSubSpec;
00113 gnGBKSource::Write(gns, writename);
00114 m_ofstream << writename;
00115 }else
00116 m_ofstream << sourceName;
00117 if( j + 1 < curStatementSpec->GetSpecListLength())
00118 m_ofstream << "+";
00119 }
00120 m_ofstream << ";";
00121 gnBaseHeader *gpbh = spec->GetHeader(0);
00122 string header = "";
00123 if(gpbh != NULL){
00124 header = gpbh->GetHeader();
00125
00126 uint32 newlinepos = header.find_first_of('\n', 0);
00127 if(newlinepos != string::npos)
00128 header = header.substr(0, newlinepos - 1);
00129 }
00130 m_ofstream << header << "\r\n";
00131 }
00132 m_ofstream.close();
00133 return true;
00134 }
00135
00136 gnFileContig* gnDNXSource::GetFileContig( const uint32 contigI ) const{
00137 return NULL;
00138 }
00139
00140
00141 boolean gnDNXSource::ParseStream( istream& fin )
00142 {
00143
00144 uint32 readState = 0;
00145 uint32 sectionStart = 0;
00146 gnFragmentSpec* currentFragSpec = 0;
00147 gnBaseSource *currentSource;
00148 string currentSourceName;
00149 uint32 currentContig = ALL_CONTIGS;
00150 uint32 currentSeqStart = 0;
00151 boolean currentRevComp = false;
00152
00153 uint64 bufReadLen = 0;
00154 uint64 remainingBuffer = 0;
00155 Array<char> array_buf( BUFFER_SIZE );
00156 char* buf = array_buf.data;
00157 string curliteral;
00158
00159
00160 gnSourceFactory *sourceFactory = gnSourceFactory::GetSourceFactory();
00161 sourceFactory->AddPath(getPathString(m_openString));
00162
00163 while( !fin.eof() )
00164 {
00165 if(sectionStart > 0){
00166 remainingBuffer = bufReadLen - sectionStart;
00167 if(readState == 5){
00168 curliteral += string(buf, sectionStart, remainingBuffer);
00169 remainingBuffer = 0;
00170 sectionStart = bufReadLen;
00171 }else
00172 memmove(buf, buf+sectionStart, remainingBuffer);
00173 }
00174
00175 fin.read( buf + remainingBuffer, BUFFER_SIZE - (bufReadLen - sectionStart));
00176 sectionStart = 0;
00177 bufReadLen = fin.gcount() + remainingBuffer;
00178
00179 for( uint32 i=0 ; i < bufReadLen ; i++ )
00180 {
00181 char ch = buf[i];
00182 switch( readState )
00183 {
00184 case 0:
00185 if(ch == '='){
00186
00187 string contigName(buf+sectionStart, i - sectionStart);
00188 currentFragSpec = new gnFragmentSpec();
00189 currentFragSpec->SetName(contigName);
00190 currentFragSpec->SetSourceName(m_openString);
00191 m_DNXSpec->AddSpec(currentFragSpec);
00192 sectionStart = i+1;
00193 readState = 1;
00194 }
00195 break;
00196 case 1:
00197 if((ch == ' ')||(ch == ' '))
00198 break;
00199 case 2:
00200 if(ch == '"'){
00201 readState = 5;
00202 sectionStart = i+1;
00203 break;
00204 }
00205 readState = 3;
00206 sectionStart = i;
00207 case 3:
00208
00209 if(ch == '\n' && sectionStart == i -1){
00210 if(buf[sectionStart]=='\r'){
00211 sectionStart = i + 1;
00212 break;
00213 }
00214 }
00215 if((ch == '+')||(ch == '>')||(ch == '(')||(ch == '\n')||(ch == ';')){
00216
00217 string seqfile(buf, sectionStart, i - sectionStart);
00218 currentSourceName = seqfile;
00219 currentSource = sourceFactory->AddSource(seqfile, true);
00220 if (currentSource==NULL)
00221 {
00222 return false;
00223 }
00224 if((ch == '+')||(ch == '\n')||(ch == ';')){
00225 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource);
00226 tmp_spec->SetSourceName(seqfile);
00227 currentFragSpec->AddSpec(tmp_spec);
00228 readState = 1;
00229 if(ch == '\n'){
00230 readState = 0;
00231 }else if(ch == ';'){
00232 readState = 9;
00233 }
00234 }else if(ch == '>'){
00235 readState = 4;
00236 }else if(ch == '('){
00237 readState = 6;
00238 }
00239 sectionStart = i + 1;
00240 }
00241 break;
00242 case 4:
00243
00244 if((ch == '+')||(ch == '\n')||(ch == ';')||(ch == '(')){
00245
00246 string contigname(buf, sectionStart, i - sectionStart);
00247 currentContig = currentSource->GetContigID(contigname);
00248 if((ch == '+')||(ch == '\n')||(ch == ';')){
00249 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig);
00250 tmp_spec->SetSourceName(currentSourceName);
00251 currentFragSpec->AddSpec(tmp_spec);
00252 readState = 1;
00253 if(ch == '\n'){
00254 readState = 0;
00255 }else if(ch == ';'){
00256 readState = 9;
00257 }
00258 }else if(ch == '('){
00259 readState = 6;
00260 }
00261 sectionStart = i + 1;
00262 }
00263 break;
00264 case 5:
00265
00266 if(ch == '"'){
00267
00268 string literal(buf, sectionStart, i - sectionStart);
00269 if(curliteral.length() > 0){
00270 literal += curliteral;
00271 curliteral = "";
00272 }
00273 gnStringSpec *gpss = new gnStringSpec(literal, currentFragSpec->GetSpecListLength());
00274 currentFragSpec->AddSpec(gpss);
00275 }
00276 case 6:
00277
00278 if((ch == ',') || (ch == '<') || (ch == '>')){
00279 string seqstartstring(buf, sectionStart, i - sectionStart);
00280 if(seqstartstring == "lend"){
00281 currentSeqStart = 0;
00282 }else if (seqstartstring == "rend"){
00283 currentSeqStart = GNSEQI_END;
00284 }else
00285 currentSeqStart = atoi(seqstartstring.c_str()) - 1;
00286 if(ch == '<')
00287 currentRevComp = true;
00288 sectionStart = i + 1;
00289 readState = 7;
00290 }
00291 break;
00292 case 7:
00293
00294 if(ch == ')'){
00295 string seqendstring(buf, sectionStart, i - sectionStart);
00296 uint32 currentSeqEnd = GNSEQI_END;
00297 if(seqendstring == "lend"){
00298 currentSeqEnd = 0;
00299 }else if (seqendstring == "rend"){
00300 currentSeqEnd = GNSEQI_END;
00301 }else
00302 currentSeqEnd = atoi(seqendstring.c_str()) - 1;
00303 gnSourceSpec* tmp_spec = new gnSourceSpec(currentSource, currentContig, currentSeqStart, currentSeqEnd, currentRevComp);
00304 tmp_spec->SetSourceName(currentSourceName);
00305 currentFragSpec->AddSpec(tmp_spec);
00306 currentRevComp = false;
00307 sectionStart = i + 1;
00308 readState = 8;
00309 }
00310 break;
00311 case 8:
00312 if(ch == '+'){
00313 sectionStart = i + 1;
00314 readState = 1;
00315 }
00316 if(ch == '\n'){
00317 sectionStart = i + 1;
00318 readState = 0;
00319 }
00320 if(ch == ';'){
00321 sectionStart = i + 1;
00322 readState = 9;
00323 }
00324 break;
00325 case 9:
00326 if(ch == '\n'){
00327 sectionStart = i + 1;
00328 readState = 0;
00329 }
00330 break;
00331 default:
00332 DebugMsg("ERROR in file\n");
00333 return false;
00334 break;
00335 }
00336 }
00337 }
00338
00339 return true;
00340 }