00001 #include "gn/gnSequence.h"
00002 #include "gn/gnGBKSource.h"
00003 #include "gn/gnFASSource.h"
00004 #include <algorithm>
00005
00006 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b);
00007
00008 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){
00009 return a.GetStart() < b.GetStart();
00010 }
00011
00012 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b);
00013
00014 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){
00015 return a.GetEnd() < b.GetStart();
00016 }
00017
00018 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b);
00019
00020 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){
00021 return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart();
00022 }
00023
00024
00025
00026 void ComplementLocation(vector<gnLocation>& l_list){
00027 if(l_list.size() < 2)
00028 return;
00029
00030 vector<gnLocation> comp_list;
00031
00032 sort(l_list.begin(), l_list.end(), &LocationLessthan);
00033
00034 for(uint32 locationI = 1; locationI < l_list.size(); locationI++){
00035
00036
00037 if( !l_list[locationI].Intersects( l_list[locationI-1] ) ){
00038
00039
00040 gnLocation between_loc;
00041 between_loc.SetStart(l_list[locationI-1].GetEnd());
00042 between_loc.SetEnd(l_list[locationI].GetStart());
00043
00044 comp_list.push_back(between_loc);
00045 }
00046
00047 }
00048
00049 l_list = comp_list;
00050 }
00051
00052 void DeleteRepeats(list<gnLocation>& loc_list){
00053
00054 loc_list.sort(&LocationLessthan);
00055
00056 list<gnLocation>::iterator loc_iter = loc_list.begin();
00057 list<gnLocation>::iterator cur_loc;
00058 for(; loc_iter != loc_list.end(); loc_iter++){
00059 cur_loc = loc_iter;
00060 cur_loc++;
00061 while(cur_loc != loc_list.end() && !LocationEndLessthan(*loc_iter, *cur_loc)){
00062 if(loc_iter->Intersects(*cur_loc)){
00063 *loc_iter = loc_iter->GetUnion(*cur_loc);
00064 list<gnLocation>::iterator to_del = cur_loc;
00065 cur_loc--;
00066 loc_list.erase(to_del);
00067 }
00068 cur_loc++;
00069 }
00070 }
00071 }
00072
00073 void WriteData(gnSequence& file_sequence, list<gnLocation>& loc_list, string& filename){
00074 gnSequence loc_seq;
00075 cout << "Gathering sequence data to write...\n";
00076 list<gnLocation>::iterator loc_iter = loc_list.begin();
00077 uint32 loc_count = loc_list.size();
00078 uint32 notice_interval = loc_count / 10;
00079 uint32 cur_locI = 0;
00080 for(;loc_iter != loc_list.end(); loc_iter++){
00081 loc_seq += file_sequence.ToString( loc_iter->GetEnd() - loc_iter->GetStart(), loc_iter->GetStart());
00082 if(cur_locI % notice_interval == 0){
00083 cout << (cur_locI / loc_count) * 10 << "%..";
00084 }
00085 cur_locI++;
00086 }
00087 cout << "\nWriting sequence data...\n";
00088 gnFASSource::Write(loc_seq, filename);
00089 }
00090
00091 void WriteList(list<gnLocation>& loc_list, string& filename){
00092 ofstream list_out(filename.c_str());
00093 list<gnLocation>::iterator loc_iter = loc_list.begin();
00094 for(;loc_iter != loc_list.end(); loc_iter++){
00095 list_out << loc_iter->GetStart() << '\t' << loc_iter->GetEnd() << '\n';
00096 }
00097 list_out.close();
00098 }
00099
00100 void print_usage(char* pname){
00101 cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <exon_seq> <intron_seq>\n";
00102 }
00103
00104 int main( int argc, char* argv[] )
00105 {
00106 boolean run_interactive = false;
00107 string seq_filename;
00108 string exon_list_filename;
00109 string intron_list_filename;
00110 string exon_seq_filename;
00111 string intron_seq_filename;
00112
00113 gnSequence file_sequence;
00114
00115 if(!run_interactive){
00116
00117 if(argc != 6){
00118 print_usage(argv[0]);
00119 return -1;
00120 }
00121
00122 seq_filename = argv[1];
00123 exon_list_filename = argv[2];
00124 intron_list_filename = argv[3];
00125 exon_seq_filename = argv[4];
00126 intron_seq_filename = argv[5];
00127 }else{
00128
00129
00130 cout << "Enter the name of the sequence file to load:\n" ;
00131 cin >> seq_filename;
00132 }
00133
00134
00135 if(file_sequence.LoadSource(seq_filename))
00136 {
00137 cout << seq_filename << " loaded successfully. " << file_sequence.length() << " base pairs.\n";
00138 }else{
00139 cout << "Error loading file.\n";
00140 return -1;
00141 }
00142
00143 uint32 feature_count = file_sequence.getFeatureListLength();
00144 uint32 cds_count = 0;
00145
00146
00147 gnSequence exon_seq;
00148 gnSequence intron_seq;
00149
00150 list<gnLocation> exon_list;
00151 list<gnLocation> intron_list;
00152
00153 cout << "There are " << feature_count << " known features.\n";
00154 for(uint32 featureI = 0; featureI < feature_count; featureI++){
00155 gnBaseFeature* cur_feat = file_sequence.getFeature(featureI);
00156 if(cur_feat == NULL){
00157 cout << "cur_feat is NULL, trace me!\n";
00158 cur_feat = file_sequence.getFeature(featureI);
00159 continue;
00160 }
00161 if(cur_feat->GetName() == "CDS"){
00162 cds_count++;
00163 uint32 loc_count = cur_feat->GetLocationListLength();
00164 vector<gnLocation> cur_exon_list, cur_intron_list;
00165
00166 for(uint32 locationI = 0; locationI < loc_count; locationI++){
00167 cur_exon_list.push_back(cur_feat->GetLocation(locationI));
00168 exon_list.push_back(cur_exon_list[locationI]);
00169 }
00170 if(loc_count >= 2){
00171 cur_intron_list = cur_exon_list;
00172 ComplementLocation(cur_intron_list);
00173 for(uint32 locationI = 0; locationI < cur_intron_list.size(); locationI++)
00174 intron_list.push_back(cur_intron_list[locationI]);
00175 }
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189 delete cur_feat;
00190 }
00191 }
00192
00193 cout << "There are " << cds_count << " cds features.\n";
00194
00195
00196 uint32 exon_count = exon_list.size();
00197 cout << "Deleting exon repeats... ";
00198 DeleteRepeats(exon_list);
00199 cout << "Eliminated " << exon_count - exon_list.size() << " repeats for a total of ";
00200 cout << exon_list.size() << " repeats\n";
00201 uint32 intron_count = intron_list.size();
00202 cout << "Deleting intron repeats... ";
00203 DeleteRepeats(intron_list);
00204 cout << "Eliminated " << intron_count - intron_list.size() << " repeats for a total of ";
00205 cout << intron_list.size() << " repeats\n";
00206 WriteList(exon_list, exon_list_filename);
00207 WriteList(intron_list, intron_list_filename);
00208 WriteData(file_sequence, exon_list, exon_seq_filename);
00209 WriteData(file_sequence, intron_list, intron_seq_filename);
00210 if(run_interactive)
00211 cin >> seq_filename;
00212 return 0;
00213 };