00001 #include "gn/gnSequence.h"
00002 #include <iostream>
00003 #include <fstream>
00004 #include <algorithm>
00005
00006 struct ExMem{
00007 gnSeqI length;
00008 int64 ex_start;
00009 int64 in_start;
00010
00011 };
00012
00013 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b);
00014
00015 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){
00016 return a.GetStart() < b.GetStart();
00017 }
00018
00019 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b);
00020
00021 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){
00022 return a.GetEnd() < b.GetStart();
00023 }
00024
00025 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b);
00026
00027 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){
00028 return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart();
00029 }
00030
00031 static boolean ExMemLessthan(const ExMem& a, const ExMem& b){
00032 if(a.ex_start == b.ex_start){
00033 if(a.in_start == b.in_start){
00034 return a.length < b.length;
00035 }
00036 return a.in_start < b.in_start;
00037 }
00038 return a.ex_start < b.ex_start;
00039 }
00040
00041 void print_usage(char* pname){
00042 cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <mem_list> <regulation_network> <minimum match size>\n";
00043 }
00044
00045 void load_map_file(string& filename, vector<gnLocation>& loc_list){
00046 ifstream coord_file;
00047 coord_file.open(filename.c_str());
00048 if(!coord_file.is_open()){
00049 cout << "couldn't open file\n";
00050 return;
00051 }
00052
00053
00054 while(coord_file.good()){
00055 gnSeqI start_coord, end_coord;
00056 coord_file >> start_coord;
00057 coord_file >> end_coord;
00058
00059 gnLocation new_location(start_coord, end_coord);
00060 loc_list.push_back(new_location);
00061 }
00062 }
00063
00064 void map_coordinates(vector<gnLocation>& loc_list, vector<gnLocation>& map_list){
00065 gnSeqI curStart = 1;
00066 for(uint32 locationI = 0; locationI < loc_list.size(); locationI++){
00067 gnSeqI cur_len = loc_list[locationI].GetEnd() - loc_list[locationI].GetStart() + 1;
00068 gnLocation map_location(curStart, curStart+cur_len-1);
00069 map_list.push_back(map_location);
00070 curStart += cur_len;
00071 }
00072 }
00073
00074 void map_list_coordinates(list<gnLocation>& loc_list, list<gnLocation>& map_list){
00075 gnSeqI curStart = 1;
00076 list<gnLocation>::iterator loc_iter = loc_list.begin();
00077 for(; loc_iter != loc_list.end(); loc_iter++){
00078 gnSeqI cur_len = loc_iter->GetEnd() - loc_iter->GetStart() + 1;
00079 gnLocation map_location(curStart, curStart+cur_len-1);
00080 map_list.push_back(map_location);
00081 curStart += cur_len;
00082 }
00083 }
00084
00085 void print_feature(ostream& os, gnBaseFeature* cur_feat){
00086 os << cur_feat->GetName() << ": \n";
00087 for(uint32 i=0; i < cur_feat->GetQualifierListLength(); i++)
00088 os << cur_feat->GetQualifierName(i) << "\t" << cur_feat->GetQualifierValue(i) << "\n";
00089 os << "\n";
00090
00091 }
00092
00093 uint32 loc_binary_search(vector<gnLocation>& loc_list, uint32 startI, uint32 endI, gnLocation& query_loc){
00094 uint32 midI = ((endI - startI) / 2) + startI;
00095 if(startI == endI)
00096 return endI;
00097
00098 if(loc_list[midI].Intersects(query_loc)){
00099 return midI;
00100 }else if(loc_list[midI].GetStart() < query_loc.GetStart())
00101 return loc_binary_search(loc_list, midI + 1, endI, query_loc);
00102 else
00103 return loc_binary_search(loc_list, startI, midI, query_loc);
00104 }
00105
00106 int main(int argc, char* argv[]){
00107
00108 boolean run_interactive = false;
00109 string seq_filename;
00110 string exon_list_filename;
00111 string intron_list_filename;
00112 string mem_list_filename;
00113 string reg_net_filename;
00114 vector<gnLocation> exon_list;
00115 vector<gnLocation> intron_list;
00116 vector<gnLocation> exon_map_list;
00117 vector<gnLocation> intron_map_list;
00118 vector<ExMem> mem_list;
00119 uint32 minimum_match_size;
00120
00121
00122 if(argc != 7){
00123 print_usage(argv[0]);
00124 return -1;
00125 }
00126
00127 seq_filename = argv[1];
00128 exon_list_filename = argv[2];
00129 intron_list_filename = argv[3];
00130 mem_list_filename = argv[4];
00131 reg_net_filename = argv[5];
00132 minimum_match_size = atoi(argv[6]);
00133
00134 ifstream mem_file(mem_list_filename.c_str());
00135 if(!mem_file.is_open()){
00136 cout << "Error opening " << mem_list_filename << "\n";
00137 return -1;
00138 }
00139
00140 if(run_interactive){
00141 cout << "Give the name of the exon list to search\n";
00142 cin >> exon_list_filename;
00143 cout << "Give the name of the intron list to search\n";
00144 cin >> intron_list_filename;
00145 cout << "Give the name of the regulatory network to output\n";
00146 cin >> reg_net_filename;
00147
00148 }
00149 ofstream net_file(reg_net_filename.c_str());
00150
00151 if(!net_file.is_open()){
00152 cout << "Error opening regulatory network file: " << reg_net_filename << "\n";
00153 return -2;
00154 }
00155
00156 load_map_file(exon_list_filename, exon_list);
00157 load_map_file(intron_list_filename, intron_list);
00158
00159 cout << exon_list.size() << " unique exons loaded from file\n";
00160 cout << intron_list.size() << " unique introns loaded from file\n";
00161
00162
00163 gnSequence seq_file;
00164 if(run_interactive){
00165 cout << "Enter the name of the genbank sequence file you are using\n";
00166 cin >> seq_filename;
00167 }
00168 if(!seq_file.LoadSource(seq_filename)){
00169 cout << "Error loading file\n";
00170 return -1;
00171 }
00172 cout << "Sequence loaded successfully, " << seq_file.length() << " base pairs.\n";
00173
00174
00175 map_coordinates(exon_list, exon_map_list);
00176 map_coordinates(intron_list, intron_map_list);
00177
00178
00179 while(mem_file.good()){
00180 ExMem m;
00181 mem_file >> m.length;
00182 mem_file >> m.ex_start;
00183 mem_file >> m.in_start;
00184 if(m.length >= minimum_match_size)
00185 mem_list.push_back(m);
00186 }
00187 cout << mem_list.size() << " matches loaded.\n";
00188
00189 sort(mem_list.begin(), mem_list.end(), &ExMemLessthan);
00190
00191
00192 uint32 exonmapI = 0;
00193 uint32 notify_percent = 10;
00194 uint32 notify_interval = mem_list.size() / notify_percent;
00195 uint32 percent_count = 0;
00196 cout << "Searching for complementary matches:\n";
00197 for(uint32 memI = 0; memI < mem_list.size(); memI++){
00198 if(memI % notify_interval == 0){
00199 cout << percent_count << "%.. ";
00200 percent_count += notify_percent;
00201 }
00202
00203 gnLocation ex_map_loc(mem_list[memI].ex_start, mem_list[memI].ex_start + mem_list[memI].length - 1);
00204 for(; exonmapI < exon_map_list.size(); exonmapI++){
00205 if(exon_map_list[exonmapI].Intersects(ex_map_loc))
00206 break;
00207 }
00208
00209
00210 uint32 mapEnd = exonmapI;
00211 for(; mapEnd < exon_map_list.size(); mapEnd++){
00212 if(!exon_map_list[exonmapI].Intersects(ex_map_loc))
00213 break;
00214 }
00215 mapEnd--;
00216
00217 uint32 intronmapI = 0;
00218
00219 int64 cur_in_start = mem_list[memI].in_start;
00220 if(cur_in_start < 0)
00221 cur_in_start = -cur_in_start;
00222 gnLocation in_map_loc(cur_in_start, cur_in_start + mem_list[memI].length - 1);
00223 uint32 search_mapI = loc_binary_search(intron_map_list, 0, intron_map_list.size()-1, in_map_loc);
00224 intronmapI = search_mapI;
00225
00226
00227 for(; intronmapI >= 0; intronmapI--){
00228 if(!intron_map_list[intronmapI].Intersects(in_map_loc)){
00229 intronmapI++;
00230 break;
00231 }
00232 if(intronmapI == 0)
00233 break;
00234 }
00235
00236 uint32 intron_mapEnd = search_mapI;
00237 for(; intron_mapEnd < intron_map_list.size(); intron_mapEnd++){
00238 if(!intron_map_list[intronmapI].Intersects(in_map_loc))
00239 break;
00240 }
00241 intron_mapEnd--;
00242
00243
00244 vector<uint32> ex_feat_index, in_feat_index;
00245 vector<gnBaseFeature*> ex_feat_list;
00246 vector<gnBaseFeature*> in_feat_list;
00247 gnSeqI cur_match_len = mem_list[memI].length;
00248
00249
00250
00251 gnSeqI extra_exon_start = mem_list[memI].ex_start - exon_map_list[exonmapI].GetStart();
00252 gnSeqI cur_exon_len = exon_map_list[exonmapI].GetEnd() - exon_map_list[exonmapI].GetStart() + 1;
00253 gnSeqI max_exon_chunk = cur_exon_len - extra_exon_start;
00254 gnSeqI cur_exon_chunk = max_exon_chunk < cur_match_len ? max_exon_chunk : cur_match_len;
00255
00256
00257 gnSeqI extra_intron_start, cur_intron_len, max_intron_chunk, cur_intron_chunk;
00258 boolean complement = false;
00259 if(mem_list[memI].in_start > 0){
00260 extra_intron_start = mem_list[memI].in_start - intron_map_list[intronmapI].GetStart();
00261 cur_intron_len = intron_map_list[intronmapI].GetEnd() - intron_map_list[intronmapI].GetStart() + 1;
00262 max_intron_chunk = cur_intron_len - extra_intron_start;
00263 cur_intron_chunk = max_intron_chunk < mem_list[memI].length ? max_intron_chunk : mem_list[memI].length;
00264 }else{
00265
00266 if(cur_in_start >= intron_map_list[intron_mapEnd].GetStart())
00267 cur_intron_chunk = cur_match_len;
00268 else
00269 cur_intron_chunk = cur_in_start + cur_match_len - intron_map_list[intron_mapEnd].GetStart();
00270 complement = true;
00271 seq_file.getIntersectingFeatures(intron_list[intronmapI], in_feat_list, in_feat_index);
00272 }
00273
00274
00275 gnSeqI cur_chunk = cur_intron_chunk < cur_exon_chunk ? cur_intron_chunk : cur_exon_chunk;
00276
00277 gnLocation cur_exon_loc(exon_list[exonmapI].GetStart() + extra_exon_start, exon_list[exonmapI].GetStart() + cur_chunk);
00278 seq_file.getIntersectingFeatures(cur_exon_loc, ex_feat_list, ex_feat_index);
00279 if(mem_list[memI].in_start > 0){
00280 gnLocation cur_intron_loc(intron_list[intronmapI].GetStart() - 1, intron_list[intronmapI].GetEnd() + 1);
00281 seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00282 }else{
00283 gnLocation cur_intron_loc(intron_list[intron_mapEnd].GetStart() - 1, intron_list[intron_mapEnd].GetEnd() + 1);
00284 seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00285 }
00286 vector<gnBaseFeature*> ex_forward;
00287 vector<gnBaseFeature*> ex_reverse;
00288 vector<gnBaseFeature*> in_forward;
00289 vector<gnBaseFeature*> in_reverse;
00290
00291 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++){
00292 string featName = ex_feat_list[featI]->GetName();
00293 if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
00294 if(ex_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00295 ex_reverse.push_back(ex_feat_list[featI]);
00296 else
00297 ex_forward.push_back(ex_feat_list[featI]);
00298 }
00299 for(uint32 featI = 0; featI < in_feat_list.size(); featI++){
00300 string featName = in_feat_list[featI]->GetName();
00301 if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
00302 if(in_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00303 in_reverse.push_back(in_feat_list[featI]);
00304 else
00305 in_forward.push_back(in_feat_list[featI]);
00306 }
00307
00308 if(complement){
00309 vector<gnBaseFeature*> tmp_in = in_forward;
00310 in_forward = in_reverse;
00311 in_reverse = tmp_in;
00312 }
00313
00314
00315 if((ex_forward.size() > 0 && in_reverse.size() > 0) || (in_forward.size() > 0 && ex_reverse.size() > 0)){
00316 net_file << "================================\n";
00317 net_file << "Mem: " << mem_list[memI].length << "\n";
00318 net_file << "This exon/intron matching size: " << cur_chunk << "\n";
00319 }
00320 if(ex_forward.size() > 0 && in_reverse.size() > 0){
00321 net_file << "Forward Exons:\n";
00322 for(uint32 featI = 0; featI < ex_forward.size(); featI++)
00323 print_feature(net_file, ex_forward[featI]);
00324 net_file << "Matching introns:\n";
00325 for(uint32 featI = 0; featI < in_reverse.size(); featI++)
00326 print_feature(net_file, in_reverse[featI]);
00327 }
00328 if(in_forward.size() > 0 && ex_reverse.size() > 0){
00329 net_file << "Reverse Exons:\n";
00330 for(uint32 featI = 0; featI < ex_reverse.size(); featI++)
00331 print_feature(net_file, ex_reverse[featI]);
00332 net_file << "Matching introns:\n";
00333 for(uint32 featI = 0; featI < in_forward.size(); featI++)
00334 print_feature(net_file, in_forward[featI]);
00335 }
00336
00337
00338 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++)
00339 delete ex_feat_list[featI];
00340 for(uint32 featI = 0; featI < in_feat_list.size(); featI++)
00341 delete in_feat_list[featI];
00342
00343
00344
00345
00346
00347 }
00348 }
00349