Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

coordMapper.cpp

Go to the documentation of this file.
00001 #include "gn/gnSequence.h"
00002 #include <iostream>
00003 #include <fstream>
00004 #include <algorithm>
00005 
00006 struct ExMem{
00007         gnSeqI length;
00008         int64 ex_start;
00009         int64 in_start;
00010 
00011 };
00012 
00013 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b);
00014 
00015 static boolean LocationLessthan(const gnLocation& a, const gnLocation& b){
00016         return a.GetStart() < b.GetStart();
00017 }
00018 
00019 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b);
00020 
00021 static boolean LocationEndLessthan(const gnLocation& a, const gnLocation& b){
00022         return a.GetEnd() < b.GetStart();
00023 }
00024 
00025 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b);
00026 
00027 static boolean LocationSizeLessthan(const gnLocation& a, const gnLocation& b){
00028         return a.GetEnd() - a.GetStart() < b.GetEnd() - b.GetStart();
00029 }
00030 
00031 static boolean ExMemLessthan(const ExMem& a, const ExMem& b){
00032         if(a.ex_start == b.ex_start){
00033                 if(a.in_start == b.in_start){
00034                         return a.length < b.length;
00035                 }
00036                 return a.in_start < b.in_start;
00037         }
00038         return a.ex_start < b.ex_start;
00039 }
00040 
00041 void print_usage(char* pname){
00042         cout << "Usage: " << pname << " <genbank file> <exon_list> <intron_list> <mem_list> <regulation_network> <minimum match size>\n";
00043 }
00044 
00045 void load_map_file(string& filename, vector<gnLocation>& loc_list){
00046         ifstream coord_file;
00047         coord_file.open(filename.c_str());
00048         if(!coord_file.is_open()){
00049                 cout << "couldn't open file\n";
00050                 return;
00051         }
00052 
00053         //read all the exon coordinates
00054         while(coord_file.good()){
00055                 gnSeqI start_coord, end_coord;
00056                 coord_file >> start_coord;
00057                 coord_file >> end_coord;
00058                 
00059                 gnLocation new_location(start_coord, end_coord);
00060                 loc_list.push_back(new_location);
00061         }
00062 }
00063 
00064 void map_coordinates(vector<gnLocation>& loc_list, vector<gnLocation>& map_list){
00065         gnSeqI curStart = 1;
00066         for(uint32 locationI = 0; locationI < loc_list.size(); locationI++){
00067                 gnSeqI cur_len = loc_list[locationI].GetEnd() - loc_list[locationI].GetStart() + 1;
00068                 gnLocation map_location(curStart, curStart+cur_len-1);
00069                 map_list.push_back(map_location);
00070                 curStart += cur_len;
00071         }
00072 }
00073 
00074 void map_list_coordinates(list<gnLocation>& loc_list, list<gnLocation>& map_list){
00075         gnSeqI curStart = 1;
00076         list<gnLocation>::iterator loc_iter = loc_list.begin();
00077         for(; loc_iter != loc_list.end(); loc_iter++){
00078                 gnSeqI cur_len = loc_iter->GetEnd() - loc_iter->GetStart() + 1;
00079                 gnLocation map_location(curStart, curStart+cur_len-1);
00080                 map_list.push_back(map_location);
00081                 curStart += cur_len;
00082         }
00083 }
00084 
00085 void print_feature(ostream& os, gnBaseFeature* cur_feat){
00086         os << cur_feat->GetName() << ": \n";
00087         for(uint32 i=0; i < cur_feat->GetQualifierListLength(); i++)
00088                 os << cur_feat->GetQualifierName(i) << "\t" << cur_feat->GetQualifierValue(i) << "\n";
00089         os << "\n";
00090 
00091 }
00092 
00093 uint32 loc_binary_search(vector<gnLocation>& loc_list, uint32 startI, uint32 endI, gnLocation& query_loc){
00094         uint32 midI = ((endI - startI) / 2) + startI;
00095         if(startI == endI)
00096                 return endI;
00097 
00098         if(loc_list[midI].Intersects(query_loc)){
00099                 return midI;
00100         }else if(loc_list[midI].GetStart() < query_loc.GetStart())
00101                 return loc_binary_search(loc_list, midI + 1, endI, query_loc);
00102         else
00103                 return loc_binary_search(loc_list, startI, midI, query_loc);
00104 }
00105 
00106 int main(int argc, char* argv[]){
00107         
00108         boolean run_interactive = false;
00109         string seq_filename;
00110         string exon_list_filename;
00111         string intron_list_filename;
00112         string mem_list_filename;
00113         string reg_net_filename;
00114         vector<gnLocation> exon_list;
00115         vector<gnLocation> intron_list;
00116         vector<gnLocation> exon_map_list;
00117         vector<gnLocation> intron_map_list;
00118         vector<ExMem> mem_list;
00119         uint32 minimum_match_size;
00120 
00121         // check for correct calling semantics
00122         if(argc != 7){
00123                 print_usage(argv[0]);
00124                 return -1;
00125         }
00126 
00127         seq_filename = argv[1];
00128         exon_list_filename = argv[2];
00129         intron_list_filename = argv[3];
00130         mem_list_filename = argv[4];
00131         reg_net_filename = argv[5];
00132         minimum_match_size = atoi(argv[6]);
00133         
00134         ifstream mem_file(mem_list_filename.c_str());
00135         if(!mem_file.is_open()){
00136                 cout << "Error opening " << mem_list_filename << "\n";
00137                 return -1;
00138         }
00139 
00140         if(run_interactive){
00141                 cout << "Give the name of the exon list to search\n";
00142                 cin >> exon_list_filename;
00143                 cout << "Give the name of the intron list to search\n";
00144                 cin >> intron_list_filename;
00145                 cout << "Give the name of the regulatory network to output\n";
00146                 cin >> reg_net_filename;
00147 
00148         }
00149         ofstream net_file(reg_net_filename.c_str());
00150         
00151         if(!net_file.is_open()){
00152                 cout << "Error opening regulatory network file: " << reg_net_filename << "\n";
00153                 return -2;
00154         }
00155         
00156         load_map_file(exon_list_filename, exon_list);
00157         load_map_file(intron_list_filename, intron_list);
00158         
00159         cout << exon_list.size() << " unique exons loaded from file\n";
00160         cout << intron_list.size() << " unique introns loaded from file\n";
00161         
00162         //now load the genbank file
00163         gnSequence seq_file;
00164         if(run_interactive){
00165                 cout << "Enter the name of the genbank sequence file you are using\n";
00166                 cin >> seq_filename;
00167         }
00168         if(!seq_file.LoadSource(seq_filename)){
00169                 cout << "Error loading file\n";
00170                 return -1;
00171         }
00172         cout << "Sequence loaded successfully, " << seq_file.length() << " base pairs.\n";
00173         
00174         //construct a mapping between coordinates...
00175         map_coordinates(exon_list, exon_map_list);
00176         map_coordinates(intron_list, intron_map_list);
00177         
00178         //now read the mem file
00179         while(mem_file.good()){
00180                 ExMem m;
00181                 mem_file >> m.length;
00182                 mem_file >> m.ex_start;
00183                 mem_file >> m.in_start;
00184                 if(m.length >= minimum_match_size)
00185                         mem_list.push_back(m);
00186         }
00187         cout << mem_list.size() << " matches loaded.\n";
00188         //sort the mem list
00189         sort(mem_list.begin(), mem_list.end(), &ExMemLessthan);
00190         
00191         //now get the intersection for each mem in the list...
00192         uint32 exonmapI = 0;
00193         uint32 notify_percent = 10;
00194         uint32 notify_interval = mem_list.size() / notify_percent;
00195         uint32 percent_count = 0;
00196         cout << "Searching for complementary matches:\n";
00197         for(uint32 memI = 0; memI < mem_list.size(); memI++){
00198                 if(memI % notify_interval == 0){
00199                         cout << percent_count << "%.. ";
00200                         percent_count += notify_percent;
00201                 }
00202                 //simple linear search for intersecting exon mapping
00203                 gnLocation ex_map_loc(mem_list[memI].ex_start, mem_list[memI].ex_start + mem_list[memI].length - 1);
00204                 for(; exonmapI < exon_map_list.size(); exonmapI++){
00205                         if(exon_map_list[exonmapI].Intersects(ex_map_loc))
00206                                 break;
00207                 }
00208                 
00209                 //continue to search for any other mappings that intersect
00210                 uint32 mapEnd = exonmapI;
00211                 for(; mapEnd < exon_map_list.size(); mapEnd++){
00212                         if(!exon_map_list[exonmapI].Intersects(ex_map_loc))
00213                                 break;
00214                 }
00215                 mapEnd--;
00216                 
00217                 uint32 intronmapI = 0; // intronmapI will contain the index of the first intersecting intron
00218                 //do a binary search for intersecting intron mappings
00219                 int64 cur_in_start = mem_list[memI].in_start;
00220                 if(cur_in_start < 0)
00221                         cur_in_start = -cur_in_start;
00222                 gnLocation in_map_loc(cur_in_start, cur_in_start + mem_list[memI].length - 1);
00223                 uint32 search_mapI = loc_binary_search(intron_map_list, 0, intron_map_list.size()-1, in_map_loc);
00224                 intronmapI = search_mapI;
00225 
00226                 //search backwards for previous intersections
00227                 for(; intronmapI >= 0; intronmapI--){
00228                         if(!intron_map_list[intronmapI].Intersects(in_map_loc)){
00229                                 intronmapI++;
00230                                 break;
00231                         }
00232                         if(intronmapI == 0)
00233                                 break;
00234                 }
00235                 //continue to search for any other mappings that intersect
00236                 uint32 intron_mapEnd = search_mapI;
00237                 for(; intron_mapEnd < intron_map_list.size(); intron_mapEnd++){
00238                         if(!intron_map_list[intronmapI].Intersects(in_map_loc))
00239                                 break;
00240                 }
00241                 intron_mapEnd--;
00242                 
00243                 //we have the mappings, now map the coordinates
00244                 vector<uint32> ex_feat_index, in_feat_index;
00245                 vector<gnBaseFeature*> ex_feat_list;
00246                 vector<gnBaseFeature*> in_feat_list;
00247                 gnSeqI cur_match_len = mem_list[memI].length;
00248                 
00249                 //find out how much of the first exon was matched
00250                 //extra exon start has the number of unmatched bases at the beginning of the exon
00251                 gnSeqI extra_exon_start = mem_list[memI].ex_start - exon_map_list[exonmapI].GetStart();
00252                 gnSeqI cur_exon_len = exon_map_list[exonmapI].GetEnd() - exon_map_list[exonmapI].GetStart() + 1;
00253                 gnSeqI max_exon_chunk = cur_exon_len - extra_exon_start;
00254                 gnSeqI cur_exon_chunk = max_exon_chunk < cur_match_len ? max_exon_chunk : cur_match_len;
00255                 
00256                 //find out how much of the first intron was matched
00257                 gnSeqI extra_intron_start, cur_intron_len, max_intron_chunk, cur_intron_chunk;
00258                 boolean complement = false;
00259                 if(mem_list[memI].in_start > 0){
00260                         extra_intron_start = mem_list[memI].in_start - intron_map_list[intronmapI].GetStart();
00261                         cur_intron_len = intron_map_list[intronmapI].GetEnd() - intron_map_list[intronmapI].GetStart() + 1;
00262                         max_intron_chunk = cur_intron_len - extra_intron_start;
00263                         cur_intron_chunk = max_intron_chunk < mem_list[memI].length ? max_intron_chunk : mem_list[memI].length;
00264                 }else{
00265                         //reverse complement, start at the end.
00266                         if(cur_in_start >= intron_map_list[intron_mapEnd].GetStart())
00267                                 cur_intron_chunk = cur_match_len;
00268                         else
00269                                 cur_intron_chunk = cur_in_start + cur_match_len - intron_map_list[intron_mapEnd].GetStart();
00270                         complement = true;
00271                         seq_file.getIntersectingFeatures(intron_list[intronmapI], in_feat_list, in_feat_index);
00272                 }
00273                 
00274                 //the current chunk will be the smaller of the two mappings
00275                 gnSeqI cur_chunk = cur_intron_chunk < cur_exon_chunk ? cur_intron_chunk : cur_exon_chunk;
00276 
00277                 gnLocation cur_exon_loc(exon_list[exonmapI].GetStart() + extra_exon_start, exon_list[exonmapI].GetStart() + cur_chunk);
00278                 seq_file.getIntersectingFeatures(cur_exon_loc, ex_feat_list, ex_feat_index);
00279                 if(mem_list[memI].in_start > 0){
00280                         gnLocation cur_intron_loc(intron_list[intronmapI].GetStart() - 1, intron_list[intronmapI].GetEnd() + 1);
00281                         seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00282                 }else{
00283                         gnLocation cur_intron_loc(intron_list[intron_mapEnd].GetStart() - 1, intron_list[intron_mapEnd].GetEnd() + 1);
00284                         seq_file.getIntersectingFeatures(cur_intron_loc, in_feat_list, in_feat_index);
00285                 }
00286                 vector<gnBaseFeature*> ex_forward;
00287                 vector<gnBaseFeature*> ex_reverse;
00288                 vector<gnBaseFeature*> in_forward;
00289                 vector<gnBaseFeature*> in_reverse;
00290 
00291                 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++){
00292                         string featName = ex_feat_list[featI]->GetName();
00293                         if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
00294                                 if(ex_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00295                                         ex_reverse.push_back(ex_feat_list[featI]);
00296                                 else
00297                                         ex_forward.push_back(ex_feat_list[featI]);
00298                 }
00299                 for(uint32 featI = 0; featI < in_feat_list.size(); featI++){
00300                         string featName = in_feat_list[featI]->GetName();
00301                         if(featName == "mRNA" || featName == "CDS" || featName == "gene" )
00302                                 if(in_feat_list[featI]->GetLocationType() == gnLocation::LT_Complement)
00303                                         in_reverse.push_back(in_feat_list[featI]);
00304                                 else
00305                                         in_forward.push_back(in_feat_list[featI]);
00306                 }
00307 
00308                 if(complement){
00309                         vector<gnBaseFeature*> tmp_in = in_forward;
00310                         in_forward = in_reverse;
00311                         in_reverse = tmp_in;
00312                 }
00313 
00314                 //now print out all the complementary features
00315                 if((ex_forward.size() > 0 && in_reverse.size() > 0) || (in_forward.size() > 0 && ex_reverse.size() > 0)){
00316                         net_file << "================================\n";
00317                         net_file << "Mem: " << mem_list[memI].length << "\n";
00318                         net_file << "This exon/intron matching size: " << cur_chunk << "\n";
00319                 }
00320                 if(ex_forward.size() > 0 && in_reverse.size() > 0){
00321                         net_file << "Forward Exons:\n";
00322                         for(uint32 featI = 0; featI < ex_forward.size(); featI++)
00323                                 print_feature(net_file, ex_forward[featI]);
00324                         net_file << "Matching introns:\n";
00325                         for(uint32 featI = 0; featI < in_reverse.size(); featI++)
00326                                 print_feature(net_file, in_reverse[featI]);
00327                 }
00328                 if(in_forward.size() > 0 && ex_reverse.size() > 0){
00329                         net_file << "Reverse Exons:\n";
00330                         for(uint32 featI = 0; featI < ex_reverse.size(); featI++)
00331                                 print_feature(net_file, ex_reverse[featI]);
00332                         net_file << "Matching introns:\n";
00333                         for(uint32 featI = 0; featI < in_forward.size(); featI++)
00334                                 print_feature(net_file, in_forward[featI]);
00335                 }
00336                 
00337                 //release memory
00338                 for(uint32 featI = 0; featI < ex_feat_list.size(); featI++)
00339                         delete ex_feat_list[featI];
00340                 for(uint32 featI = 0; featI < in_feat_list.size(); featI++)
00341                         delete in_feat_list[featI];
00342                 
00343                 //loop while there is stuff to match
00344 //              while(cur_match_len > 0){
00345                 
00346 //              }
00347         }
00348 }
00349 

Generated at Thu Apr 4 01:52:01 2002 for libGenome by doxygen1.2.8.1 written by Dimitri van Heesch, © 1997-2001