#!/usr/bin/perl -w # This perl script performs step 1 of the MDL analysis of a # set of concatenated loci (genes). This analysis finds # a. The parsimony score (as well as 1 MP tree) of each # group of loci. To this end, a large paup file is # created and Paup is called once from the script. # With n loci, there are 2^n - 1 non-empty groups of loci. # b. Then it evaluates all partitions of the whole data set # into a maximum of NgroupMax groups of loci. For each # partition the MP score is calculated. A partition # consisting of 3 groups, g1, g2 and g3 has score # score(g1)+score(g2)+score(g3). These scores have been # calculated in (a). # The output consists of 3 files. One file lists all groups # and another lists all partitions with a maximum of NgroupMax # groups. They indicate their scores and other info (Nchar, # Ntax, Ngroup). The third file gives a list of MP trees, one # for each group of genes. # The partition file will later need to be processed in R in # order to calculate the Description Length (DL) of each # partition and then to pinpoint the partition with the best # (minimum) DL. That will be step 2 (and explains the name of # this script). R functions are in "MDL.r". $doParsSearch = 0; $doNNI = 0; $NgroupMax = 4; $excludegaps = 1; #1=yes, 0=no. $gap_as_character = 1; if ($excludegaps){ $gap_as_character = 0;} $dataFile = "nDNA.genomes"; #$dataFile = "MDLrun.nex"; $paupFileName = "paupallgroups.nex"; $scoreFile = "scores"; $treeFile = "trees.nex"; if ($excludegaps){ $extension = "_nogap";} elsif ($gap_as_character){ $extension = "_gapAsChar";} else { $extension = "_gapNotaChar";} $alltreeFile = "alltrees$extension.nex"; $partitionScoreFile = "scores_partitions$extension"; $groupScoreFile = "scores_groups$extension"; $nniFile = "nni_dist"; #----- Read in the data file to find the gene names ------# open FHi, "<$dataFile"; while (){ if (/charset\s+(\w+)\s*=\s*/i){ if (!(($1 eq "5S")||($1 eq "ADHC"))){ push @genes, $1;} } if (/ntax\s*=\s*(\d+)/i){ $Ntax = $1; } } close FHi; $Ngenes = @genes; print "genes are @genes.\nThere are $Ntax taxa.\n"; #----- build the groups ----------------------# $Ngroup = (1 << $Ngenes) -1; # this is 2^$Ngenes-1. print "Ngroup is $Ngroup\n"; # group with index i (i in 1..$Ngroup) has gene j if for (1..$Ngroup){ $i=$_; $group=""; for (@genes){ $group .= $i % 2; $i = $i>>1; } push @groups, $group; } print "groups are @groups\n"; if($doParsSearch){ #--------- Make the paup file ---------------# unlink($scoreFile); unlink($treeFile); open FHo, ">$paupFileName"; print FHo "#NEXUS\n\nset warnroot=no warntree=no "; print FHo "increase=no maxtrees=$Ngroup;\n"; print FHo "execute $dataFile;\n\n"; print FHo "begin paup;\n"; if ($gap_as_character){ print FHo "pset gapmode=newstate;\n"; } foreach (@groups){ $group = $_; # if (substr($group,1,1) eq "1") {last;} # for testing purposes print FHo "include "; foreach (@genes){ if (substr($group,0,1,"") eq "1"){ print FHo "$_ "; } } print FHo "/ only;\n"; if($excludegaps){ print FHo "exclude missambig;\n";} print FHo "hsearch collapse=no;\n"; print FHo "Pscores 1 / scorefile=$scoreFile append=yes;\n"; print FHo "savetrees from=1 to=1 file=$treeFile format=altnexus append=yes;\n"; print FHo "\n"; } print FHo "gettrees file=$treeFile allblocks=yes;\n"; print FHo "savetrees file=$alltreeFile format=altnexus replace=yes;\n"; print FHo "quit;\nEND; [paup]"; close FHo; #-------- run the paup file -----------# system("paup paupallgroups.nex"); } # end of: if($doParsSearch) #------- get the results on individual groups ----# open FHi, "<$scoreFile"; $groupindex=0; while (){ if (/\b1\s+(\d+)/){ $group = $groups[$groupindex]; $scores{$group}=$1; $groupindex++; } } close FHi; # What follows get the number of character for each group. $groupindex=0; open FHi, "<$treeFile"; while (){ if (/Of the remaining\s+(\d+)/i){ $group = $groups[$groupindex]; $Nchar{$group}= $1; $groupindex++; } } close FHi; $maxNchar = $Nchar{"1"x$Ngenes}; # Nchar for the full group: 11...1 print "maxNchar is $maxNchar\n"; if ($doNNI){ open FHi, "<$nniFile"; print "Now I have to build an array of arrays...\n"; close FHi; } #-------- analyze partitions ----------# # The partition 1213 is the partition in # which genes 1 and 3 are in the same group (constrained # to have the same tree) while gene 2 is in a different # group (can have its own tree) and gene 4 is in another # different group (with another possibly different tree). # An upper bound on the number of partitions with a max # of # groups of NgroupMax is NgroupMax **(power) (Ngenes-1) # A better bound is 2*3*...*NgoupMax*NgroupMax... (Ngenes -1 terms) # List of partition: 1 to this upperbound. Partition with index i # is i1 i2 i3 ... iNgenes with i1=1 i2=1+ i mod 2, i2=1+ ... @partitionbase = ($NgroupMax) x ($Ngenes-1); for $i (0..($Ngenes-2)){ if ($partitionbase[$i]> $i+2){ $partitionbase[$i]=$i+2;} } $NpartMax = 1; for (@partitionbase){ $NpartMax *= $_;} for (0..($NpartMax-1)){ $partition = "1"; $partindex = $_; $Ng = 1; $good=1; for (@partitionbase){ $i = $partindex % $_; $j = $i+1; if ($j > $Ng+1){ $good=0; last;} if ($j > $Ng){ $Ng = $j;} $partition .= "$j"; $partindex -= $i; $partindex /= $_; } if ($good){ $partition{$_}=$partition; $Ngroups{$_}=$Ng; # number of groups --trees-- in the forest. } #if ($_ eq 2){ last;} # for testing purposes. } $Npart = keys %partition; print "There are $Npart partitions\n"; open FHo, ">$groupScoreFile"; print FHo "Ntax maxNchar gap_as_character\n"; print FHo "$Ntax $maxNchar $gap_as_character\n"; print FHo "[Data file: $dataFile, genes included are: @genes]\n"; print FHo "[Number of distinct groups: $Ngroup]\n"; if ($excludegaps){ print FHo "[sites with gaps excluded]\n"; } elsif ($gap_as_character) { print FHo "[gapped sites included, gap = extra character]\n"; } else { print FHo "[gapped sites included, gap NOT an extra character]\n"; } print FHo "group MP_score Nchar\n"; foreach (keys %scores) { print FHo "$_ $scores{$_} $Nchar{$_}\n"; } close FHo; open FHo, ">$partitionScoreFile"; print FHo "Ntax maxNchar gap_as_character\n"; print FHo "$Ntax $maxNchar $gap_as_character\n"; print FHo "[Data file: $dataFile, genes included are: @genes]\n"; print FHo "[Number of distinct groups: $Ngroup]\n"; print FHo "[Limit to the number of groups in any partition: $NgroupMax]\n"; print FHo "[Number of partitions: $Npart]\n"; if ($excludegaps){ print FHo "[sites with gaps excluded]\n"; } elsif ($gap_as_character) { print FHo "[gapped sites included, gap = extra character]\n"; } else { print FHo "[gapped sites included, gap NOT an extra character]\n"; } print FHo "partition MP_score Ngroups\n"; for $ind (keys %partition){ $totalscore = 0; # this is the sum of the groups' scores. for $g (1..$Ngroups{$ind}){ # I need to see what genes are in group $g # and get the score of this group. $group = ""; $partition = $partition{$ind}; # needs to be re-initialized for each group. foreach (1..$Ngenes){ $a=substr($partition,0,1,""); if ($a eq $g){ $group .= "1"; } else {$group .="0";} } $totalscore += $scores{$group}; } print FHo "$partition{$ind} $totalscore $Ngroups{$ind}\n"; }