#!/usr/bin/perl -w

# This perl script performs step 1 of the MDL analysis of a
# set of concatenated loci (genes). This analysis finds
# a. The parsimony score (as well as 1 MP tree) of each
#    group of loci. To this end, a large paup file is
#    created and Paup is called once from the script.
#    With n loci, there are 2^n - 1 non-empty groups of loci.
# b. Then it evaluates all partitions of the whole data set
#    into a maximum of NgroupMax groups of loci. For each
#    partition the MP score is calculated. A partition 
#    consisting of 3 groups, g1, g2 and g3 has score
#    score(g1)+score(g2)+score(g3). These scores have been
#    calculated in (a).
# The output consists of 3 files. One file lists all groups
# and  another lists all partitions with a maximum of NgroupMax
# groups. They indicate their scores and other info (Nchar,
# Ntax, Ngroup). The third file gives a list of MP trees, one
# for each group of genes.
# The partition file will later need to be processed in R in
# order to calculate the Description Length (DL) of each
# partition and then to pinpoint the partition with the best
# (minimum) DL. That will be step 2 (and explains the name of
# this script). R functions are in "MDL.r".

$doParsSearch = 0;
$doNNI = 0;
$NgroupMax = 4;
$excludegaps = 1;  #1=yes, 0=no.
$gap_as_character = 1;
if ($excludegaps){ $gap_as_character = 0;}

$dataFile = "nDNA.genomes";
#$dataFile = "MDLrun.nex";
$paupFileName = "paupallgroups.nex";
$scoreFile = "scores";
$treeFile = "trees.nex";
if ($excludegaps){ $extension = "_nogap";}
elsif ($gap_as_character){ $extension = "_gapAsChar";}
else { $extension = "_gapNotaChar";}
$alltreeFile = "alltrees$extension.nex";
$partitionScoreFile = "scores_partitions$extension";
$groupScoreFile = "scores_groups$extension";
$nniFile = "nni_dist";

#----- Read in the data file to find the gene names ------#
open FHi, "<$dataFile";
while (<FHi>){
  if (/charset\s+(\w+)\s*=\s*/i){
      if (!(($1 eq "5S")||($1 eq "ADHC"))){ push @genes, $1;}
  }
  if (/ntax\s*=\s*(\d+)/i){
      $Ntax = $1;
  }
}
close FHi;

$Ngenes = @genes;
print "genes are @genes.\nThere are $Ntax taxa.\n";

#----- build the groups ----------------------#
$Ngroup  = (1 << $Ngenes) -1;  # this is 2^$Ngenes-1.
print "Ngroup is $Ngroup\n";
# group with index i (i in 1..$Ngroup) has gene j if  
for (1..$Ngroup){
    $i=$_;
    $group="";
    for (@genes){
	$group .= $i % 2;
	$i = $i>>1;  
    }
    push @groups, $group;
}
print "groups are @groups\n";


if($doParsSearch){
#---------  Make the paup file ---------------#
 unlink($scoreFile);
 unlink($treeFile);

 open FHo, ">$paupFileName";
 print FHo "#NEXUS\n\nset warnroot=no warntree=no ";
 print FHo "increase=no maxtrees=$Ngroup;\n";
 print FHo "execute $dataFile;\n\n";
 print FHo "begin paup;\n";
 if ($gap_as_character){
     print FHo "pset gapmode=newstate;\n";
 }

foreach (@groups){
  $group = $_;
  # if (substr($group,1,1) eq "1") {last;}
  # for testing purposes

  print FHo "include ";
  foreach (@genes){
    if (substr($group,0,1,"") eq "1"){
      print FHo "$_ ";
    }
  }
  print FHo "/ only;\n";
  if($excludegaps){ print FHo "exclude missambig;\n";}
  print FHo "hsearch collapse=no;\n";
  print FHo "Pscores 1 / scorefile=$scoreFile append=yes;\n";
  print FHo "savetrees from=1 to=1 file=$treeFile format=altnexus append=yes;\n";
  print FHo "\n";

}

    print FHo "gettrees file=$treeFile allblocks=yes;\n";
    print FHo "savetrees file=$alltreeFile format=altnexus replace=yes;\n";
print FHo "quit;\nEND; [paup]";
close FHo;

#-------- run the paup file -----------#
system("paup paupallgroups.nex");
} # end of: if($doParsSearch)


#------- get the results on individual groups ----#
open FHi, "<$scoreFile";
$groupindex=0;
while (<FHi>){
    if (/\b1\s+(\d+)/){
	$group = $groups[$groupindex];
	$scores{$group}=$1;
        $groupindex++;
    }
}
close FHi;


# What follows get the number of character for each group. 
$groupindex=0;
open FHi, "<$treeFile";
while (<FHi>){
    if (/Of the remaining\s+(\d+)/i){
	$group = $groups[$groupindex];
	$Nchar{$group}= $1;
        $groupindex++;
    }
}
close FHi;
$maxNchar = $Nchar{"1"x$Ngenes}; # Nchar for the full group: 11...1
print "maxNchar is $maxNchar\n";

if ($doNNI){
    open FHi, "<$nniFile";
    print "Now I have to build an array of arrays...\n";
    close FHi;

}



#-------- analyze partitions ----------#
# The partition  1213 is the partition in
# which genes 1 and 3 are in the same group (constrained
# to have the same tree) while gene 2 is in a different
# group (can have its own tree) and gene 4 is in another
# different group (with another possibly different tree).

# An upper bound on the number of partitions with a max # of 
# groups of NgroupMax is NgroupMax **(power) (Ngenes-1)
# A better bound is 2*3*...*NgoupMax*NgroupMax... (Ngenes -1 terms)
# List of partition: 1 to this upperbound. Partition with index i
# is i1 i2 i3 ... iNgenes with i1=1 i2=1+ i mod 2, i2=1+ ... 
@partitionbase = ($NgroupMax) x ($Ngenes-1);
for $i (0..($Ngenes-2)){
    if ($partitionbase[$i]> $i+2){ $partitionbase[$i]=$i+2;}
}
$NpartMax = 1;
for (@partitionbase){ $NpartMax *= $_;}

for (0..($NpartMax-1)){
    $partition = "1";
    $partindex = $_;
    $Ng = 1;
    $good=1;
    for (@partitionbase){
	$i = $partindex % $_;
	$j = $i+1;
	if ($j > $Ng+1){ $good=0; last;}
	if ($j > $Ng){ $Ng = $j;}
	$partition .= "$j";
	$partindex -= $i;
	$partindex /= $_;
    }
    if ($good){
	$partition{$_}=$partition;
	$Ngroups{$_}=$Ng;      # number of groups --trees-- in the forest.
    }
    #if ($_ eq 2){ last;}  # for testing purposes.
}

$Npart = keys %partition;
print "There are $Npart partitions\n";

open FHo, ">$groupScoreFile";
print FHo "Ntax maxNchar gap_as_character\n";
print FHo "$Ntax $maxNchar $gap_as_character\n";

print FHo "[Data file: $dataFile, genes included are: @genes]\n";
print FHo "[Number of distinct groups: $Ngroup]\n";
if ($excludegaps){
    print FHo "[sites with gaps excluded]\n";
} elsif ($gap_as_character) {
    print FHo "[gapped sites included, gap = extra character]\n";
} else {
 print FHo "[gapped sites included, gap NOT an extra character]\n";
}
print FHo "group MP_score Nchar\n";
foreach (keys %scores) {
    print FHo "$_ $scores{$_} $Nchar{$_}\n";
}
close FHo;

open FHo, ">$partitionScoreFile";
print FHo "Ntax maxNchar gap_as_character\n";
print FHo "$Ntax $maxNchar $gap_as_character\n";

print FHo "[Data file: $dataFile, genes included are: @genes]\n";
print FHo "[Number of distinct groups: $Ngroup]\n";
print FHo "[Limit to the number of groups in any partition: $NgroupMax]\n";
print FHo "[Number of partitions: $Npart]\n";
if ($excludegaps){
 print FHo "[sites with gaps excluded]\n";
} elsif ($gap_as_character) {
 print FHo "[gapped sites included, gap = extra character]\n";
} else {
 print FHo "[gapped sites included, gap NOT an extra character]\n";
}

print FHo "partition MP_score Ngroups\n";
for $ind (keys %partition){
  $totalscore = 0;  # this is the sum of the groups' scores.
  for $g (1..$Ngroups{$ind}){
    # I need to see what genes are in group $g
    # and get the score of this group.
    $group = "";
    $partition = $partition{$ind};
    # needs to be re-initialized for each group.
    foreach (1..$Ngenes){
	$a=substr($partition,0,1,"");
	if ($a eq $g){
	    $group .= "1";
	} else {$group .="0";}
    }
    $totalscore += $scores{$group};
  }
  print FHo "$partition{$ind} $totalscore $Ngroups{$ind}\n";
}


