#!usr/bin/perl -w

# code to perform the covarion test.
# argument= a file name.
# The file should contain info about the data matrix, tree, clades,
#     the type of test to be done, the characters to exlude...
# Results will be located in a directory named "results" if not
#     otherwise specified (in the input file)
#     output: - a conclusion file
#             - the estimated tree, in case estimation has been performed.
#
# example of use:
# perl covarionTest.pl inputfile
# perl covarionTest.pl < inputfile
#
# that script calls the following other programs:
#   paup
#   resolvetree.pl for resolving trees with 0-length branches
#   seq-gen
#   oneTestStatistic (C program) for computing the value of the test
#                                statistic W on one data set.
#
# example of input file (or standard input):
#
# excludeChar=3 NBoot=10 ras=1 treeMethod=par treeFile=MLtree.nex
# matrixFile=myMatrix_nexus_or_phylip_format
# partition=Amborella,Calycanthus, Marchantia,Physcomitrella ,Anthoceros,Psilotum
# model=REV ncat=4 directory=myDirName
#
#
# - matrixFile: must be in phylip or nexus format.
# - treeMethod can be "parsimony" (or "par"), "likelihood" (or "like")
#   If "par", the MP tree is computed and used or bootstrapping.
#   If "like", it will be the ML tree. In all other cases, the
#   tree is to be read from a file.
# - the tree in treeFile must be in Nexus format.
# - model can be one of "HKY" "F84" or "REV". default=HKY. Indicates what
#   substitution model will be used for both estimation of parameters
#   and simulation.
# - ncat: integer value. default=4. Number of gamma rate categories, used
#   both in estimation and simulation.
# - nboot= number of bootstrap replicates. default=1000.
# - ras: type of test. 0: homogeneity test. 1: RAS vs RAS+COV test.
#   default=1.
# - directory: name of directory for output. default="results"
# - partition: list of one clade's taxon names, separated by commas.
# - ecludeChar: if =3 then 3d positions are ecluded. if =12 then 1st
#   and 2d positions are ecluded. Otherwise, all positions are included.

# the test statistic is performed on sites with no gaps only. Bootstrap
# replicates have a number of characters equal to the number of no-gaps
# characters of the matrix.

# Ooops! I'm not sure how ambiguous characters are treated. I need to look
# into the C program "oneTestStatistic".


# - I haven't checked yet that interleave is ok.
# - model=JC not permitted. Should I add another option like "nst=1"
#   and/or "basefrequencies=equal" to allow for simpler models?


# ---------------------------- random seed -----------------------
$seed = time ^ $$ ^ unpack "%L*", `ps axww | gzip`;
srand($seed);
# ----------------------------------------------------------------



$SeqGen = "./seq-gen";
$Paup = "paup";
$oneStat = "./oneTestStatistic";
$resolvetree= "resolvetree.pl";

$excludeChar=0; #0: no character excluded. 3: 3d positions excluded
                #12: 1st and 2d positions excluded
$NBoot = 1000;
$ras = 1; # 1 for the RAS vs RAS+COV test (left-sided test),
          # 0 for the homogeneity test (right-sided test).
$model="HKY"; # could also be F84 or REV.
$ncat = 4;    # number of gamma rate categories
$directory="results/";

# ----------------- Read standard input -------------------------------
while (<>){
  if (/\bexcludeChar\s*=\s*(\d+)/i){ $excludeChar =$1;}
  if (/\bNBoot\s*=\s*(\d+)/i){$NBoot =$1;}
  if (/\bras\s*=\s*(\d)/i){$ras=$1;}
  if (/\btreeMethod\s*=\s*(\w+)/i){$speed =$1;}
  if (/\bmodel\s*=\s*(\w+)/i){$model =$1;}
  if (/\bncat\s*=\s*(\d+)/i){$ncat =$1;}
  if (/\bmatrixFile\s*=\s*([^\s]+)/i){$DataMatrix =$1;}
  if (/\btreeFile\s*=\s*([^\s]+)/i){$ModelTreeNex =$1;}
  if (/\bpartition\s*=((\s*\w+\s*,)+\s*\w+)/i){$partition =$1;}
  if (/\bdirectory\s*=\s*(\w+)/i){$directory ="$1/";}
}
# ------------------------------------------------------------------

if ($excludeChar eq 12){print "first and second characters excluded\n";}
elsif ($excludeChar eq 3){print "third characters excluded\n";}
else {print "all characters included\n";}

if ($NBoot > 1){ print "$NBoot replicates will be simulated for parametric bootstrapping\n";}
else {die "bad value for NBoot: $NBoot\n";}

if ($ras){ print "test with rate variation: RAS vs RAS+COV\n";}
else{ print "homogeneity test\n";}

if ($speed && $speed =~ /^par/i){
  $speed="par";
  print "MP (maximum parsimony) tree will be computed and used for bootstrapping\n";}
elsif ($speed && $speed =~ /^like/i){
  $speed="like";
  print "ML (maximum likelihood) tree will be computed and used for bootstrapping\n";}
else{
  $speed="fast";
  die "I can't open tree file $ModelTreeNex\n" if (!(-r $ModelTreeNex));
  print "Tree read from file $ModelTreeNex\n";
}
if (!(-d $directory)) { mkdir $directory; }
else {die "ooops! I don't want to overwrite directory $directory\n";}

if ($DataMatrix){
  die "I can't open Data matrix file $DataMatrix\n" if (!(-r $DataMatrix));
  print "Data matrix read from file $DataMatrix (phylip or nexus format)\n";
} else{ die "no entry for matrixFile\n";}
if ($partition){
  @clade=split(/[(\s*,\s*)]/,$partition);
  foreach (@clade){ # this loop removes extra beginning spaces and empty names
    s/^\s*//;
    if (!($_ eq "")){ push @cladeA, $_;}
  }
  print "Partition defined by the group: @cladeA\n";
} else{ die "no entry for partition\n";}

if (($model eq "HKY")||($model eq "F84")||($model eq "REV")){
  print "Model for estimation and simulation: $model\n";
} else{ die "bad model of substitution: $model\n";}

if ($ncat > 1){ print "gamma rate distribution with $ncat categories\n";}
else{ die "bad number of gamma rate categories: $ncat\n";}

# ---------- read data matrix ----------------------------------------
# ---------- build data structure for the partition ------------------
open FH1, "<$DataMatrix";
while (<FH1>){
  chomp;
  if ($_){
    if (/^#nexus/i){
	$format="nexus";
    } else{ $format="phylip";}
    last;
  }
}
close FH1;
if ($format eq "phylip"){
  open FH1, "<$DataMatrix";
  while (<FH1>) {  # get nTax, NChar and taxon names
    @separatelines = split(/\r/,$_);
    foreach (@separatelines){
      if (/^\s*\d+\s+\d+\s*$/){
	($nTaxa,$NChar)=split;
      }
      elsif (/[ACGT]/) {
	$name=(split)[0];
	push @taxonNames, $name;
      }
    }
  }
  close FH1;
} elsif ($format eq "nexus"){
  open FH1, "<$DataMatrix";
  my $start;
  my $ntax;
  while (<FH1>) {
    @separatelines = split(/\r/,$_);
    foreach (@separatelines){
      if (/dimensions/i){
	if (/ntax\s*=\s*(\d+)/i){$nTaxa= $1;}
	if (/nchar\s*=\s*(\d+)/i){$NChar=$1;}
      }
      if (/end;/i){$start=0;}
      if ($start){
	if (/[ACGT]/) {
	  $ntax++;
	  if ($ntax<=$nTaxa){
	    push @taxonNames, (split)[0];
	  }	
	}
      }
      if (/matrix/i){$start=1;}
    }
  }
  close FH1;
}
$nTax=@taxonNames;
if (!($nTaxa eq $nTax)){ die "problem in reading the data matrix, on the number of taxa\n";}
foreach $tax (@taxonNames){
  my $notfound=1;
  foreach (@cladeA){
    if ($tax eq $_){ $notfound=0; last;}
  }
  if ($notfound){ push @cladeNA, $tax;}
}
$NtaxA = @cladeA;
$NtaxNA= @cladeNA;
if (($NtaxA<2)||($NtaxNA<2)){ die "each partition should contain at least 2 taxa\n";}
# -----------------------------------------------------------------------

$conclusionFile = $directory . "conclusion";
$tempDir = $directory."temp/";
if (!(-d $tempDir)) { mkdir $tempDir;}

$DataMatrixPhy = $tempDir . "DataMatrix.phy";
$DataMatrixNex = $tempDir . "DataMatrix.nex";
$PauplogFile = $tempDir . "Paup.log";
$EstimTreePhy = $tempDir . "PaupTree.altphy";
$EstimTreeNex = $tempDir . "PaupTree.nex";
$EstimTreeAltNex = $tempDir . "PaupTree.altnex";
$SGtreeFile= $tempDir . "SGtree.phy";
$newOriginalMatrixPhy = $tempDir . "originalDataMatrix.phy";
$CladeDefinitionFile = $tempDir . "CladeInfo.txt";

open FH4, ">$conclusionFile";
print FH4 "Covarion test:\t".($ras?"RAS vs RAS+COV\n":"homogeneous vs heterogeneous\n");
print FH4 "\nData matrix:\t$DataMatrix\n";
print FH4 "'clade' 1 has $NtaxA taxa: @cladeA\n";
print FH4 "'clade' 2 has $NtaxNA taxa: @cladeNA\n";
print FH4 "Number of characters:\t$NChar\n";
print FH4 "excluded positions:\t";
if ($excludeChar eq 12){ print FH4 "1st and 2d\n";}
elsif ($excludeChar eq 3){print FH4 "3d\n";}
else {print FH4 "none\n";}
close FH4;


# ----------------- write a new phylip data file --------------------------
# ----------------- excluding characters if necessary ---------------------

if ($excludeChar eq 12){ $NChar=int($NChar/3);}
elsif ($excludeChar eq 3) {$rest=$NChar%3; $NChar=int($NChar/3)*2+$rest;}


open FH1, "<$DataMatrix";
my $start;
if ($format eq "phylip"){$start=1;}
while (<FH1>) {
  @separatelines = split(/\r/,$_);
  foreach (@separatelines){
    if (/end;/i){$start=0;}
    if ($start){
      if (/[ACGT]/) {
	($name,$nucleotides)=split;
	while ($nucleotides){
	  $i= ++$i{$name};
	  $nuc=substr($nucleotides,0,1,"");
	  if ($excludeChar eq 3){ $condition = ($i%3?1:0);}
	  elsif ($excludeChar eq 12){ $condition = ($i%3?0:1);}
	  else{ $condition = 1;}
	  if ($condition){
	    $sequence{$name} .= $nuc;
	  }
	}
      }
    }
    if (/matrix/i){$start=1;}
  }
}
close FH1;


open FH2, ">$newOriginalMatrixPhy";
print FH2 "$nTaxa $NChar\n";
foreach (@taxonNames){ print FH2 "$_ $sequence{$_}\n";}
close FH2;

$DataMatrix=$newOriginalMatrixPhy;



# ---------- compute the test statistic on the original dataset -----------
# -------------------------------------------------------------------------
open FH1, ">$CladeDefinitionFile";
print FH1 "cladeA @cladeA\n";
print FH1 "cladeNA @cladeNA\n";
close FH1;


$LTestString = "$oneStat -c $CladeDefinitionFile < $DataMatrix";
$originalTestvalue =  `$LTestString`;
chomp $originalTestvalue;

open FH2, ">>$conclusionFile";
if ($excludeChar){ print FH2 "Number of included characters:\t$NChar\n\n";}
close FH2;


# --------------- Convert the data file into a nexus file ------------------
# --------------------------------------------------------------------------

open FH1, "<$DataMatrix";
open FH2, ">$DataMatrixNex";
print FH2 "#nexus\n";


print FH2 "\nbegin data;\ndimensions ntax=$nTaxa nchar=$NChar;\n";
print FH2 "format datatype=dna gap=-;\nmatrix\n";
while (<FH1>) {
  if (/[ACGT]/) { print FH2;}
}
print FH2 "\n;\nend;\n";
close FH1;

# ------------------ Paup block -------------------------

$paupSeed = int(10000*rand());

if (!($model eq "REV")){
  $PaupModel = "nst=2 basefreq=estimate tratio=estimate variant=$model ";}
else { $PaupModel = "nst=6 basefreq=estimate rmatrix=estimate ";}
if ($ras){$PaupModel .= "rates=gamma shape=estimate ncat=$ncat ";}  # +Gamma

print FH2 "\nbegin paup;\n";
print FH2 "set maxtrees=100 increase=no warntsave=no warnTree=no autoclose=yes status=no;\n";
print FH2 "constraints mycons=((@cladeA),(@cladeNA));\n";
if ($speed eq "par"){
  print FH2 "set criterion= parsimony;\n";
  print FH2 "hsearch swap=tbr rseed=$paupSeed enforce=yes constraints=mycons;\n";
  print FH2 "ConTree / treefile=$EstimTreeNex replace=yes;\n";
  print FH2 "Gettrees file=$EstimTreeNex;\n";
}
elsif ($speed eq "like"){
  print FH2 "set criterion= likelihood;\nlset $PaupModel;\n";
  print FH2 "hsearch swap=tbr enforce=yes constraint=mycons timelimit=1200;\n";
  print FH2 "ConTree / treefile=$EstimTreeNex replace=yes;\n";
  print FH2 "Gettrees file=$EstimTreeNex;\n";
}
else{
  print FH2 "Gettrees file=$ModelTreeNex;\n";
}
print FH2 "set criterion= likelihood ;\n";
print FH2 "lset $PaupModel;\n";
print FH2 "log start file=$PauplogFile replace=yes;\n";
print FH2 "lscore;\n";
print FH2 "lset basefreq=previous ".($ras?" shape=previous ":"");
if ($model eq "REV"){ print FH2 "rmatrix=previous ;\n";}
else{ print FH2 "tratio=previous ;\n";}
print FH2 "exclude gapped; \nlog stop;\n";
print FH2 "savetrees file=$EstimTreeAltNex format=altnexus brlens=yes replace=yes;\n";
print FH2 "quit;\n";
print FH2 "end;\n";
close FH2;


# --- run paup ---------
system( "$Paup $DataMatrixNex" );

# --- store the results (MPtree, parameters, number of no-gap sites,...) -----
if (($speed eq "par")||($speed eq "like")){
  if ($speed eq "par"){$MP="MP";} else{$MP="ML";}
  $MPtreeFile = $directory.$MP."tree.nex";
  system("cp $EstimTreeNex $MPtreeFile");
}

open FH1, "<$PauplogFile";
while (<FH1>){
  if ($start){
    if (/^Shape/){
      $start=0;
      $alpha=(split)[1];
    }
    elsif (/^\s+[ACGT]\s/){
      push @freq, (split)[1];
    }
    elsif (/^\s+[ACG][CGT]\s/){
      push @rmatrix, (split)[1];
    }
    elsif (/^\s*exp\. ratio/){
      $titv=(split)[-1];
    }
  }
  if (/^Base frequencies/){$start=1;}
  if (/Number of included characters/){
    $NChar=(split)[5];
  }
}
close FH1;

open FH1, "<$EstimTreeAltNex";
open FH2, ">$EstimTreePhy";
while (<FH1>){
  if (/tree PAUP_1/){
    my $tree = (split)[4];
    print FH2 "$tree\n";
  }
}


open FH1, ">>$conclusionFile";
if ($speed eq "par"){print FH1 "estimation:\tMP tree\n";}
elsif ($speed eq "like"){print FH1 "estimation:\tlikelihood tree\n";}
else { print FH1 "tree:\tfrom file $ModelTreeNex.\n";}
print FH1 "parameter estimates:\n\tbase frequencies (ACGT)= @freq\n";
if ($model eq "REV"){ print FH1 "\tRmatrix rates= @rmatrix\n";}
else {print FH1 "\ttitv ratio = $titv\n";}
if ($ras){ print FH1 "\tshape (alpha) = $alpha\n";}
print FH1 "Number of characters with no gaps:\t$NChar\n";
print FH1 "Test statistic:\t\t$originalTestvalue\n";
print FH1 "Number of bootstrap replicates:\t$NBoot\n";
close FH1;

# --------------- Resolve the tree with 0-length branches----------------
# -----------------------------------------------------------------------


system("perl $resolvetree $EstimTreePhy > $SGtreeFile");


# ---------------- Bootstrap analysis ------------------------------------
# ------------------------------------------------------------------------

@statvalues = ();
# see if alpha is not infinite ...
if ( ($ras && !($alpha =~ /\d/)) or !(($model eq "REV")or($titv =~ /\d/))){
  die "infinite parameter estimate\n";
}

print STDOUT "Parametric bootstrap starting: $NBoot replicates.\n";
# ----------------------  sequence simulation -------------------
$SGoptions = "-m$model -l $NChar -f @freq ";
if (($model eq "HKY")||($model eq "F84")){
  $SGoptions .= "-t $titv ";
} else {$SGoptions .= "-r @rmatrix ";}

if ($ras){
  $SGoptions .= "-a $alpha -g $ncat ";
}
$SGoptions .= "-q -or ";
$SGfiles = " < $SGtreeFile > $DataMatrixPhy";

foreach (1..$NBoot){
  print  STDOUT ".";
  $SGseed = int(100000*rand());
  $SGnewOptions = $SGoptions." -z $SGseed ";
  system("$SeqGen $SGnewOptions $SGfiles ");

  # ---- get the test statistic on that bootstrap dataset ---
  $statString = "$oneStat -c $CladeDefinitionFile < $DataMatrixPhy";
  $statvalue = `$statString`;
  chomp $statvalue;
  push @statvalues, $statvalue;
}
print STDOUT "\n";

# ------------ compute p-value -------------------------------
# ------------------------------------------------------------
$pvalue=0;
foreach (@statvalues){
  if ($ras?($_<=$originalTestvalue):($_>=$originalTestvalue)){
    $pvalue++;
  }
}
$pvalue /= @statvalues;
if ($pvalue eq 0){
  $minp = 1/$NBoot;
  $pvalue = "<$minp";
}
open FH1, ">>$conclusionFile";
print FH1 "p-value:\t$pvalue\n\n";
close FH1;
print "p-value is $pvalue\n\n";

# ------------ file management ------------------
unlink glob "$tempDir*";
rmdir $tempDir;
