#!/usr/bin/python
#
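# Convert output from MEME program to a new format (one motif per line):
#
#  IUPAC-consensus E-value PSSM[position][character]
#
#  PSSM probabilities are written position by position (row-major order), each
#  position followed by a TAB; the alphabet is always "ACGT" and is not written.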
#
#

import sys,os,string;

USAGE = "usage:  " + sys.argv[0] + " meme-output-file";

INFINITY = float("99e+939");
ALPHABET = 'ACGT';
MAX_E_VALUE = INFINITY;	#IMPORTANT PARAM:  Don't output ANY motif with E-value greater (TODO:  make it an option)
#MIN_NT_REP = 0.0001;
MIN_NT_REP  = 0.2000;	# Minimum nucleotide probability necessary to be represented in the consensus

true = TRUE = 1;
false = FALSE = 0;

TAB = '\t';

#                  A  C  G  T
#----------------+-----------
#IUPAC[ 0] = 'X' | 0  0  0  0    ('X' also means 'N', but I'll use it for 0000 and 'N' for 1111)
#IUPAC[ 1] = 'T' | 0  0  0  1
#IUPAC[ 2] = 'G' | 0  0  1  0
#IUPAC[ 3] = 'K' | 0  0  1  1
#IUPAC[ 4] = 'C' | 0  1  0  0
#IUPAC[ 5] = 'Y' | 0  1  0  1
#IUPAC[ 6] = 'S' | 0  1  1  0
#IUPAC[ 7] = 'B' | 0  1  1  1
#IUPAC[ 8] = 'A' | 1  0  0  0
#IUPAC[ 9] = 'W' | 1  0  0  1
#IUPAC[10] = 'R' | 1  0  1  0
#IUPAC[11] = 'D' | 1  0  1  1
#IUPAC[12] = 'M' | 1  1  0  0
#IUPAC[13] = 'H' | 1  1  0  1
#IUPAC[14] = 'V' | 1  1  1  0
#IUPAC[15] = 'N' | 1  1  1  1

IUPAC = "XTGKCYSBAWRDMHVN";
IUPAC = "xTGkCysbAwrdmhvn";


# read one motif's worth of a MEME output file.
#	return (found motif?, consensus string, number-of-sites, E-Value, PSSM[site][character] -> probability)
def readmeme(instream=sys.stdin):
	
	line = "What, me worry?";
	while (line != ""):

		line = instream.readline();
		s = string.split(line);

		if (len(s) == 0): continue;
		
		# look for magic words:  "Motif 1 position-specific probability matrix"
		if (s[0] == "Motif" and string.join(s[2:]) == "position-specific probability matrix"):

			line = instream.readline();	# --------------------------------------------------------------------------------
			line = instream.readline(); # letter-probability matrix: alength= 4 w= 46 nsites= 5 E= 5.8e+001

			#sys.stderr.write(line + "\n" + str(string.split(line)) + "\n\n");

			a = int(string.split(line)[3]);
			w = int(string.split(line)[5]);
			n = int(string.split(line)[7]);
			E = float(string.split(line)[9]);

			if (a != len(ALPHABET)):
				sys.stderr.write("Motif " + str(s[1]) + ":  alphabet length != " + str(len(ALPHABET)) + "\n");

			# elements are linked this way:  
			#	PSSM = [len(ALPHABET)*[0.0]]*w;	#	[[0,0,0,0], [0,0,0,0], ... , [0,0,0,0]]
			PSSM = [];
			for p in range(0,w):
				PSSM = PSSM + [[0.0]*len(ALPHABET)];

			# read in PSSM
			#	ASSUME letters are in same order as ALPHABET
			for p in range(0,w):
				line = instream.readline();
				sline = string.split(line);
				for c in range(0,len(ALPHABET)):
					PSSM[p][c] = float(sline[c]);

			# compute consensus
			consensus = '';
			for p in range(0,w):
				iupac = 0;
				if (PSSM[p][0] >= MIN_NT_REP):  iupac += 8;		# 'A' is in position 0, most sig. bit
				if (PSSM[p][1] >= MIN_NT_REP):  iupac += 4;
				if (PSSM[p][2] >= MIN_NT_REP):  iupac += 2;
				if (PSSM[p][3] >= MIN_NT_REP):  iupac += 1; 	# 'T' is in position 3, least sig. bit
				consensus = consensus + IUPAC[iupac];
				
			# normalize
			for p in range(0,w):
				sum = 0.0;
				for c in range(0,len(ALPHABET)):
					sum = sum + PSSM[p][c];
				for c in range(0,len(ALPHABET)):
					PSSM[p][c] = PSSM[p][c] / sum;

			# add any pseducount here, when the number of sites is known.
			#for p in range(0,w):
			#	sum = 0.0;
			#	for c in range(0,len(ALPHABET)):
			#		PSSM[p][c] = PSSM[p][c] + pseudocount;
			#		sum = sum + PSSM[p][c];
			#	for c in range(0,len(ALPHABET)):
			#		PSSM[p][c] = PSSM[p][c] / sum;

			return (true, consensus, E, PSSM);

	# no magic line found
	return (false, "", -1, [[0,0,0,0]]);

def meme2pssm(input):
	(good, consensus, e_value, pssm) = readmeme(input);
	while (good):
		
		if (e_value <= MAX_E_VALUE):
			sys.stdout.write(consensus + ' ' + str(e_value));
			for site in pssm:
				for probability in site:
					sys.stdout.write(' ' + str(probability));
				sys.stdout.write(TAB);
			sys.stdout.write('\n');
		(good, consensus, e_value, pssm) = readmeme(input);

def main():

	if (len(sys.argv) > 1):	
		# use filename(s)
		for filename in sys.argv[1:]:
			meme2pssm(open(filename));
	else:
		meme2pssm(sys.stdin);

main();














