#!/usr/bin/python
#
# Convert fasta motif files to my PSSM format:
#
#
#	>Stb1
#	rAArAAAAArDCmrsrAAA
# 
# becomes
#
#	rAArAAAAArDCmrsrAAA  -1  0.5     0.0     0.5     0.0     1.0     0.0     0.0     0.0     1.0     0.0     0.0      0.0     0.5     0.0     0.5     0.0     1.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0      1.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.5      0.0     0.5     0.0     0.333333333333  0.0     0.333333333333  0.333333333333  0.0     1.0      0.0      0.0     0.5     0.5     0.0     0.0     0.5     0.0     0.5     0.0     0.0     0.5     0.5      0.0      0.5     0.0     0.5     0.0     1.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0     1.0      0.0     0.0     0.0
#
# which is the consensus, a dummy E-value (-1), then, for each character of the consensus, the probabilities of A, C, G and T (all tab-separated).
#

import sys,os,string;

# Field separator written between the values of an output row.
TAB = "\t"
# Nucleotides, in the order their probabilities appear for each position.
ALPHABET = "ACGT"

#                  A  C  G  T
#----------------+-----------
#IUPAC[ 0] = 'X' | 0  0  0  0    ('X' also means 'N', but I'll use it for 0000 and 'N' for 1111)
#IUPAC[ 1] = 'T' | 0  0  0  1
#IUPAC[ 2] = 'G' | 0  0  1  0
#IUPAC[ 3] = 'K' | 0  0  1  1
#IUPAC[ 4] = 'C' | 0  1  0  0
#IUPAC[ 5] = 'Y' | 0  1  0  1
#IUPAC[ 6] = 'S' | 0  1  1  0
#IUPAC[ 7] = 'B' | 0  1  1  1
#IUPAC[ 8] = 'A' | 1  0  0  0
#IUPAC[ 9] = 'W' | 1  0  0  1
#IUPAC[10] = 'R' | 1  0  1  0
#IUPAC[11] = 'D' | 1  0  1  1
#IUPAC[12] = 'M' | 1  1  0  0
#IUPAC[13] = 'H' | 1  1  0  1
#IUPAC[14] = 'V' | 1  1  1  0
#IUPAC[15] = 'N' | 1  1  1  1

# The code at index i allows the bases flagged by the bits of i (8=A, 4=C, 2=G, 1=T).
IUPAC = "XTGKCYSBAWRDMHVN"

def sum(L):
	"""Return the total of the items in L, starting from the integer 0.

	An empty L gives 0.  Defined at module level, so it replaces the
	built-in of the same name inside this module.
	"""
	total = 0
	for item in L:
		total = total + item
	return total

def scale(L, factor):
	"""Return a new list holding every item of L multiplied by factor.

	L -- iterable of numbers; it is not modified.
	factor -- multiplier applied to each item.

	The result is built in a single pass, so the cost stays linear in the
	number of items.
	"""
	return [n * factor for n in L]

def normalize(vector):
	"""Return a new list with every entry of vector divided by their sum.

	Raises ZeroDivisionError when the entries sum to zero.
	"""
	total = sum(vector)
	reciprocal = 1.0 / total
	return scale(vector, reciprocal)

def indexofIUPAC(character):
	"""Return the position of character in the IUPAC table, or -1 if absent.

	The comparison is exact (case-sensitive) against each single code.
	"""
	for position, code in enumerate(IUPAC):
		if code == character:
			return position
	return -1

def gen_pssm(consensus):
	"""Build a position-specific scoring matrix from an IUPAC consensus.

	consensus -- string of IUPAC nucleotide codes; each character is looked
	             up case-sensitively in the IUPAC table.

	Returns a list with one entry per character.  Each entry is a list of
	len(ALPHABET) probabilities in ALPHABET order; the bases a code allows
	share the probability equally.

	'X' (no base allowed) yields an all-zero column: normalizing it would
	divide by zero.  A character that is not in the IUPAC table is treated
	like 'N' and yields a uniform column.
	"""
	width = len(ALPHABET)
	all_bases = (1 << width) - 1	# mask with the bit of every base set
	pssm = []
	for character in consensus:
		index = indexofIUPAC(character)
		if index < 0:
			# indexofIUPAC reports "not found" as -1.  Used directly as a
			# mask, -1 has every bit set; spell that out instead of relying
			# on the two's-complement accident.
			index = all_bases
		# Bit (width - 1 - i) of the index flags ALPHABET[i]: 8=A, 4=C, 2=G, 1=T.
		pd = [0.0] * width
		for i in range(width):
			if index & (1 << (width - 1 - i)):
				pd[i] = 1.0
		if index:
			# Only a non-empty base set can be turned into probabilities.
			pd = normalize(pd)
		pssm.append(pd)
	return pssm

def read_fasta(input, error=sys.stderr):
	"""Read one two-line FASTA record: a header line, then a sequence line.

	input -- file-like object providing readline().
	error -- file-like object that receives a diagnostic when the header
	         line does not start with '>'; the record is still returned.

	Returns (1, header, sequence) with the line endings removed, or
	(0, None, None) at end of input.  Blank lines in front of the header
	are skipped.  A header that is the last line of the input yields an
	empty sequence.
	"""
	header = input.readline()
	while header in ('\n', '\r\n'):
		header = input.readline()
	if header == '':
		return (0, None, None)
	# Strip only line terminators: slicing off the last character would eat
	# real data when the final line has no trailing newline, and would leave
	# the '\r' of a CRLF file in place.
	header = header.rstrip('\r\n')
	if not header.startswith('>'):
		error.write("Expected '>' in '" + header + "'\n")
	# readline() returns '' at end of input, so a missing sequence line
	# becomes an empty sequence rather than an indexing error.
	sequence = input.readline().rstrip('\r\n')
	return (1, header, sequence)

def main():
	"""Convert FASTA motif records into PSSM rows.

	Reads from the file named by sys.argv[1] when one is given, otherwise
	from standard input.  For every record, writes one tab-separated row
	to standard output: the sequence as read, the dummy E-value -1, then
	the probabilities for every position of the sequence.
	"""
	output = sys.stdout
	from_file = len(sys.argv) > 1
	if from_file:
		input = open(sys.argv[1])
	else:
		input = sys.stdin

	try:
		(more, header, sequence) = read_fasta(input)
		while more:
			# str.upper() instead of string.upper(): the module-level
			# function exists only in Python 2.
			pssm = gen_pssm(sequence.upper())
			fields = [sequence, "-1"]	# -1 is the placeholder E-value
			for column in pssm:
				for probability in column:
					fields.append(str(probability))
			output.write(TAB.join(fields) + "\n")
			(more, header, sequence) = read_fasta(input)
	finally:
		# Close only what this function opened, even if conversion fails.
		if from_file:
			input.close()


# Run the conversion as soon as the file is executed.
main()

