/*

iupac.h:  fast lookup for consensus sequence matching

(http://doc.bioperl.org/bioperl-live/Bio/Tools/IUPAC.html)

        Extended Dna / Rna alphabet :
        (includes symbols for nucleotide ambiguity)
        ------------------------------------------
        Symbol       Meaning      Nucleic Acid
        ------------------------------------------
         A            A           Adenine
         C            C           Cytosine
         G            G           Guanine
         T            T           Thymine
         U            U           Uracil
         M          A or C
         R          A or G
         W          A or T
         S          C or G
         Y          C or T
         K          G or T
         V        A or C or G
         H        A or C or T
         D        A or G or T
         B        C or G or T
         X      G or A or T or C
         N      G or A or T or C

        IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
          Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.

*/		  

#ifndef IUPAC_H
#define IUPAC_H 0

#include <stdlib.h>

/* Number of rows in IUPAC_MATRIX: one per concrete base A, C, G, T, U plus N. */
static const int DNA_ALPHABET_SIZE = 6;
/* Number of columns in IUPAC_MATRIX: one per symbol in IUPAC_ALPHABET. */
static const int IUPAC_ALPHABET_SIZE = 17;

/* IUPAC symbols in column order of IUPAC_MATRIX; position i is the symbol
   whose index is stored in IUPAC_MAP. */
static const char IUPAC_ALPHABET[] = "ACGTUMRWSYKVHDBXN";

/* Concrete nucleotide letters.
   NOTE(review): this lists 5 letters while DNA_ALPHABET_SIZE is 6 (the matrix
   has an additional N row) -- confirm whether 'N' is meant to be included. */
static const char DNA_ALPHABET[] = "ACGTU";

/*
 * Membership table: IUPAC_MATRIX[row][col] is 1 when the base of `row`
 * is covered by the IUPAC symbol of `col`.
 *   rows    : A, C, G, T, U, N   (the indices produced by DNA_MAP)
 *   columns : IUPAC_ALPHABET order (the indices produced by IUPAC_MAP)
 * As encoded here, the U row is set only in the U column, and the N row
 * is set in every column.
 */
static const bool IUPAC_MATRIX[DNA_ALPHABET_SIZE][IUPAC_ALPHABET_SIZE] = {
/*        A, C, G, T, U, M, R, W, S, Y, K, V, H, D, B, X, N */
/* A */ { 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1 },
/* C */ { 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1 },
/* G */ { 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1 },
/* T */ { 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1 },
/* U */ { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
/* N */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
};

/* For each IUPAC symbol (IUPAC_ALPHABET order), how many concrete bases
   the symbol stands for: 1 for single bases, up to 4 for X and N. */
static const int IUPAC_DNA_COUNTS[] = {
/*  A, C, G, T, U, M, R, W, S, Y, K, V, H, D, B, X, N */
    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4
};

/* Maps a letter offset (letter - 'A') to its row in IUPAC_MATRIX, or -1
   when the letter is not one of A, C, G, T, U, N.
   The table covers all 26 letters so that offsets for V..Z stay inside
   the array instead of reading past its end. */
static const int DNA_MAP[] =
/*  A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q   R   S   T   U   V   W   X   Y   Z */
  { 0, -1,  1, -1, -1, -1,  2, -1, -1, -1, -1, -1, -1,  5, -1, -1, -1, -1, -1,  3,  4, -1, -1, -1, -1, -1 };

/* Maps a letter offset (letter - 'A') to its column in IUPAC_MATRIX, i.e.
   its position in IUPAC_ALPHABET; -1 marks letters that are not IUPAC
   nucleotide symbols (E, F, I, J, L, O, P, Q, Z). */
static const int IUPAC_MAP[] = {
/* column:  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 */
/* symbol:  A,  C,  G,  T,  U,  M,  R,  W,  S,  Y,  K,  V,  H,  D,  B,  X,  N */
/*  A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q   R   S   T   U   V   W   X   Y   Z */
    0, 14,  1, 13, -1, -1,  2, 12, -1, -1, 10, -1,  5, 16, -1, -1, -1,  6,  8,  3,  4, 11,  7, 15,  9, -1
};

/** Offset of 'dna' from the letter 'A', used as the subscript into DNA_MAP. No range check is performed. */
inline int dna_map_index(char dna) { const int first_letter = 'A'; return dna - first_letter; }
/** Offset of 'dna' from the letter 'A', used as the subscript into IUPAC_MAP. No range check is performed. */
inline int iupac_map_index(char dna) { const int first_letter = 'A'; return dna - first_letter; }

/** Return true iff DNA character 'dna' is a character
	THAT CAN BE represented by the IUPAC symbol 'iupac'.

	Both arguments are expected to be upper-case letters: 'dna' one of
	A, C, G, T, U (or N), 'iupac' any symbol of IUPAC_ALPHABET.
	Anything else (non-letters, lower case, letters that have no entry
	in DNA_MAP / IUPAC_MAP) yields false rather than indexing the lookup
	tables out of range.
*/
inline bool iupac_qualify(char dna, char iupac) {
	const int dna_slot = dna_map_index(dna);
	const int iupac_slot = iupac_map_index(iupac);
	const int dna_slots = (int)(sizeof DNA_MAP / sizeof DNA_MAP[0]);
	const int iupac_slots = (int)(sizeof IUPAC_MAP / sizeof IUPAC_MAP[0]);

	// Letter offsets must fall inside the map tables.
	if (dna_slot < 0 || dna_slot >= dna_slots) { return false; }
	if (iupac_slot < 0 || iupac_slot >= iupac_slots) { return false; }

	// A map entry of -1 means "no such base / symbol".
	const int row = DNA_MAP[dna_slot];
	const int col = IUPAC_MAP[iupac_slot];
	if (row < 0 || col < 0) { return false; }

	return IUPAC_MATRIX[row][col];
}

/** Return the IUPAC symbol whose base set is the intersection of the base
	sets of 'x' and 'y' (e.g. M = A|C and R = A|G give A).

	Lower-case input is folded to upper case first. If both symbols are
	equal after folding, that symbol is returned unchanged. Returns '?'
	when either argument is not an IUPAC symbol, or when no single symbol
	encodes the resulting set (e.g. A and C have nothing in common).

	Declared inline because this header may be included from several
	translation units; a plain definition would be emitted in each one
	and clash at link time.
*/
inline char iupac_intersection(char x, char y) {
	if ('a' <= x && x <= 'z') { x = (char)(x - 'a' + 'A'); }	// to upper case
	if ('a' <= y && y <= 'z') { y = (char)(y - 'a' + 'A'); }
	if (x == y) { return x; }

	// Reject characters whose offset falls outside IUPAC_MAP.
	const int map_len = (int)(sizeof IUPAC_MAP / sizeof IUPAC_MAP[0]);
	const int slot_x = iupac_map_index(x);
	const int slot_y = iupac_map_index(y);
	if (slot_x < 0 || slot_x >= map_len || slot_y < 0 || slot_y >= map_len) {
		return '?';
	}

	// Reject letters that have no column in IUPAC_MATRIX (map entry -1).
	const int col_x = IUPAC_MAP[slot_x];
	const int col_y = IUPAC_MAP[slot_y];
	if (col_x < 0 || col_y < 0) { return '?'; }

	// Row-wise AND of the two symbol columns.
	bool pattern[DNA_ALPHABET_SIZE];
	for (int p = 0; p < DNA_ALPHABET_SIZE; p++) {
		pattern[p] = IUPAC_MATRIX[p][col_x] && IUPAC_MATRIX[p][col_y];
	}

	// First symbol whose column equals the combined pattern wins.
	for (int i = 0; i < IUPAC_ALPHABET_SIZE; i++) {
		int p = 0;
		while (p < DNA_ALPHABET_SIZE && pattern[p] == IUPAC_MATRIX[p][i]) { p++; }
		if (p == DNA_ALPHABET_SIZE) { return IUPAC_ALPHABET[i]; }
	}
	return '?';
}
/** Return the IUPAC symbol whose base set is the union of the base sets
	of 'x' and 'y' (e.g. A and G give R).

	Lower-case input is folded to upper case first. If both symbols are
	equal after folding, that symbol is returned unchanged. Returns '?'
	when either argument is not an IUPAC symbol, or when no single symbol
	encodes the resulting set (e.g. A combined with U).

	Declared inline because this header may be included from several
	translation units; a plain definition would be emitted in each one
	and clash at link time.
*/
inline char iupac_union(char x, char y) {
	if ('a' <= x && x <= 'z') { x = (char)(x - 'a' + 'A'); }	// to upper case
	if ('a' <= y && y <= 'z') { y = (char)(y - 'a' + 'A'); }
	if (x == y) { return x; }

	// Reject characters whose offset falls outside IUPAC_MAP.
	const int map_len = (int)(sizeof IUPAC_MAP / sizeof IUPAC_MAP[0]);
	const int slot_x = iupac_map_index(x);
	const int slot_y = iupac_map_index(y);
	if (slot_x < 0 || slot_x >= map_len || slot_y < 0 || slot_y >= map_len) {
		return '?';
	}

	// Reject letters that have no column in IUPAC_MATRIX (map entry -1).
	const int col_x = IUPAC_MAP[slot_x];
	const int col_y = IUPAC_MAP[slot_y];
	if (col_x < 0 || col_y < 0) { return '?'; }

	// Row-wise OR of the two symbol columns.
	bool pattern[DNA_ALPHABET_SIZE];
	for (int p = 0; p < DNA_ALPHABET_SIZE; p++) {
		pattern[p] = IUPAC_MATRIX[p][col_x] || IUPAC_MATRIX[p][col_y];
	}

	// First symbol whose column equals the combined pattern wins.
	for (int i = 0; i < IUPAC_ALPHABET_SIZE; i++) {
		int p = 0;
		while (p < DNA_ALPHABET_SIZE && pattern[p] == IUPAC_MATRIX[p][i]) { p++; }
		if (p == DNA_ALPHABET_SIZE) { return IUPAC_ALPHABET[i]; }
	}
	return '?';
}




#endif


