/*

	genomic.h

	various "genetic/genomic" constants and utilities


*/ 

#ifndef GENOMIC_H
#define GENOMIC_H

#define GENOMIC_DEBUG 0
#if GENOMIC_DEBUG
#include <assert.h>
#endif

#include <math.h>
#include <string>
#include <vector>
using namespace std;

/*
                  T  G  C  A
----------------+-----------
IUPAC[ 0] = 'X' | 0  0  0  0    ('X' also means 'N', but I'll use it for 0000 and 'N' for 1111)
IUPAC[ 1] = 'A' | 0  0  0  1
IUPAC[ 2] = 'C' | 0  0  1  0
IUPAC[ 3] = 'M' | 0  0  1  1
IUPAC[ 4] = 'G' | 0  1  0  0
IUPAC[ 5] = 'R' | 0  1  0  1
IUPAC[ 6] = 'S' | 0  1  1  0
IUPAC[ 7] = 'V' | 0  1  1  1
IUPAC[ 8] = 'T' | 1  0  0  0
IUPAC[ 9] = 'W' | 1  0  0  1
IUPAC[10] = 'Y' | 1  0  1  0
IUPAC[11] = 'H' | 1  0  1  1
IUPAC[12] = 'K' | 1  1  0  0
IUPAC[13] = 'D' | 1  1  0  1
IUPAC[14] = 'B' | 1  1  1  0
IUPAC[15] = 'N' | 1  1  1  1
*/

/** Nucleotide alphabets; a letter's position in the string is its DNA id. */
const std::string DNA_ALPHABET = "ACGT";
const std::string dna_alphabet = "acgt";	// lower-case version

/** Numeric ids of the four nucleotides, in alphabet order. */
const unsigned int DNA_A = 0;
const unsigned int DNA_C = 1;
const unsigned int DNA_G = 2;
const unsigned int DNA_T = 3;
/** Number of letters in the DNA alphabet. */
const unsigned int DNA_ABLEN = 4;

/**
 * IUPAC ambiguity codes; a letter's position in the string is its id, i.e.
 * the 4-bit (T G C A) mask listed in the table above.
 */
const std::string IUPAC_ALPHABET = "XACMGRSVTWYHKDBN";
const std::string iupac_alphabet = "xacmgrsvtwyhkdbn";	// lower-case version
/** Number of letters in the IUPAC alphabet. */
const unsigned int IUPAC_ABLEN = 16;
/** Mask covering the four nucleotide bits of an IUPAC id. */
const unsigned int IUPAC_BITS = 0xF;	//	00...001111

/**
 * Translate a nucleotide character to its numeric DNA id.
 * Upper- and lower-case letters are treated alike; any character other
 * than A, C, G or T yields DNA_ABLEN.
 */
inline unsigned int dna_id(char dna) {
	switch (dna) {
		case 'a': case 'A': return DNA_A;
		case 'c': case 'C': return DNA_C;
		case 'g': case 'G': return DNA_G;
		case 't': case 'T': return DNA_T;
		default:            return DNA_ABLEN;	// not a nucleotide letter
	}
}

/**
 * Translate a nucleotide character to the id of its complementary base
 * (A <-> T, C <-> G).
 * Upper- and lower-case letters are treated alike; any character other
 * than A, C, G or T yields DNA_ABLEN.
 */
inline unsigned int complement_dna_id(char dna) {
	switch (dna) {
		case 'a': case 'A': return DNA_T;
		case 'c': case 'C': return DNA_G;
		case 'g': case 'G': return DNA_C;
		case 't': case 'T': return DNA_A;
		default:            return DNA_ABLEN;	// not a nucleotide letter
	}
}

/**
 * Return the id of the base complementary to the one identified by dna_id
 * (A <-> T, C <-> G).
 * Any value that is not one of the four nucleotide ids yields DNA_ABLEN.
 */
inline unsigned int complement_id(unsigned int dna_id) {
	switch (dna_id) {
		case DNA_A: return DNA_T;
		case DNA_C: return DNA_G;
		case DNA_G: return DNA_C;
		case DNA_T: return DNA_A;
		default:    return DNA_ABLEN;	// not a nucleotide id
	}
}


/**
 * Translate an IUPAC code character to its numeric id, i.e. its position
 * in iupac_alphabet.
 * Upper-case ASCII letters are folded to lower case before the lookup, so
 * both cases are accepted. Any character not in the IUPAC alphabet yields
 * IUPAC_ABLEN.
 */
inline unsigned int iupac_id(char dna) {
	if ('A' <= dna && dna <= 'Z') {
		dna = static_cast<char>(dna - 'A' + 'a');	// to lower case
	}
	// Keep the result in find()'s own unsigned type and test against npos;
	// narrowing it to int and checking for a negative value is not portable.
	const std::string::size_type pos = iupac_alphabet.find(dna);
	if (pos == std::string::npos) { return IUPAC_ABLEN; }
	return static_cast<unsigned int>(pos);
}


/**
 * Decide whether a sequence uses ONLY DNA characters {ACGT}.
 * Declaration only; the definition is not in the visible part of this header.
 * NOTE(review): whether lower-case letters count as DNA characters, and what
 * an empty sequence yields, depends on that definition -- confirm there.
 */
bool over_dna(const string &sequence);

/** 
 * Return a string that is the DNA/IUPAC complement of the input string.
 *	dna     = the sequence to complement
 *	reverse = reverse the ordering of the sequence (defaults to true)
 * Declaration only; the definition is not in the visible part of this header.
 * NOTE(review): how characters outside the DNA/IUPAC alphabets and letter
 * case are handled depends on that definition -- confirm there.
 */
string revcomp(const string &dna, bool reverse=true);


#endif	// GENOMIC_H
