# Code modified from PrimerProspector library slice_aligned_region.py (development version)

# Imports and definitions
from string import lower, upper
from operator import itemgetter

from cogent import LoadSeqs, DNA
from cogent.core.alphabet import AlphabetError
from cogent.align.align import make_dna_scoring_dict, local_pairwise
from cogent.parse.fasta import MinimalFastaParser
from cogent.core.moltype import IUPAC_DNA_ambiguities

DNA_CODES = ['A', 'C', 'T', 'G', 'R', 'Y', 'M', 'K',
 'W', 'S', 'B', 'D', 'H', 'V', 'N']


# Note that these are all written 5'->3', the reverse primers are reverse complemented for the local alignment

# If one wanted to test different primers, they would be defined here.

# 27f/338r = V2 (also includes V1, but generally just referred to as V2)
# 349f/534r = V3
# 515f/806r = V4
# 967f/1046r = V6
# 1391f/1492r = V9

primer_seqs = {
           '27f':'AGAGTTTGATCMTGGCTCAG',
           '338r':DNA.rc('GCTGCCTCCCGTAGGAGT'),
           '349f':'GYGCASCAGKCGMGAAW',
           '534r':DNA.rc('ATTACCGCGGCTGCTGG'),
           '515f':'GTGCCAGCMGCCGCGGTAA',
           '806r':DNA.rc('GGACTACVSGGGTATCTAAT'),
           '967f':'CAACGCGAAGAACCTTACC',
           '1048r':DNA.rc('CGRCRGCCATGYACCWC'),
           '1391f':'TGYACACACCGCCCGTC',
           '1492r':DNA.rc('GGCTACCTTGTTACGACTT'),
           '1391r':'TGYACACACCGCCCGTC' # Need this rather than forward primer to get proper 3' position of reverse version
           }

reference_aligned_file = '/home/ubuntu/qiime_software/gg_otus-4feb2011-release/rep_set/gg_76_otus_4feb2011_aligned.fasta'


# Begin functions

""" This library was written for the purpose of slicing out a region of a reference sequence set to 
match a target region amplified by a set of primers to improve taxonomic classification as seen in
Werner et al.  It has been modified here to simply return the positions (3' end of the primer) in the
alignment of the given primer pair, as well as the position of simulated short reads of 150, 250, and
400 base pairs.  Note that some of these reads are larger than the size of the amplicon itself, and in
this case the simulated read simply isn't created."""

def match_scorer_ambigs(match=1,
                       mismatch=-1,
                       matches=None):
    """ Alternative scorer factory for sw_align, allows match to ambiguous chars

    It allows for matching to ambiguous characters which is useful for 
     primer/sequence matching. Not sure what should happen with gaps, but they
     shouldn't be passed to this function anyway. Currently a gap will only match
     a gap.

    match and mismatch should both be numbers. Typically, match should be 
    positive and mismatch should be negative.

    Resulting function has signature f(x,y) -> number.
    
    match: score for nucleotide match
    mismatch: score for nucleotide mismatch
    matches: dictionary for matching nucleotides, including degenerate bases
    """
    
    matches = matches or \
     {'A':{'A':None},'G':{'G':None},'C':{'C':None},\
      'T':{'T':None},'-':{'-':None}}
    for ambig, chars in IUPAC_DNA_ambiguities.items():
        try:
            matches[ambig].update({}.fromkeys(chars))
        except KeyError:
            matches[ambig] = {}.fromkeys(chars)
        
        for char in chars:
            try:
                matches[char].update({ambig:None})
            except KeyError:
                matches[char] = {ambig:None}
            
    def scorer(x, y):
        # need a better way to disallow unknown characters (could
        # try/except for a KeyError on the next step, but that would only 
        # test one of the characters)
        if x not in matches or y not in matches:
            raise ValueError, "Unknown character: %s or %s" % (x,y)
        if y in matches[x]:
            return match
        else:
            return mismatch
    return scorer


def get_primer_mismatches(primer_hit,
                          target_hit,
                          sw_scorer=match_scorer_ambigs(1, -1)):
    """ Gets mismatches for a given primer sequence and target hit
    
    Specifically this returns total mismatches
    
    primer_hit: Alignment object for primer, normally matches primer unless
     gaps were used in the alignment
    target_hit: Alignment object for segment of sequence where primer was
     aligned to.  Can contain gaps.
    sw_scorer: Gives scores for mismatches, gap insertions in alignment.
    
    """

    # Sum mismatches
    mismatches = 0
    for i in range(len(primer_hit)):
        # using the scoring function to check for
        # matches, but might want to just access the dict
        if sw_scorer(target_hit[i], primer_hit[i]) == -1: 
            mismatches += 1
            
    return mismatches

def get_aligned_pos(unaligned_index, aligned_seq):
    """ Returns aligned index given index in degapped sequence
    
    unaligned_index: int with index in unaligned sequence
    aligned_seq: aligned sequence to find index"""
    
    nt_counter = 0
    total_counter = 0
    
    for nt in aligned_seq:
        if nt in DNA_CODES:
            nt_counter += 1
        total_counter += 1
        if nt_counter == unaligned_index:
            break
            
    return total_counter

def pair_hmm_align_unaligned_seqs(seqs,
                                  moltype=DNA,
                                  params={}):
    """
        Handles pairwise alignment of given sequence pair
        
        seqs: list of [primer, target sequence] in string format
        moltype: molecule type tested.  Only DNA supported.
        params: Used to set parameters for opening, extending gaps  and score
         matrix if something other than the default given in this function 
         is desired.
    """
    
    seqs = LoadSeqs(data=seqs,moltype=moltype,aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError,\
         "Pairwise aligning of seqs requires exactly two seqs."
    
    try:
        gap_open = params['gap_open']
    except KeyError:
        gap_open = 5
    try:
        gap_extend = params['gap_extend']
    except KeyError:
        gap_extend = 2
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(\
         match=1, transition=-1, transversion=-1)
    
    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)

def local_align_primer_seq(primer,
                           sequence):
    """Perform local alignment of primer and sequence
    
        primer: Current primer being tested
        sequence: Current sequence
        
        Returns the Alignment object primer sequence and target sequence, 
         and the start position in sequence of the hit.
    """


    query_sequence = sequence
     
    # Get alignment object from primer, target sequence
    alignment = pair_hmm_align_unaligned_seqs([primer,query_sequence])

    # Extract sequence of primer, target site, may have gaps in insertions
    # or deletions have occurred.
    primer_hit = str(alignment.Seqs[0])
    target_hit = str(alignment.Seqs[1])
    
    # Get index of primer hit in target sequence.
    try:
        hit_start = query_sequence.index(target_hit.replace('-',''))
    except ValueError:
        raise ValueError,('substring not found, query string %s, target_hit %s'\
         % (query_sequence, target_hit))
         
    
    return primer_hit, target_hit, hit_start

def get_primer_indices_counts(curr_fasta,
                              f_primer_seq,
                              r_primer_seq,
                              mismatch_threshold=3):
    """ Gets counts of primer hit indices for forward and reverse primer
    
    curr_fasta:  current aligned fasta filepath
    f_primer_seq: forward primer sequence
    r_primer_seq: reverse primer sequence (already reverse complemented)
    mismatch_threshold: used to determine which primer hits are counted for
     getting the final aligned index"""
    
    f_primer_indices = {}
    r_primer_indices = {}
    
    for label, seq in MinimalFastaParser(open(curr_fasta), "U"):
            
            unaligned_seq = upper(seq.replace(".","").replace("-",""))
            
            primer_hit, target_hit, hit_start = \
             local_align_primer_seq(f_primer_seq, unaligned_seq)
             
            # get mismatches
            mismatches = get_primer_mismatches(primer_hit, target_hit)
            
            if mismatches <= mismatch_threshold:
                # Correction for forward primer to get 3' position
                aligned_pos = get_aligned_pos(hit_start + len(target_hit), seq)
                
                try:
                    f_primer_indices[aligned_pos] += 1
                except KeyError:
                    f_primer_indices[aligned_pos] = 0
            
            primer_hit, target_hit, hit_start = \
             local_align_primer_seq(r_primer_seq, unaligned_seq)
             
            # get mismatches
            mismatches = get_primer_mismatches(primer_hit, target_hit)
            
            if mismatches <= mismatch_threshold:
                # No correction for reverse primer hit
                aligned_pos = get_aligned_pos(hit_start, seq)
                
                try:
                    r_primer_indices[aligned_pos] += 1
                except KeyError:
                    r_primer_indices[aligned_pos] = 0
                    
    return f_primer_indices, r_primer_indices

def get_final_indices(f_primer_indices,
                      r_primer_indices):
    """ Sorts, retrieves mode of aligning primer hits indices
    
    f_primer_indices:  dictionary of primer_indices:counts
    r_primer_indices:  dictionary of primer_indices:counts
    """
    
    f_primer_final = []
    r_primer_final = []
    
    for index in f_primer_indices:
        f_primer_final.append((index, f_primer_indices[index]))
    f_primer_final.sort(key=itemgetter(1), reverse=True)
    f_primer_index = f_primer_final[0][0]

    for index in r_primer_indices:
        r_primer_final.append((index, r_primer_indices[index]))
    r_primer_final.sort(key=itemgetter(1), reverse=True)
    r_primer_index = r_primer_final[0][0]
    
    
    return f_primer_index, r_primer_index

primer_indices = {}

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['27f'], primer_seqs['338r'])
primer_indices['27f'], primer_indices['338r'] = get_final_indices(primer_f_indices, primer_r_indices)

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['349f'], primer_seqs['534r'])
primer_indices['349f'], primer_indices['534r'] = get_final_indices(primer_f_indices, primer_r_indices)

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['515f'], primer_seqs['806r'])
primer_indices['515f'], primer_indices['806r'] = get_final_indices(primer_f_indices, primer_r_indices)

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['967f'], primer_seqs['1048r'])
primer_indices['967f'], primer_indices['1048r'] = get_final_indices(primer_f_indices, primer_r_indices)

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['1391f'], primer_seqs['1492r'])
primer_indices['1391f'], primer_indices['1492r'] = get_final_indices(primer_f_indices, primer_r_indices)

primer_f_indices, primer_r_indices = get_primer_indices_counts(reference_aligned_file, primer_seqs['515f'], primer_seqs['1391r'])
primer_indices['515f'], primer_indices['1391r'] = get_final_indices(primer_f_indices, primer_r_indices)

# Need to manually add the full length read, ('full.length', 0, 7682), which covers the entire Greengenes alignment.

region_boundaries = [
      ('v2', primer_indices['27f'], primer_indices['338r']),
      ('v2.v3', primer_indices['27f'], primer_indices['534r']),
      ('v2.v4', primer_indices['27f'], primer_indices['806r']),
      ('v2.v6', primer_indices['27f'], primer_indices['1048r']),
      ('v2.v8', primer_indices['27f'], primer_indices['1391r']),
      ('v2.v9', primer_indices['27f'], primer_indices['1492r']),
      ('v3', primer_indices['349f'], primer_indices['534r']),
      ('v3.v4', primer_indices['349f'], primer_indices['806r']),
      ('v3.v6', primer_indices['349f'], primer_indices['1048r']),
      ('v3.v8', primer_indices['349f'], primer_indices['1391r']),
      ('v3.v9', primer_indices['349f'], primer_indices['1492r']),
      ('v4', primer_indices['515f'], primer_indices['806r']),
      ('v4.v6', primer_indices['515f'], primer_indices['1048r']),
      ('v4.v8', primer_indices['515f'], primer_indices['1391r']),
      ('v4.v9', primer_indices['515f'], primer_indices['1492r']),
      ('v6', primer_indices['967f'], primer_indices['1048r']),
      ('v6.v8', primer_indices['967f'], primer_indices['1391r']),
      ('v6.v9', primer_indices['967f'], primer_indices['1492r']),
      ('v9', primer_indices['1391f'],  primer_indices['1492r'])
]

region_index = 0
start_index = 1
end_index = 2

# Print in easy format to generate a list
for curr_region in region_boundaries:
    print "('%s', %d, %d)," % (curr_region[region_index], curr_region[start_index], curr_region[end_index])
    
# Manually print full length alignment
print "('full.length', 0, 7682),"

ix_sizes = [150, 250, 400]

names_ix = ['.150', '.250', '.400']

# We used the first sequence in the 97% clustered file for the demonstration, so using the
# same file here to recreate exact results
reference97_aligned_file = '/home/ubuntu/qiime_software/gg_otus-4feb2011-release/rep_set/gg_97_otus_4feb2011_aligned.fasta'

f = open(reference97_aligned_file, "U")

for label, seq in MinimalFastaParser(f):
    curr_seq = seq
    break
    
f.close()

nts = ['A', 'T', 'C', 'G', 'N']


for curr_region in region_boundaries:

    for forward_ix in range(len(ix_sizes)):
        curr_name = curr_region[region_index]
        curr_name += names_ix[forward_ix]
        
        start_ix = curr_region[start_index]
        end_ix = curr_region[end_index]

        curr_slice = curr_seq[start_ix:end_ix]

        
        counter = 0
        target_count = ix_sizes[forward_ix]
        
        curr_forward_ix = start_ix
        for nt in curr_slice:
            if nt in nts:
                counter += 1
            curr_forward_ix += 1
            # Skip if amplicon is smaller than current read size
            if curr_forward_ix == end_ix:
                break
            if counter == target_count:
                print "('%s', %d, %d)," % (curr_name, start_ix, curr_forward_ix)
                break 
Here is the raw output of the previous steps that can be easily put into the list base_region_boundaries in the main notebook:
('v2', 136, 1868),
('v2.v3', 136, 2232),
('v2.v4', 136, 4051),
('v2.v6', 136, 4932),
('v2.v8', 136, 6426),
('v2.v9', 136, 6791),
('v3', 1916, 2232),
('v3.v4', 1916, 4051),
('v3.v6', 1916, 4932),
('v3.v8', 1916, 6426),
('v3.v9', 1916, 6791),
('v4', 2263, 4051),
('v4.v6', 2263, 4932),
('v4.v8', 2263, 6426),
('v4.v9', 2263, 6791),
('v6', 4653, 4932),
('v6.v8', 4653, 6426),
('v6.v9', 4653, 6791),
('v9', 6450, 6791),
('full.length', 0, 7682),
('v2.150', 136, 702),
('v2.250', 136, 1752),
('v2.v3.150', 136, 702),
('v2.v3.250', 136, 1752),
('v2.v3.400', 136, 2036),
('v2.v4.150', 136, 702),
('v2.v4.250', 136, 1752),
('v2.v4.400', 136, 2036),
('v2.v6.150', 136, 702),
('v2.v6.250', 136, 1752),
('v2.v6.400', 136, 2036),
('v2.v8.150', 136, 702),
('v2.v8.250', 136, 1752),
('v2.v8.400', 136, 2036),
('v2.v9.150', 136, 702),
('v2.v9.250', 136, 1752),
('v2.v9.400', 136, 2036),
('v3.v4.150', 1916, 2235),
('v3.v4.250', 1916, 2493),
('v3.v4.400', 1916, 4014),
('v3.v6.150', 1916, 2235),
('v3.v6.250', 1916, 2493),
('v3.v6.400', 1916, 4014),
('v3.v8.150', 1916, 2235),
('v3.v8.250', 1916, 2493),
('v3.v8.400', 1916, 4014),
('v3.v9.150', 1916, 2235),
('v3.v9.250', 1916, 2493),
('v3.v9.400', 1916, 4014),
('v4.150', 2263, 3794),
('v4.250', 2263, 4046),
('v4.v6.150', 2263, 3794),
('v4.v6.250', 2263, 4046),
('v4.v6.400', 2263, 4574),
('v4.v8.150', 2263, 3794),
('v4.v8.250', 2263, 4046),
('v4.v8.400', 2263, 4574),
('v4.v9.150', 2263, 3794),
('v4.v9.250', 2263, 4046),
('v4.v9.400', 2263, 4574),
('v6.v8.150', 4653, 5085),
('v6.v8.250', 4653, 5903),
('v6.v8.400', 4653, 6419),
('v6.v9.150', 4653, 5085),
('v6.v9.250', 4653, 5903),
('v6.v9.400', 4653, 6419),

But there are some duplicate regions here, such as v2.150 and v2.v3.150. These can be manually removed to get this:
('v2', 136, 1868),
('v2.v3', 136, 2232),
('v2.v4', 136, 4051),
('v2.v6', 136, 4932),
('v2.v8', 136, 6426),
('v2.v9', 136, 6791),
('v3', 1916, 2232),
('v3.v4', 1916, 4051),
('v3.v6', 1916, 4932),
('v3.v8', 1916, 6426),
('v3.v9', 1916, 6791),
('v4', 2263, 4051),
('v4.v6', 2263, 4932),
('v4.v8', 2263, 6426),
('v4.v9', 2263, 6791),
('v6', 4653, 4932),
('v6.v8', 4653, 6426),
('v6.v9', 4653, 6791),
('v9', 6450, 6791),
('full.length', 0, 7682),
('v2.150', 136, 702),
('v2.250', 136, 1752),
('v2.v3.400', 136, 2036),
('v3.v4.150', 1916, 2235),
('v3.v4.250', 1916, 2493),
('v3.v4.400', 1916, 4014),
('v4.150', 2263, 3794),
('v4.250', 2263, 4046),
('v4.v6.400', 2263, 4574),
('v6.v8.150', 4653, 5085),
('v6.v8.250', 4653, 5903),
('v6.v8.400', 4653, 6419)

Which is the same subset of sequences used to define base_region_boundaries (copied from the main demonstration notebook):
      ('v2', 136, 1868), #27f-338r
      ('v2.v3', 136, 2232),
      ('v2.v4', 136, 4051),
      ('v2.v6', 136, 4932),
      ('v2.v8', 136, 6426),
      ('v2.v9', 136, 6791),
      ('v3', 1916, 2232), #349f-534r
      ('v3.v4', 1916, 4051),
      ('v3.v6', 1916, 4932),
      ('v3.v8', 1916, 6426),
      ('v3.v9', 1916, 6791),
      ('v4', 2263, 4051), #515f-806r
      ('v4.v6', 2263, 4932),
      ('v4.v8', 2263, 6426),
      ('v4.v9', 2263, 6791),
      ('v6', 4653, 4932), #967f-1048r
      ('v6.v8', 4653, 6426),
      ('v6.v9', 4653, 6791),
      ('v9', 6450, 6791), #1391f-1492r
      ('full.length', 0, 7682),  # Start 150, 250, 400 base pair reads
      ('v2.150', 136, 702),
      ('v2.250', 136, 1752),
      ('v2.v3.400', 136, 2036), # Skips reads that are larger than amplicon size
      ('v3.v4.150', 1916, 2235),
      ('v3.v4.250', 1916, 2493),
      ('v3.v4.400', 1916, 4014),
      ('v4.150', 2263, 3794),
      ('v4.250', 2263, 4046),
      ('v4.v6.400', 2263, 4574),
      ('v6.v8.150', 4653, 5085),
      ('v6.v8.250', 4653, 5903),
      ('v6.v8.400', 4653, 6419)