%matplotlib inline import matplotlib.pyplot as plt import numpy as np from biom import load_table from biom.parse import MetadataMap from collections import Counter !curl -OL https://github.com/biocore/American-Gut/raw/master/data/AG/AG_100nt_even10k.txt !curl -OL https://github.com/biocore/American-Gut/raw/master/data/AG/AG_100nt_even10k.biom metadata = MetadataMap.from_file('AG_100nt_even10k.txt') table = load_table('AG_100nt_even10k.biom') table.add_metadata(metadata) def compute_shared_otus(table_full, category, values): """Compute shared OTUs for a given category and values Parameters ---------- table_full : Table A BIOM table. It is assumed that this table has sample metadata associated with it. category : str The metadata category values : list of str The values of interest within the category Returns ------- np.array Observed percentages np.array Observed top OTU IDs list Observed top taxa int Number of samples in the filtered table """ def _get_taxa(t, id_): md = t.metadata(id_, axis='observation') if md is None: taxa = '' else: taxa = '; '.join(md['taxonomy']) return taxa table = table_full.pa(inplace=False) # filter to just those samples that meet our metadata category and values cat_fn = lambda v, i, md: md[category] in values site = table.filter(cat_fn, inplace=False) # normalize norm_fn = lambda v, i, md: v / len(site.ids(axis='sample')) site.transform(norm_fn) # compute percents obs_full = site.sum('observation') obs_percents = np.array([(obs_full > i).sum() for i in np.arange(0.0, 1.0, 0.01)], dtype=float) # determine the taxa that appear to be highly shared top_ids = site.ids(axis='observation')[np.argwhere(obs_full > 0.95)].squeeze() top_taxa = [] if len(np.atleast_1d(top_ids)) > 0: if not top_ids.shape: # array of scalar value top_ids = np.array(str(top_ids)) top_taxa.append(_get_taxa(site, str(top_ids))) else: for id_ in top_ids: top_taxa.append(_get_taxa(site, id_)) return obs_percents, top_ids, top_taxa, len(site.ids(axis='sample')) 
# Prefilter: keep only fecal samples.
fecal_filter = lambda v, i, md: md['BODY_SITE'] == 'UBERON:feces'


def compute_multiple_values(table_full, category, min_count=25,
                            prefilter=fecal_filter, iterations=10,
                            discretize=None):
    """Compute shared OTUs over multiple category values

    This method will print summary details about the category being examined
    which can be used to inform a good min_count.

    Parameters
    ----------
    table_full : Table
        The full table to operate on.
    category : str
        The metadata category to examine.
    min_count : int, optional
        The minimum number of samples that must be associated with a
        category value. Defaults to 25; ``None`` is treated as the default.
    prefilter : func
        Prefilter, e.g., down to just fecal samples
    iterations : int, optional
        Number of rarefactions to perform
    discretize : list of tuple, optional
        Bins to discretize the data. Expected form: [(min, max)]

    Returns
    -------
    np.array
        The mean observed percents in row order with observed values
    np.array
        The stderr observed percents in row order with observed values
    list of str
        The observed category values
    int
        Sample depth used

    Raises
    ------
    ValueError
        If no category value is associated with more than min_count samples.

    Notes
    -----
    This method ignores NA and no_data category values
    """
    def make_filter_f(md_field, md_val):
        """Construct a filter function based on a metadata category and value

        If md_val is a tuple, a discretization is attempted.
        """
        if isinstance(md_val, tuple):
            min_, max_ = md_val

            def f(v, i, samp_md):
                """Return True if min_ <= sample value < max_"""
                try:
                    return min_ <= float(samp_md[md_field]) < max_
                except ValueError:
                    # non-numeric values never fall in a numeric bin
                    return False
        else:
            def f(v, i, samp_md):
                """Return True if the sample is associated with md_val"""
                return samp_md[md_field] == md_val
        return f

    if min_count is None:
        # tolerate callers that explicitly pass None (e.g., wrappers that
        # forward their own optional argument)
        min_count = 25

    # presence/absence of the prefiltered samples; filter(inplace=False)
    # returns a copy, so the in-place pa() only mutates that copy
    pa = table_full.filter(prefilter, inplace=False).pa()

    # get the counts within each category value
    metadata_counts = Counter()
    for md in pa.metadata(axis='sample'):
        # ignore missing data
        if md[category] in ['NA', 'no_data', '']:
            continue

        if discretize is not None:
            try:
                observed = float(md[category])
            except ValueError:
                # skip unparseable values; mirrors make_filter_f, which also
                # treats them as matching no bin
                continue
            for min_, max_ in discretize:
                if min_ <= observed < max_:
                    metadata_counts[(min_, max_)] += 1
                    break
        else:
            metadata_counts[md[category]] += 1

    # drop singleton category values
    metadata_counts = {k: v for k, v in metadata_counts.items() if v > 1}

    print("Category counts for %s:" % category)
    for k, v in sorted(metadata_counts.items(), key=lambda item: item[1],
                       reverse=True):
        print(" %s: %d" % (k, v))

    # determine the minimum sampling depth over sufficiently large groups
    eligible = [v for v in metadata_counts.values() if v > min_count]
    if not eligible:
        raise ValueError("No value of %s has more than %d samples" %
                         (category, min_count))
    sample_depth = min(eligible)

    metadata_values = [k for k, v in metadata_counts.items()
                       if v >= sample_depth]

    observed_means = []
    observed_stderrs = []
    observed_values = []
    for obs_v in sorted(metadata_values):
        if discretize is not None:
            observed_values.append("%d <= x < %d" % obs_v)
        else:
            observed_values.append(obs_v)

        # filter to the specific category value
        md_specific = pa.filter(make_filter_f(category, obs_v),
                                inplace=False)
        ids = md_specific.ids(axis='sample').copy()

        results = np.zeros((iterations, 100), dtype=float)
        for idx in range(iterations):
            # subsample the table without replacement to sample_depth
            np.random.shuffle(ids)
            subsampled_ids = set(ids[:sample_depth])
            filter_f = lambda v, i, md: i in subsampled_ids
            ss = md_specific.filter(filter_f, inplace=False)
            # drop OTUs absent from this subsample
            ss.filter(lambda v, i, md: sum(v) > 0, axis='observation')

            # normalize so an OTU's sum is the fraction of samples with it
            n_ss = len(ss.ids(axis='sample'))
            norm_f = lambda v, i, md: v / n_ss
            ss.transform(norm_f)

            # compute percents
            obs_full = ss.sum('observation')
            results[idx] = np.array([(obs_full > j).sum()
                                     for j in np.arange(0.0, 1.0, 0.01)],
                                    dtype=float)

        observed_means.append(results.mean(axis=0))
        observed_stderrs.append(results.std(axis=0) / np.sqrt(iterations))

    return (np.asarray(observed_means), np.asarray(observed_stderrs),
            observed_values, sample_depth)


def plot_shared_otus(n_samples, percents, labels, title=None, depth=None,
                     stderrs=None, start=0):
    """Plot the shared OTUs

    Parameters
    ----------
    n_samples : int
        The number of samples reflected
    percents : np.array
        The percents to plot; a 1-d array is treated as a single series
    labels : list of str
        Legend labels, one per row of percents
    title : str, optional
        Additional title name
    depth : int, optional
        The subsample depth
    stderrs : np.array, optional
        Per-series standard errors; when provided, points are drawn with
        error bars instead of lines
    start : int, optional
        Index of the first of the 100 percent bins to plot
    """
    if len(percents.shape) == 1:
        percents = np.array([percents])

    plt.figure()
    ax_count = plt.gca()

    title_str = 'Shared OTUs'
    if title is not None:
        title_str += ': %s' % title
    if depth is not None:
        title_str += ', n=%d' % depth

    ax_count.set_title(title_str)
    ax_count.set_yscale('log')
    ax_count.set_ylabel('Number of OTUs')
    ax_count.set_xlabel('Number of samples')

    # express the percent bins as absolute sample counts on the x axis
    x_ticks = np.arange(0.0, 1.0, 0.01) * n_samples
    for idx, row in enumerate(percents):
        if stderrs is not None:
            ax_count.errorbar(x_ticks[start:], row[start:],
                              yerr=stderrs[idx][start:], fmt='o',
                              markersize=5, markeredgewidth=0.25)
        else:
            ax_count.plot(x_ticks[start:], row[start:])

    plt.legend(labels, numpoints=1)
    plt.tight_layout()
    ax_count.grid()


def helper_single(category, values, label):
    """Compute, plot, and summarize shared OTUs for one set of values"""
    percents, top_ids, top_taxa, n_samples = compute_shared_otus(
        table, category, values)
    plot_shared_otus(n_samples, percents, [label])

    print("Top observed shared taxa:")
    for t in top_taxa:
        print(' %s' % t)


def helper_multiple(category, title, min_count=25, discretize=None):
    """Compute and plot shared OTUs across all values of a category"""
    # Forward min_count as a keyword. Previously the default here was None
    # and it was passed positionally, so compute_multiple_values compared
    # group sizes against None — an accident that only "worked" in Python 2.
    means, stderrs, values, depth = compute_multiple_values(
        table, category, min_count=min_count, discretize=discretize)
    plot_shared_otus(depth, means, values, title=title, depth=depth)


helper_single('BODY_SITE', ['UBERON:feces'], 'Fecal')
helper_single('BODY_SITE', ['UBERON:skin', 'UBERON:hand'], 'Skin')
# Oral community: tongue samples only.
helper_single('BODY_SITE', ['UBERON:tongue'], 'Oral')

# Categorical metadata fields to sweep, in presentation order.
for _category, _title in [('ANTIBIOTIC_SELECT', 'Antibiotic use'),
                          ('SEASONAL_ALLERGIES', 'Seasonal allergies'),
                          ('DIET_TYPE', 'Diet type'),
                          ('TYPES_OF_PLANTS', 'Number of different plants')]:
    helper_multiple(_category, _title)

# BMI is continuous, so discretize into the standard weight classes:
# [0, 18), [18, 25), [25, 30), [30, 35), [35, 40).
_bmi_edges = (0, 18, 25, 30, 35, 40)
BMI_ranges = list(zip(_bmi_edges[:-1], _bmi_edges[1:]))
helper_multiple('BMI', 'BMI', discretize=BMI_ranges)