%matplotlib inline import matplotlib.pyplot as plt import numpy as np from biom import load_table from biom.parse import MetadataMap from collections import Counter !curl -OL https://github.com/biocore/American-Gut/raw/master/data/AG/AG_100nt_even10k.txt !curl -OL https://github.com/biocore/American-Gut/raw/master/data/AG/AG_100nt_even10k.biom metadata = MetadataMap.from_file('AG_100nt_even10k.txt') table = load_table('AG_100nt_even10k.biom') table.add_metadata(metadata) def compute_shared_otus(table_full, category, values): """Compute shared OTUs for a given category and values Parameters ---------- table_full : Table A BIOM table. It is assumed that this table has sample metadata associated with it. category : str The metadata category values : list of str The values of interest within the category Returns ------- np.array Observed percentages np.array Observed top OTU IDs list Observed top taxa int Number of samples in the filtered table """ def _get_taxa(t, id_): md = t.metadata(id_, axis='observation') if md is None: taxa = '' else: taxa = '; '.join(md['taxonomy']) return taxa table = table_full.pa(inplace=False) # filter to just those samples that meet our metadata category and values cat_fn = lambda v, i, md: md[category] in values site = table.filter(cat_fn, inplace=False) # normalize norm_fn = lambda v, i, md: v / len(site.ids(axis='sample')) site.transform(norm_fn) # compute percents obs_full = site.sum('observation') obs_percents = np.array([(obs_full > i).sum() for i in np.arange(0.0, 1.0, 0.01)], dtype=float) # determine the taxa that appear to be highly shared top_ids = site.ids(axis='observation')[np.argwhere(obs_full > 0.95)].squeeze() top_taxa = [] if len(np.atleast_1d(top_ids)) > 0: if not top_ids.shape: # array of scalar value top_ids = np.array(str(top_ids)) top_taxa.append(_get_taxa(site, str(top_ids))) else: for id_ in top_ids: top_taxa.append(_get_taxa(site, id_)) return obs_percents, top_ids, top_taxa, len(site.ids(axis='sample')) 
# Prefilter: keep only fecal samples.
fecal_filter = lambda v, i, md: md['BODY_SITE'] == 'UBERON:feces'


def compute_multiple_values(table_full, category, min_count=25,
                            prefilter=fecal_filter, iterations=10,
                            discretize=None):
    """Compute shared OTUs over multiple category values

    This method will print summary details about the category being examined
    which can be used to inform a good min_count.

    Parameters
    ----------
    table_full : Table
        The full table to operate on.
    category : str
        The metadata category to examine.
    min_count : int, optional
        The minimum number of samples that must be associated with a
        category value. Defaults to 25; ``None`` is treated as the default.
    prefilter : func
        Prefilter, e.g., down to just fecal samples
    iterations : int, optional
        Number of rarefactions to perform
    discretize : list of tuple, optional
        Bins to discretize the data. Expected form: [(min, max)]

    Returns
    -------
    np.array
        The mean observed percents in row order with observed values
    np.array
        The stderr observed percents in row order with observed values
    list of str
        The observed category values
    int
        Sample depth used

    Raises
    ------
    ValueError
        If no category value is associated with more than min_count samples.

    Notes
    -----
    This method ignores NA and no_data category values
    """
    def make_filter_f(md_field, md_val):
        """Construct a filter function based on a metadata category and value

        If md_val is a tuple, a discretization is attempted.
        """
        if isinstance(md_val, tuple):
            min_, max_ = md_val

            def f(v, i, samp_md):
                """Return True if min_ <= sample value < max_"""
                try:
                    return min_ <= float(samp_md[md_field]) < max_
                except ValueError:
                    # non-numeric values never fall in a numeric bin
                    return False
        else:
            def f(v, i, samp_md):
                """Return True if the sample is associated with md_val"""
                return samp_md[md_field] == md_val
        return f

    if min_count is None:
        # tolerate callers that explicitly pass None (e.g., wrappers that
        # forward their own optional argument)
        min_count = 25

    # presence/absence of the prefiltered samples; filter(inplace=False)
    # returns a copy, so the in-place pa() only mutates that copy
    pa = table_full.filter(prefilter, inplace=False).pa()

    # get the counts within each category value
    metadata_counts = Counter()
    for md in pa.metadata(axis='sample'):
        # ignore missing data
        if md[category] in ['NA', 'no_data', '']:
            continue

        if discretize is not None:
            try:
                observed = float(md[category])
            except ValueError:
                # skip unparseable values; mirrors make_filter_f, which also
                # treats them as matching no bin
                continue
            for min_, max_ in discretize:
                if min_ <= observed < max_:
                    metadata_counts[(min_, max_)] += 1
                    break
        else:
            metadata_counts[md[category]] += 1

    # drop singleton category values
    metadata_counts = {k: v for k, v in metadata_counts.items() if v > 1}

    print("Category counts for %s:" % category)
    for k, v in sorted(metadata_counts.items(), key=lambda item: item[1],
                       reverse=True):
        print(" %s: %d" % (k, v))

    # determine the minimum sampling depth over sufficiently large groups
    eligible = [v for v in metadata_counts.values() if v > min_count]
    if not eligible:
        raise ValueError("No value of %s has more than %d samples" %
                         (category, min_count))
    sample_depth = min(eligible)

    metadata_values = [k for k, v in metadata_counts.items()
                       if v >= sample_depth]

    observed_means = []
    observed_stderrs = []
    observed_values = []
    for obs_v in sorted(metadata_values):
        if discretize is not None:
            observed_values.append("%d <= x < %d" % obs_v)
        else:
            observed_values.append(obs_v)

        # filter to the specific category value
        md_specific = pa.filter(make_filter_f(category, obs_v),
                                inplace=False)
        ids = md_specific.ids(axis='sample').copy()

        results = np.zeros((iterations, 100), dtype=float)
        for idx in range(iterations):
            # subsample the table without replacement to sample_depth
            np.random.shuffle(ids)
            subsampled_ids = set(ids[:sample_depth])
            filter_f = lambda v, i, md: i in subsampled_ids
            ss = md_specific.filter(filter_f, inplace=False)
            # drop OTUs absent from this subsample
            ss.filter(lambda v, i, md: sum(v) > 0, axis='observation')

            # normalize so an OTU's sum is the fraction of samples with it
            n_ss = len(ss.ids(axis='sample'))
            norm_f = lambda v, i, md: v / n_ss
            ss.transform(norm_f)

            # compute percents
            obs_full = ss.sum('observation')
            results[idx] = np.array([(obs_full > j).sum()
                                     for j in np.arange(0.0, 1.0, 0.01)],
                                    dtype=float)

        observed_means.append(results.mean(axis=0))
        observed_stderrs.append(results.std(axis=0) / np.sqrt(iterations))

    return (np.asarray(observed_means), np.asarray(observed_stderrs),
            observed_values, sample_depth)


def plot_shared_otus(n_samples, percents, labels, title=None, depth=None,
                     stderrs=None, start=0):
    """Plot the shared OTUs

    Parameters
    ----------
    n_samples : int
        The number of samples reflected
    percents : np.array
        The percents to plot; a 1-d array is treated as a single series
    labels : list of str
        Legend labels, one per row of percents
    title : str, optional
        Additional title name
    depth : int, optional
        The subsample depth
    stderrs : np.array, optional
        Per-series standard errors; when provided, points are drawn with
        error bars instead of lines
    start : int, optional
        Index of the first of the 100 percent bins to plot
    """
    if len(percents.shape) == 1:
        percents = np.array([percents])

    plt.figure()
    ax_count = plt.gca()

    title_str = 'Shared OTUs'
    if title is not None:
        title_str += ': %s' % title
    if depth is not None:
        title_str += ', n=%d' % depth

    ax_count.set_title(title_str)
    ax_count.set_yscale('log')
    ax_count.set_ylabel('Number of OTUs')
    ax_count.set_xlabel('Number of samples')

    # express the percent bins as absolute sample counts on the x axis
    x_ticks = np.arange(0.0, 1.0, 0.01) * n_samples
    for idx, row in enumerate(percents):
        if stderrs is not None:
            ax_count.errorbar(x_ticks[start:], row[start:],
                              yerr=stderrs[idx][start:], fmt='o',
                              markersize=5, markeredgewidth=0.25)
        else:
            ax_count.plot(x_ticks[start:], row[start:])

    plt.legend(labels, numpoints=1)
    plt.tight_layout()
    ax_count.grid()


def helper_single(category, values, label):
    """Compute, plot, and summarize shared OTUs for one set of values"""
    percents, top_ids, top_taxa, n_samples = compute_shared_otus(
        table, category, values)
    plot_shared_otus(n_samples, percents, [label])

    print("Top observed shared taxa:")
    for t in top_taxa:
        print(' %s' % t)


def helper_multiple(category, title, min_count=25, discretize=None):
    """Compute and plot shared OTUs across all values of a category"""
    # Forward min_count as a keyword. Previously the default here was None
    # and it was passed positionally, so compute_multiple_values compared
    # group sizes against None — an accident that only "worked" in Python 2.
    means, stderrs, values, depth = compute_multiple_values(
        table, category, min_count=min_count, discretize=discretize)
    plot_shared_otus(depth, means, values, title=title, depth=depth)


helper_single('BODY_SITE', ['UBERON:feces'], 'Fecal')
helper_single('BODY_SITE', ['UBERON:skin', 'UBERON:hand'], 'Skin')
# Oral community: tongue samples only.
helper_single('BODY_SITE', ['UBERON:tongue'], 'Oral')

# Categorical metadata fields to sweep, in presentation order.
for _category, _title in [('ANTIBIOTIC_SELECT', 'Antibiotic use'),
                          ('SEASONAL_ALLERGIES', 'Seasonal allergies'),
                          ('DIET_TYPE', 'Diet type'),
                          ('TYPES_OF_PLANTS', 'Number of different plants')]:
    helper_multiple(_category, _title)

# BMI is continuous, so discretize into the standard weight classes:
# [0, 18), [18, 25), [25, 30), [30, 35), [35, 40).
_bmi_edges = (0, 18, 25, 30, 35, 40)
BMI_ranges = list(zip(_bmi_edges[:-1], _bmi_edges[1:]))
helper_multiple('BMI', 'BMI', discretize=BMI_ranges)