from biom import load_table t1 = load_table('table.biom') level = 6 def collapse_f(id_, md): return ';'.join(md['taxonomy'][:level]) collapsed_t1 = t1.collapse(collapse_f, axis='observation') len(collapsed_t1.observation_ids) !summarize_taxa.py -i table.biom -o summarize_taxa_out/ t2 = load_table('summarize_taxa_out/table_L6.biom') len(t2.observation_ids) level = 6 def collapse_f(id_, md): return ';'.join(md['taxonomy'][:level]) collapsed_t1 = t1.collapse(collapse_f, axis='observation', min_group_size=1) len(collapsed_t1.observation_ids) from os.path import exists if not exists('table.from_biom_w_taxonomy.txt'): !biom convert -i table.biom -o table.from_biom_w_taxonomy.txt --to-tsv --header-key taxonomy import pandas as pd t3 = pd.read_csv('table.from_biom_w_taxonomy.txt', sep='\t', skiprows=2, index_col="otu-id", names=['otu-id', 'count', 'taxonomy']) # All counts are greater than zero len(t3[t3["count"] == 0]) t3_taxa = [] for e in t3['taxonomy']: t = [x.strip() for x in e.split(';')] t3_taxa.append(';'.join(t[:level])) print len(set(t3_taxa)) print set(t3_taxa) - set(t2.observation_ids) print set(t2.observation_ids) - set(t3_taxa) print set(collapsed_t1.observation_ids) - set(t2.observation_ids) print set(t2.observation_ids) - set(collapsed_t1.observation_ids) set(collapsed_t1.observation_ids) - set(t2.observation_ids) missing_ids = set(t2.observation_ids) - set(collapsed_t1.observation_ids) missing_ids from collections import defaultdict tax_to_ids = defaultdict(list) for e in t3.index: tax = t3['taxonomy'][e].replace(' ','') try: s_index = tax.index(';s__') except ValueError: continue tax = tax[:s_index] tax_to_ids[tax].append(e) singles = [] for tax, ids in tax_to_ids.items(): if len(ids) == 1: singles.append(tax) print set(singles) - set(missing_ids) print len(singles) print tax_to_ids['k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Anaerostipes'] print len(t3.index) print len(set(t3.index)) print t2 r = [] for m in missing_ids: obs_idx = t2.index(m,axis='observation') r.append((t2.get_value_by_ids(m,'MockMiSeq.even'), t2.observation_ids[obs_idx])) r.sort() for e in r: print e r = [] for e in list(t2.iter(axis='observation')): r.append((e[0][0], e[1])) r.sort() for e in r: print e !print_qiime_config.py f = open('test.biom','w') f.write("""{ "columns": [ { "id": "Sample1", "metadata": { "BarcodeSequence": "AGCACGAGCCTA", "DOB": 20060805 } }, { "id": "Sample2", "metadata": { "BarcodeSequence": "AACTCGTCGATG", "DOB": 20060216 } }, { "id": "Sample3", "metadata": { "BarcodeSequence": "ACAGACCACTCA", "DOB": 20060109 } }, { "id": "Sample4", "metadata": { "BarcodeSequence": "ACCAGCGACTAG", "DOB": 20070530 } }, { "id": "Sample5", "metadata": { "BarcodeSequence": "AGCAGCACTTGT", "DOB": 20070101 } }, { "id": "Sample6", "metadata": { "BarcodeSequence": "AGCAGCACAACT", "DOB": 20070716 } } ], "data": [ [0, 2, 1.0], [1, 0, 5.0], [1, 1, 1.0], [1, 3, 2.0], [1, 4, 3.0], [1, 5, 1.0], [2, 2, 1.0], [2, 3, 4.0], [2, 5, 2.0], [3, 0, 2.0], [3, 1, 1.0], [3, 2, 1.0], [3, 5, 1.0], [4, 1, 1.0], [4, 2, 1.0] ], "date": "2012-12-11T07:30:29.870689", "format": "Biological Observation Matrix 1.0.0", "format_url": "http://biom-format.org", "generated_by": "some software package", "id": null, "matrix_element_type": "float", "matrix_type": "sparse", "rows": [ { "id": "GG_OTU_1", "metadata": { "confidence": 0.665, "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"] } }, { "id": "GG_OTU_2", "metadata": { "confidence": 0.98, "taxonomy": ["Root", "k__Bacteria"] } }, { "id": "GG_OTU_3", "metadata": { "confidence": 1.0, "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"] } }, { "id": "GG_OTU_4", "metadata": { "confidence": 0.842, "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"] } }, { "id": "GG_OTU_5", "metadata": { "confidence": 1.0, "taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce"] } } ], "shape": [5, 6], "type": "OTU table" }""") f.close() t4 = load_table('test.biom') for e in t4.observation_metadata: print e['taxonomy'] def collapse_on_family(id_, md): return ';'.join(md['taxonomy'][:5]) collapsed_t4 = t4.collapse(collapse_f, axis='observation') print collapsed_t4