f = open('test.biom','w')
f.write("""{
"columns": [
{
"id": "Sample1",
"metadata": {
"BarcodeSequence": "AGCACGAGCCTA",
"DOB": 20060805
}
},
{
"id": "Sample2",
"metadata": {
"BarcodeSequence": "AACTCGTCGATG",
"DOB": 20060216
}
},
{
"id": "Sample3",
"metadata": {
"BarcodeSequence": "ACAGACCACTCA",
"DOB": 20060109
}
},
{
"id": "Sample4",
"metadata": {
"BarcodeSequence": "ACCAGCGACTAG",
"DOB": 20070530
}
},
{
"id": "Sample5",
"metadata": {
"BarcodeSequence": "AGCAGCACTTGT",
"DOB": 20070101
}
},
{
"id": "Sample6",
"metadata": {
"BarcodeSequence": "AGCAGCACAACT",
"DOB": 20070716
}
}
],
"data": [
[0, 2, 1.0],
[1, 0, 5.0],
[1, 1, 1.0],
[1, 3, 2.0],
[1, 4, 3.0],
[1, 5, 1.0],
[2, 2, 1.0],
[2, 3, 4.0],
[2, 5, 2.0],
[3, 0, 2.0],
[3, 1, 1.0],
[3, 2, 1.0],
[3, 5, 1.0],
[4, 1, 1.0],
[4, 2, 1.0]
],
"date": "2012-12-11T07:30:29.870689",
"format": "Biological Observation Matrix 1.0.0",
"format_url": "http://biom-format.org",
"generated_by": "some software package",
"id": null,
"matrix_element_type": "float",
"matrix_type": "sparse",
"rows": [
{
"id": "GG_OTU_1",
"metadata": {
"confidence": 0.665,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_2",
"metadata": {
"confidence": 0.98,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce1"]
}
},
{
"id": "GG_OTU_3",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_4",
"metadata": {
"confidence": 0.842,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__Lachnospiraceae"]
}
},
{
"id": "GG_OTU_5",
"metadata": {
"confidence": 1.0,
"taxonomy": ["Root", "k__Bacteria", "p__Firmicutes", "c__Clostridia", "o__Clostridiales", "f__OnlyOnce2"]
}
}
],
"shape": [5, 6],
"type": "OTU table"
}""")
f.close()
from biom import load_table
t = load_table('test.biom')
for e in t.observation_metadata:
print e['taxonomy']
[u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae'] [u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__OnlyOnce1'] [u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae'] [u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__Lachnospiraceae'] [u'Root', u'k__Bacteria', u'p__Firmicutes', u'c__Clostridia', u'o__Clostridiales', u'f__OnlyOnce2']
When collapsing with Table.collapse
, there is only one resulting OTU. The two families that were associated with only one OTU each are dropped.
def collapse_on_family(id_, md):
return ';'.join(md['taxonomy'][:6])
t_biom_collapsed = t.collapse(collapse_on_family, axis='observation')
print t_biom_collapsed
# Constructed from biom file #OTU ID Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae 0.666666666667 0.333333333333 1.0 1.33333333333 0.0 1.0
However, when we collapse with QIIME those are retained, which is the expected behavior.
!summarize_taxa.py -i test.biom -o summarize_taxa_out/
t_qiime_collapsed = load_table('summarize_taxa_out/test_L6.biom')
print t_qiime_collapsed
# Constructed from biom file #OTU ID Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae 0.285714285714 0.333333333333 0.75 0.666666666667 0.0 0.75 Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__OnlyOnce1 0.714285714286 0.333333333333 0.0 0.333333333333 1.0 0.25 Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__OnlyOnce2 0.0 0.333333333333 0.25 0.0 0.0 0.0
Confirming that with an alternative collapsing function we observe the same incorrect result. Here I define collapse_f
as in the BIOM docs here though note that there are some unrelated issues with that approach.
collapse_f = lambda id_, md: md['taxonomy'][5]
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation')
print alt_t_biom_collapsed
# Constructed from biom file #OTU ID Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 f__Lachnospiraceae 0.666666666667 0.333333333333 1.0 1.33333333333 0.0 1.0
The solution here is to pass min_group_size=1
. I think this should be the default (see here for further discussion).
collapse_f = lambda id_, md: md['taxonomy'][5]
alt_t_biom_collapsed = t.collapse(collapse_f, axis='observation', min_group_size=1)
print alt_t_biom_collapsed
# Constructed from biom file #OTU ID Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 f__OnlyOnce2 0.0 1.0 1.0 0.0 0.0 0.0 f__Lachnospiraceae 0.666666666667 0.333333333333 1.0 1.33333333333 0.0 1.0 f__OnlyOnce1 5.0 1.0 0.0 2.0 3.0 1.0
For the records:
!print_qiime_config.py
System information ================== Platform: darwin Python version: 2.7.1 (r271:86832, Aug 30 2012, 10:07:33) [GCC 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)] Python executable: /Users/caporaso/.virtualenvs/qiime/bin/python Dependency versions =================== NumPy version: 1.8.1 SciPy version: 0.14.0 matplotlib version: 1.3.1 biom-format version: 2.0.1 qcli version: 0.1.0 pyqi version: 0.3.2 scikit-bio version: 0.1.3 QIIME library version: 1.8.0-dev, master@0668ba7 QIIME script version: 1.8.0-dev PyNAST version (if installed): 1.2.2 Emperor version: 0.9.3 RDP Classifier version (if installed): rdp_classifier-2.2.jar Java version (if installed): 1.6.0_65 QIIME config values =================== blastmat_dir: /Applications/blast-2.2.22/data/ sc_queue: all.q template_alignment_lanemask_fp: /Users/caporaso/data/greengenes_core_sets/lanemask_in_1s_and_0s.txt pynast_template_alignment_fp: /Users/caporaso/data/greengenes_core_sets/core_set_aligned_imputed.fasta_11_8_07.no_dots seconds_to_sleep: 1 pynast_template_alignment_blastdb: None assign_taxonomy_reference_seqs_fp: /Users/caporaso/data/gg_13_8_otus/rep_set/97_otus.fasta torque_queue: friendlyq topiaryexplorer_project_dir: /Users/caporaso/code/TopiaryExplorer-0.9.1/ jobs_to_start: 4 denoiser_min_per_core: 50 cluster_jobs_fp: start_parallel_jobs.py assign_taxonomy_id_to_taxonomy_fp: /Users/caporaso/data/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt temp_dir: /Users/caporaso/temp blastall_fp: blastall