def shear(tree, names):
    """Lop off tips until the tree just has the desired tip names"""

    tcopy = tree.deepcopy()
    all_tips = set([n.Name for n in tcopy.tips()])
    ids = set(names)

    if not ids.issubset(all_tips):
        raise ValueError, "ids are not a subset of the tree!"
    
    while len(tcopy.tips()) != len(ids):
        for n in tcopy.tips():
            if n.Name not in ids:
                n.Parent.removeNode(n)
        tcopy.prune()
    
    return tcopy

def RFD(tree1, tree2, proportion = False):
    """Calculates the Robinson and Foulds symmetric difference

    Implementation based off of code by Julia Goodrich
    """
    t1names = tree1.getTipNames()
    t2names = tree2.getTipNames()
    if set(t1names) != set(t2names):
        if len(t1names) < len(t2names):
            tree2 = shear(tree2, t1names)
        else:
            tree1 = shear(tree1, t2names)
    
    tree1_sets = tree1.subsets()
    tree2_sets = tree2.subsets()
 
    not_in_both = tree1_sets ^ tree2_sets

    total_subsets = len(tree1_sets) + len(tree2_sets)
    dist = len(not_in_both)

    if proportion:
        dist = dist/float(total_subsets)
    return dist

base_region_boundaries = [
      ('v2', 136, 1868), #27f-338r
      ('v2.v3', 136, 2232),
      ('v2.v4', 136, 4051),
      ('v2.v6', 136, 4932),
      ('v2.v8', 136, 6426),
      ('v2.v9', 136, 6791),
      ('v3', 1916, 2232), #349f-534r
      ('v3.v4', 1916, 4051),
      ('v3.v6', 1916, 4932),
      ('v3.v8', 1916, 6426),
      ('v3.v9', 1916, 6791),
      ('v4', 2263, 4051), #515f-806r
      ('v4.v6', 2263, 4932),
      ('v4.v8', 2263, 6426),
      ('v4.v9', 2263, 6791),
      ('v6', 4653, 4932), #967f-1048r
      ('v6.v8', 4653, 6426),
      ('v6.v9', 4653, 6791),
      ('v9', 6450, 6791), #1391f-1492r
      ('full.length', 0, 7682),  # Start 150, 250, 400 base pair reads
      ('v2.150', 136, 702),
      ('v2.250', 136, 1752),
      ('v2.v3.400', 136, 2036), # Skips reads that are larger than amplicon size
      ('v3.v4.150', 1916, 2235),
      ('v3.v4.250', 1916, 2493),
      ('v3.v4.400', 1916, 4014),
      ('v4.150', 2263, 3794),
      ('v4.250', 2263, 4046),
      ('v4.v6.400', 2263, 4574),
      ('v6.v8.150', 4653, 5085),
      ('v6.v8.250', 4653, 5903),
      ('v6.v8.400', 4653, 6419)
]
percentages = range(76,98,3)

labels = []
trees = []
from os.path import exists
for p in percentages:
    for rb in base_region_boundaries:
        tree_path = '/home/ubuntu/data/gg_%s_otus_4feb2011_aligned_%s_pfiltered.tre' % (p,rb[0])
        if exists(tree_path):
            labels.append("%i.%s" % (p,rb[0]))
            trees.append(tree_path)
print len(trees)

from IPython import parallel
rc = parallel.Client(packer='pickle')
view = rc.load_balanced_view()
print "We have %i engines available" % len(rc.ids)
# skip cached results (for faster debugging)
rc[:]['skip_cache'] = True

import socket
ar = rc[:].apply_async(socket.gethostname)
mapping = ar.get_dict()

rev_map = {}

per_node = 4
for eid, node in mapping.iteritems():
    if node not in rev_map:
        rev_map[node] = []
    if len(rev_map[node]) < per_node:
        rev_map[node].append(eid)

from IPython.utils.data import flatten
targets = flatten(rev_map.values())
len(targets)

dv = rc[targets]
dv.push(dict(shear=shear, RFD=RFD, trees=trees), block=False)
load_ar = dv.execute(
"""
from cogent.parse.tree import DndParser
ts = [DndParser(open(tree_fp)) for tree_fp in trees]
""", block=False)
load_ar

def compute_row(i, ts):
    """function to compute a whole row with RFD"""
    import numpy
    row = []
    t1 = ts[i]
    for t2 in ts:
        row.append(RFD(t1, t2, True))
    return numpy.array(row)

ntasks = len(trees)
tree_ref = parallel.Reference('ts')
rf_view = rc.load_balanced_view(targets=targets)
row_ars = [ rf_view.apply_async(compute_row, i, tree_ref) for i in range(ntasks) ]

import time
ready = [ ar.ready() for ar in row_ars ]
while not all(ready):
    ready = [ ar.ready() for ar in row_ars ]
    sys.stdout.write('\r%3i / %3i' % (sum(ready), len(ready)))
    sys.stdout.flush()
    time.sleep(5)

rfrows = [ ar.get() for ar in row_ars ]

rf_dist_mat = numpy.array(rfrows)


from qiime.format import format_distance_matrix
rf_parallel_fp = '/home/ubuntu/data/rf_distance_matrix.txt'
rf_parallel_f = open(rf_parallel_fp,'w')
rf_parallel_f.write(format_distance_matrix(labels,rf_dist_mat))
rf_parallel_f.close()

We now have a Robinson-Foulds distance matrix which we can compare against our original Pearson distance matrix to confirm that our conclusions are robust to the choice of distance metric.

We'll begin by comparing these with the Mantel test.
!compare_distance_matrices.py -i /home/ubuntu/data/distance_matrix_complete.txt,/home/ubuntu/data/rf_distance_matrix.txt -o /home/ubuntu/data/pearson_v_rf --method mantel -n 1000
!cat /home/ubuntu/data/pearson_v_rf/mantel_results.txt
We see that the Mantel test gives a significant p-value (p = 0.001), telling us that the Pearson distances are significantly correlated with the Robinson-Foulds distances. 

As we based our original conclusions on our interpretation of a Principal Coordinates (PCoA) plot, it's also useful to perform a Procrustes analysis to tell us whether we would be likely to derive the same conclusion from the PCoA plot generated from Robinson-Foulds distances, based on low sum-of-squares distances between the paired points in the two principal coordinate plots.
!principal_coordinates.py -i /home/ubuntu/data/rf_distance_matrix.txt -o /home/ubuntu/data/rf_pc.txt

!transform_coordinate_matrices.py -i /home/ubuntu/data/rf_pc.txt,/home/ubuntu/data/pc.txt -o /home/ubuntu/data/pearson_v_rf/ -r 1000

!cat /home/ubuntu/data/pearson_v_rf/rf_pc_pc_procrustes_results.txt

!compare_3d_plots.py -i /home/ubuntu/data/pearson_v_rf/pc1_transformed.txt,/home/ubuntu/data/pearson_v_rf/pc2_transformed.txt -o /home/ubuntu/data/pearson_v_rf/plots/ -m tree_metadata.txt