from __future__ import print_function
import dendropy
from dendropy import popgenstat
import os
from collections import OrderedDict
genes_species = OrderedDict()
my_species = ['RESTV', 'SUDV']
my_genes = ['NP', 'L', 'VP35', 'VP40']
for name in my_genes:
gene_name = name.split('.')[0]
char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta')
genes_species[gene_name] = {}
for species in my_species:
genes_species[gene_name][species] = dendropy.DnaCharacterMatrix()
for taxon, char_map in char_mat.items():
species = taxon.label.split('_')[0]
if species in my_species:
genes_species[gene_name][species].extend_map({taxon: char_map})
import numpy as np
import pandas as pd
summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species)))
stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta']
for row, (gene, species_data) in enumerate(genes_species.items()):
for col_base, species in enumerate(my_species):
summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species])
summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species])
summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species])
summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species])
columns = []
for species in my_species:
columns.extend(['%s (%s)' % (stat, species) for stat in stats])
df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns)
df # vs print(df)
seg_sites (RESTV) | nuc_div (RESTV) | taj_d (RESTV) | wat_theta (RESTV) | seg_sites (SUDV) | nuc_div (SUDV) | taj_d (SUDV) | wat_theta (SUDV) | |
---|---|---|---|---|---|---|---|---|
NP | 113 | 0.020659 | -0.482275 | 49.489051 | 118 | 0.029630 | 1.203522 | 56.64 |
L | 288 | 0.018143 | -0.295386 | 126.131387 | 282 | 0.024193 | 1.412350 | 135.36 |
VP35 | 42 | 0.017099 | -0.530330 | 18.394161 | 50 | 0.027761 | 1.069061 | 24.00 |
VP40 | 61 | 0.026155 | -0.188135 | 26.715328 | 41 | 0.023517 | 1.269160 | 19.68 |
def do_basic_popgen(seqs):
num_seg_sites = popgenstat.num_segregating_sites(seqs)
avg_pair = popgenstat.average_number_of_pairwise_differences(seqs)
nuc_div = popgenstat.nucleotide_diversity(seqs)
print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div))
print("Watterson's theta: %s" % popgenstat.wattersons_theta(seqs))
print("Tajima's D: %s" % popgenstat.tajimas_d(seqs))
ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta',
schema='fasta', data_type='dna')
sl_2014 = []
drc_2007 = []
ebov2007_set = dendropy.DnaCharacterMatrix()
ebov2014_set = dendropy.DnaCharacterMatrix()
for taxon, char_map in ebov_seqs.items():
print(taxon.label)
if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8:
sl_2014.append(char_map)
ebov2014_set.extend_map({taxon: char_map})
elif taxon.label.startswith('EBOV_2007'):
drc_2007.append(char_map)
ebov2007_set.extend_map({taxon: char_map})
del ebov_seqs
BDBV_KC545393 18728 bp BDBV_KC545395 18728 bp BDBV_KC545394 18728 bp BDBV_KC545396 18728 bp BDBV_FJ217161 18728 bp TAFV_FJ217162 18728 bp EBOV_2014_KM034549 18728 bp EBOV_2014_KM034550 18728 bp EBOV_2014_KM034554 18728 bp EBOV_2014_KM034555 18728 bp EBOV_2014_KM034556 18728 bp EBOV_2014_KM034557 18728 bp EBOV_2014_KM034560 18728 bp EBOV_2014_KM034551 18728 bp EBOV_2014_KM034552 18728 bp EBOV_2014_KM034553 18728 bp EBOV_2014_KM034558 18728 bp EBOV_2014_KM034559 18728 bp EBOV_2014_KM034561 18728 bp EBOV_2014_KM034562 18728 bp EBOV_2014_KM034563 18728 bp EBOV_1976_AF272001 18728 bp EBOV_1976_KC242801 18728 bp EBOV_1995_KC242796 18728 bp EBOV_1995_KC242799 18728 bp EBOV_2007_KC242784 18728 bp EBOV_2007_KC242785 18728 bp EBOV_2007_KC242787 18728 bp EBOV_2007_KC242786 18728 bp EBOV_2007_KC242789 18728 bp EBOV_2007_KC242788 18728 bp EBOV_2007_KC242790 18728 bp RESTV_AB050936 18728 bp RESTV_JX477166 18728 bp RESTV_FJ621585 18728 bp RESTV_JX477165 18728 bp RESTV_FJ621583 18728 bp RESTV_FJ621584 18728 bp SUDV_KC242783 18728 bp SUDV_FJ968794 18728 bp SUDV_EU338380 18728 bp SUDV_AY729654 18728 bp SUDV_JN638998 18728 bp SUDV_KC589025 18728 bp
print('2007 outbreak:')
print('Number of individuals: %s' % len(ebov2007_set.taxon_set))
do_basic_popgen(ebov2007_set)
print('\n2014 outbreak:')
print('Number of individuals: %s' % len(ebov2014_set.taxon_set))
do_basic_popgen(ebov2014_set)
2007 outbreak: Number of individuals: 7 Segregating sites: 25, Avg pairwise diffs: 7.71, Nucleotide diversity 0.000412 Watterson's theta: 10.2040816327 Tajima's D: -1.38311415748 2014 outbreak: Number of individuals: 8 Segregating sites: 6, Avg pairwise diffs: 2.79, Nucleotide diversity 0.000149 Watterson's theta: 2.31404958678 Tajima's D: 0.950120802758
print(len(sl_2014))
print(len(drc_2007))
8 7
pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)
print('Average number of pairwise differences irrespective of population: %.2f' %
pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %.2f' %
pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %.2f' %
pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences : %.2f' %
pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %d' %
pair_stats.num_segregating_sites)
print("Watterson's theta: %.2f" %
pair_stats.wattersons_theta)
print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi)
print("Tajima's D: %.2f" % pair_stats.tajimas_d)
Average number of pairwise differences irrespective of population: 284.46 Average number of pairwise differences between populations: 529.07 Average number of pairwise differences within populations: 10.50 Average number of net pairwise differences : 518.57 Number of segregating sites: 549 Watterson's theta: 168.84 Wakeley's Psi: 0.296 Tajima's D: 3.05