import csv
def load_vcf(filename):
fp = open(filename, 'rb')
r = csv.reader(fp, delimiter='\t')
snp_calls = []
for n, row in enumerate(r):
if row[0].startswith('#'):
continue
chr, pos, _, ref, alt = row[:5]
pos = int(pos)
snp_calls.append((chr, pos, ref, alt))
return snp_calls
cd /mnt
/mnt
ls *.vcf
SRR098038.vcf SRR098042.vcf
original = load_vcf('SRR098042.vcf')
evolved = load_vcf('SRR098038.vcf')
# print out all of the original strain mutations
for k in original:
print k
('rel606', 9972, 'T', 'G') ('rel606', 10563, 'G', 'A') ('rel606', 20488, 'G', 'A') ('rel606', 81158, 'A', 'C') ('rel606', 216480, 'C', 'T') ('rel606', 247796, 'T', 'C') ('rel606', 281923, 'G', 'T') ('rel606', 356372, 'T', 'G') ('rel606', 398061, 'G', 'C') ('rel606', 433359, 'CTTTTTTT', 'CTTTTTTTT') ('rel606', 473901, 'CCGC', 'CCGCGC') ('rel606', 553093, 'C', 'T') ('rel606', 590473, 'C', 'T') ('rel606', 648692, 'C', 'T') ('rel606', 734488, 'C', 'T') ('rel606', 943475, 'C', 'T') ('rel606', 965841, 'CGG', 'CGGG') ('rel606', 1095414, 'G', 'T') ('rel606', 1237485, 'T', 'C') ('rel606', 1331794, 'C', 'A') ('rel606', 1607915, 'ATT', 'AT') ('rel606', 1733343, 'G', 'A') ('rel606', 1895617, 'T', 'C') ('rel606', 1934133, 'CAA', 'CAAA') ('rel606', 1955859, 'ACC', 'ACCC') ('rel606', 2143829, 'ACCC', 'ACCCCC') ('rel606', 2333538, 'AT', 'ATT') ('rel606', 2389878, 'G', 'T') ('rel606', 2446984, 'A', 'C') ('rel606', 2611312, 'CT', 'C') ('rel606', 2665639, 'A', 'T') ('rel606', 2774433, 'G', 'T') ('rel606', 2843036, 'G', 'A') ('rel606', 2999330, 'G', 'A') ('rel606', 3260570, 'CTTT', 'CTT') ('rel606', 3273818, 'G', 'T') ('rel606', 3285613, 'C', 'A') ('rel606', 3288025, 'T', 'A') ('rel606', 3339313, 'A', 'C') ('rel606', 3401754, 'C', 'A') ('rel606', 3463900, 'C', 'T') ('rel606', 3481820, 'A', 'G') ('rel606', 3486812, 'T', 'C') ('rel606', 3488669, 'A', 'C') ('rel606', 3893548, 'CCA', 'C') ('rel606', 3909807, 'G', 'T') ('rel606', 3911965, 'ATTTTTTT', 'ATTTTTT') ('rel606', 4100183, 'A', 'G') ('rel606', 4107018, 'T', 'A') ('rel606', 4120979, 'A', 'C') ('rel606', 4145568, 'A', 'C') ('rel606', 4180200, 'T', 'G') ('rel606', 4201958, 'A', 'C') ('rel606', 4363338, 'C', 'A') ('rel606', 4431393, 'TGG', 'T') ('rel606', 4433347, 'A', 'G') ('rel606', 4616538, 'A', 'C')
# print out all of the evolved mutations
for k in evolved:
print k
('rel606', 1612, 'A', 'G') ('rel606', 9972, 'T', 'G') ('rel606', 10563, 'G', 'A') ('rel606', 29810, 'C', 'T') ('rel606', 62118, 'A', 'G') ('rel606', 80113, 'A', 'G') ('rel606', 81158, 'A', 'C') ('rel606', 86487, 'C', 'T') ('rel606', 105581, 'G', 'A') ('rel606', 216480, 'C', 'T') ('rel606', 235408, 'T', 'C') ('rel606', 247796, 'T', 'C') ('rel606', 281923, 'G', 'T') ('rel606', 295630, 'C', 'T') ('rel606', 357611, 'T', 'C') ('rel606', 360523, 'A', 'G') ('rel606', 369189, 'AGGGGG', 'AGGGG') ('rel606', 398061, 'G', 'C') ('rel606', 426451, 'A', 'G') ('rel606', 433359, 'CTTTTTTT', 'CTTTTTTTT') ('rel606', 447290, 'T', 'C') ('rel606', 451177, 'CAAAAAAA', 'CAAAAAAAA') ('rel606', 462346, 'C', 'T') ('rel606', 473901, 'CCGC', 'CCGCGC') ('rel606', 479549, 'C', 'T') ('rel606', 522180, 'A', 'G') ('rel606', 537562, 'C', 'T') ('rel606', 550478, 'AGGGGGGG', 'AGGGGGGGG') ('rel606', 553093, 'C', 'T') ('rel606', 581859, 'CTT', 'CTTT') ('rel606', 585419, 'C', 'G') ('rel606', 587027, 'A', 'T') ('rel606', 590472, 'C', 'CG') ('rel606', 592510, 'T', 'C') ('rel606', 594287, 'A', 'G') ('rel606', 609346, 'C', 'T') ('rel606', 609890, 'AGG', 'AG') ('rel606', 613169, 'C', 'T') ('rel606', 618015, 'A', 'G') ('rel606', 623182, 'G', 'A') ('rel606', 627610, 'G', 'A') ('rel606', 636294, 'AGGGGG', 'AGGGGGG') ('rel606', 648692, 'C', 'T') ('rel606', 664684, 'G', 'A') ('rel606', 664685, 'T', 'G') ('rel606', 670500, 'C', 'T') ('rel606', 679395, 'C', 'T') ('rel606', 692880, 'A', 'G') ('rel606', 723449, 'T', 'C') ('rel606', 731936, 'A', 'G') ('rel606', 734488, 'C', 'T') ('rel606', 734890, 'C', 'T') ('rel606', 767233, 'C', 'T') ('rel606', 767369, 'C', 'T') ('rel606', 771216, 'A', 'G') ('rel606', 804446, 'C', 'T') ('rel606', 817276, 'C', 'T') ('rel606', 819195, 'A', 'G') ('rel606', 828156, 'C', 'T') ('rel606', 894198, 'C', 'T') ('rel606', 904606, 'C', 'T') ('rel606', 943475, 'C', 'T') ('rel606', 960078, 'C', 'T') ('rel606', 985505, 'G', 'C') ('rel606', 1001295, 'A', 'G') ('rel606', 1012012, 'A', 'G') ('rel606', 1012984, 'C', 'T') ('rel606', 1048113, 'A', 'G') ('rel606', 1082646, 'A', 'G') ('rel606', 1084304, 'C', 'G') ('rel606', 1085185, 'C', 'T') ('rel606', 1113198, 'A', 'G') ('rel606', 1115003, 'C', 'G') ('rel606', 1115970, 'G', 'A') ('rel606', 1148998, 'A', 'G') ('rel606', 1166055, 'TGGGGGG', 'TGGGGGGG') ('rel606', 1180473, 'C', 'T') ('rel606', 1186699, 'C', 'T') ('rel606', 1256283, 'TCCCCC', 'TCCCCCC') ('rel606', 1287211, 'C', 'T') ('rel606', 1305575, 'C', 'T') ('rel606', 1331794, 'C', 'A') ('rel606', 1333191, 'C', 'T') ('rel606', 1339424, 'A', 'G') ('rel606', 1349558, 'AGGGGGGG', 'AGGGGGG') ('rel606', 1357824, 'C', 'T') ('rel606', 1364103, 'C', 'T') ('rel606', 1407377, 'C', 'T') ('rel606', 1428828, 'CTTTTTTTTT', 'CTTTTTTTT') ('rel606', 1439576, 'AGGGGG', 'AGGGGGG') ('rel606', 1465314, 'G', 'A') ('rel606', 1468305, 'T', 'C') ('rel606', 1470490, 'ATTTTTTTT', 'ATTTTTTT') ('rel606', 1484250, 'C', 'T') ('rel606', 1500053, 'A', 'G') ('rel606', 1511178, 'C', 'T') ('rel606', 1522006, 'C', 'T') ('rel606', 1533135, 'A', 'G') ('rel606', 1557691, 'A', 'G') ('rel606', 1583771, 'T', 'C') ('rel606', 1587505, 'C', 'T') ('rel606', 1591928, 'G', 'A') ('rel606', 1607915, 'ATT', 'AT') ('rel606', 1660918, 'A', 'C') ('rel606', 1665289, 'T', 'C') ('rel606', 1668292, 'T', 'C') ('rel606', 1700108, 'G', 'A') ('rel606', 1718342, 'T', 'C') ('rel606', 1733343, 'G', 'A') ('rel606', 1746124, 'G', 'A') ('rel606', 1750149, 'C', 'T') ('rel606', 1776431, 'T', 'TA') ('rel606', 1795808, 'GCCCCCCC', 'GCCCCCC') ('rel606', 1902731, 'A', 'G') ('rel606', 1918819, 'G', 'A') ('rel606', 1924396, 'G', 'T') ('rel606', 1945203, 'T', 'C') ('rel606', 1965157, 'T', 'C') ('rel606', 2007865, 'T', 'C') ('rel606', 2035094, 'C', 'A') ('rel606', 2035096, 'A', 'C') ('rel606', 2083398, 'CGG', 'CGGG') ('rel606', 2121590, 'T', 'C') ('rel606', 2162780, 'T', 'C') ('rel606', 2166182, 'G', 'A') ('rel606', 2167011, 'T', 'C') ('rel606', 2190388, 'TCCCCC', 'TCCCCCC') ('rel606', 2192961, 'G', 'T') ('rel606', 2198299, 'A', 'G') ('rel606', 2200070, 'G', 'A') ('rel606', 2216447, 'C', 'T') ('rel606', 2244525, 'G', 'A') ('rel606', 2258836, 'G', 'A') ('rel606', 2271283, 'A', 'G') ('rel606', 2277269, 'C', 'T') ('rel606', 2283856, 'A', 'G') ('rel606', 2293568, 'G', 'A') ('rel606', 2319260, 'G', 'A') ('rel606', 2333538, 'AT', 'ATT') ('rel606', 2359353, 'A', 'G') ('rel606', 2363952, 'G', 'A') ('rel606', 2374708, 'G', 'A') ('rel606', 2389878, 'G', 'T') ('rel606', 2393070, 'A', 'G') ('rel606', 2446984, 'A', 'C') ('rel606', 2483043, 'G', 'A') ('rel606', 2532744, 'T', 'C') ('rel606', 2570083, 'G', 'A') ('rel606', 2602849, 'TCCCCCCC', 'TCCCCCCCC') ('rel606', 2611312, 'CT', 'C') ('rel606', 2611313, 'TG', 'T') ('rel606', 2616295, 'AGGGGGGG', 'AGGGGGGGG,AGGGGGGGGG') ('rel606', 2639449, 'G', 'A') ('rel606', 2659827, 'T', 'C') ('rel606', 2665639, 'A', 'T') ('rel606', 2697224, 'T', 'C') ('rel606', 2724782, 'T', 'C') ('rel606', 2753451, 'G', 'A') ('rel606', 2753768, 'C', 'T') ('rel606', 2771282, 'G', 'A') ('rel606', 2780858, 'A', 'G') ('rel606', 2789588, 'A', 'G') ('rel606', 2843036, 'G', 'A') ('rel606', 2867864, 'T', 'C') ('rel606', 2881538, 'T', 'C') ('rel606', 2935321, 'T', 'C') ('rel606', 2952353, 'A', 'G') ('rel606', 2977018, 'G', 'A') ('rel606', 2999330, 'G', 'A') ('rel606', 3003744, 'C', 'T') ('rel606', 3017476, 'T', 'G') ('rel606', 3027417, 'A', 'G') ('rel606', 3037295, 'A', 'G') ('rel606', 3052305, 'AGCGCGCGCGCG', 'AGCGCGCGCG') ('rel606', 3052716, 'GCCCCCC', 'GCCCCCCC') ('rel606', 3054123, 'A', 'G') ('rel606', 3061923, 'T', 'C') ('rel606', 3072845, 'T', 'C') ('rel606', 3078788, 'C', 'T') ('rel606', 3088370, 'TAAAAAAAAA', 'TAAAAAAAA') ('rel606', 3105821, 'G', 'A') ('rel606', 3111293, 'G', 'A') ('rel606', 3114506, 'CGG', 'CGGG') ('rel606', 3117097, 'G', 'A') ('rel606', 3117473, 'T', 'A') ('rel606', 3127663, 'C', 'T') ('rel606', 3130508, 'G', 'A') ('rel606', 3135301, 'T', 'C') ('rel606', 3153996, 'T', 'C') ('rel606', 3178614, 'T', 'C') ('rel606', 3228444, 'G', 'A') ('rel606', 3240003, 'T', 'C') ('rel606', 3248428, 'CAAAAAA', 'CAAAAA') ('rel606', 3259432, 'A', 'G') ('rel606', 3260570, 'CTTT', 'CTT') ('rel606', 3268930, 'G', 'A') ('rel606', 3288025, 'T', 'A') ('rel606', 3322942, 'T', 'C') ('rel606', 3339313, 'A', 'C') ('rel606', 3342590, 'T', 'C') ('rel606', 3345160, 'A', 'G') ('rel606', 3351577, 'A', 'G') ('rel606', 3360893, 'GCCCCCC', 'GCCCCCCC,GCCCCCCCC') ('rel606', 3401754, 'C', 'A') ('rel606', 3463900, 'C', 'T') ('rel606', 3479105, 'T', 'C') ('rel606', 3481820, 'A', 'G') ('rel606', 3486895, 'G', 'T') ('rel606', 3488669, 'A', 'C') ('rel606', 3499794, 'A', 'G') ('rel606', 3536603, 'G', 'A') ('rel606', 3546377, 'T', 'C') ('rel606', 3547183, 'A', 'G') ('rel606', 3566527, 'G', 'A') ('rel606', 3570439, 'T', 'C') ('rel606', 3570560, 'GTTTTTTTT', 'GTTTTTTT') ('rel606', 3600378, 'C', 'T') ('rel606', 3605711, 'T', 'C') ('rel606', 3607788, 'T', 'C') ('rel606', 3612959, 'C', 'T') ('rel606', 3625450, 'GCCCCCCC', 'GCCCCCC') ('rel606', 3635762, 'T', 'C') ('rel606', 3652530, 'TACA', 'TACACA') ('rel606', 3740571, 'A', 'G') ('rel606', 3758443, 'C', 'T') ('rel606', 3822371, 'G', 'A') ('rel606', 3822735, 'G', 'T') ('rel606', 3837268, 'G', 'A') ('rel606', 3844068, 'T', 'C') ('rel606', 3862134, 'G', 'A') ('rel606', 3864940, 'G', 'A') ('rel606', 3867544, 'A', 'G') ('rel606', 3869813, 'G', 'A') ('rel606', 3872302, 'G', 'A') ('rel606', 3874430, 'G', 'A') ('rel606', 3874844, 'G', 'A') ('rel606', 3885824, 'T', 'C') ('rel606', 3892335, 'C', 'T') ('rel606', 3893548, 'CCA', 'C') ('rel606', 3909807, 'G', 'T') ('rel606', 3911965, 'ATTTTTTT', 'ATTTTTT') ('rel606', 3920984, 'C', 'T') ('rel606', 3950544, 'A', 'G') ('rel606', 4046586, 'T', 'A') ('rel606', 4100183, 'A', 'G') ('rel606', 4107018, 'T', 'A') ('rel606', 4111636, 'T', 'C') ('rel606', 4154522, 'A', 'G') ('rel606', 4163775, 'C', 'T') ('rel606', 4180200, 'T', 'G') ('rel606', 4196682, 'C', 'A') ('rel606', 4201958, 'A', 'C') ('rel606', 4209289, 'T', 'C') ('rel606', 4212600, 'G', 'A') ('rel606', 4313510, 'C', 'T') ('rel606', 4325444, 'T', 'A') ('rel606', 4347725, 'C', 'T') ('rel606', 4349281, 'ATTTTTTTT', 'ATTTTTTT') ('rel606', 4363338, 'C', 'A') ('rel606', 4380609, 'C', 'T') ('rel606', 4382294, 'C', 'T') ('rel606', 4431393, 'TGG', 'T') ('rel606', 4433347, 'A', 'G') ('rel606', 4472216, 'C', 'T') ('rel606', 4491013, 'A', 'G') ('rel606', 4522700, 'C', 'T') ('rel606', 4533672, 'A', 'G') ('rel606', 4560894, 'C', 'A') ('rel606', 4583729, 'T', 'C') ('rel606', 4588672, 'A', 'G') ('rel606', 4616538, 'A', 'C') ('rel606', 4629225, 'C', 'T')
You might expect the ancestor ('original') to have exactly the same variants as in 'evolved'. But while it shares many, some are not; see below.
for k in original:
if k not in evolved:
print k, 'is only in original'
('rel606', 20488, 'G', 'A') is only in original ('rel606', 356372, 'T', 'G') is only in original ('rel606', 590473, 'C', 'T') is only in original ('rel606', 965841, 'CGG', 'CGGG') is only in original ('rel606', 1095414, 'G', 'T') is only in original ('rel606', 1237485, 'T', 'C') is only in original ('rel606', 1895617, 'T', 'C') is only in original ('rel606', 1934133, 'CAA', 'CAAA') is only in original ('rel606', 1955859, 'ACC', 'ACCC') is only in original ('rel606', 2143829, 'ACCC', 'ACCCCC') is only in original ('rel606', 2774433, 'G', 'T') is only in original ('rel606', 3273818, 'G', 'T') is only in original ('rel606', 3285613, 'C', 'A') is only in original ('rel606', 3486812, 'T', 'C') is only in original ('rel606', 4120979, 'A', 'C') is only in original ('rel606', 4145568, 'A', 'C') is only in original
If you go look at these in tview, some of them are low coverage, and others are probably mismappings due to repetitive sequence (anywhere where you see multiple different variants, it's probably due to mismappings, since these are clonal populations).
mutS_start = 2751953
mutS_end = 2754514
for (name, pos, a, b) in original:
if pos >= mutS_start and pos < mutS_end:
print 'in mutS in original:', name, pos, a, b
for (name, pos, a, b) in evolved:
if pos >= mutS_start and pos < mutS_end:
print 'in mutS in evolved:', name, pos, a, b
in mutS in evolved: rel606 2753451 G A in mutS in evolved: rel606 2753768 C T
yep!