from genda.formats.panVCF import VCF
from genda.formats.Snp_array import SNP_array
from genda.formats.PED import PED
#Load the chr22 test VCF that ships with the test data
v = VCF('./tests/data/chr22.test.vcf')
#Preview VCF: first 10 variants, first 7 columns.
#.iloc replaces the positional use of .ix (deprecated, removed in pandas 1.0);
#the index holds rsID strings, so .ix was already slicing by position here.
v.vcf.iloc[0:10, 0:7]
| | #CHROM | POS | REF | ALT | QUAL | FILTER | FORMAT |
---|---|---|---|---|---|---|---|
rs149201999 | 22 | 16050408 | T | C | 100 | PASS | GT:DS:GL |
rs146752890 | 22 | 16050612 | C | G | 100 | PASS | GT:DS:GL |
rs139377059 | 22 | 16050678 | C | T | 100 | PASS | GT:DS:GL |
rs188945759 | 22 | 16050984 | C | G | 100 | PASS | GT:DS:GL |
rs6518357 | 22 | 16051107 | C | A | 100 | PASS | GT:DS:GL |
rs62224609 | 22 | 16051249 | T | C | 100 | PASS | GT:DS:GL |
rs62224610 | 22 | 16051347 | G | C | 100 | PASS | GT:DS:GL |
rs143503259 | 22 | 16051453 | A | C | 100 | PASS | GT:DS:GL |
rs192339082 | 22 | 16051477 | C | A | 100 | PASS | GT:DS:GL |
rs79725552 | 22 | 16051480 | T | C | 100 | PASS | GT:DS:GL |
#Preview genotype matrix: first 10 variants (rows) for the first 7 samples (columns).
#Positional slicing via .iloc; .ix was removed in pandas 1.0.
v.geno.iloc[0:10, 0:7]
| | HG00096 | HG00097 | HG00099 | HG00100 | HG00101 | HG00102 | HG00103 |
---|---|---|---|---|---|---|---|
rs149201999 | 0 | 1 | 1 | 0 | 1 | 1 | 0 |
rs146752890 | 1 | 1 | 1 | 0 | 1 | 0 | 0 |
rs139377059 | 0 | 1 | 1 | 0 | 1 | 1 | 0 |
rs188945759 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
rs6518357 | 0 | 1 | 1 | 0 | 1 | 1 | 0 |
rs62224609 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
rs62224610 | 0 | 1 | 1 | 1 | 1 | 1 | 0 |
rs143503259 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
rs192339082 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
rs79725552 | 0 | 1 | 1 | 0 | 1 | 1 | 0 |
#Load in data from a SNP array which is formatted with both alleles in one column (e.g. 23andMe data).
#`encoding` maps each rsID to its allele pair string.
s = SNP_array(
    './tests/data/one-column-test-data',
    fileformat='one column',
    delim='\t',
    encoding={
        'rs4477212': 'A/G',
        'rs3094315': 'A/G',
        'rs3131972': 'G/A',
        'rs12124819': 'A/G',
        'rs11240777': 'A/G',
        'rs6681049': 'C/T',
        'rs4970383': 'T/C',
        'rs4475691': 'T/C',
        'rs7537756': 'A/T',
    })
#Preview data from SNP array: up to the first 10 rows, all columns.
#Positional slicing via .iloc; .ix was removed in pandas 1.0.
s.df.iloc[0:10, :]
| | rsid | chromosome | position | genotype |
---|---|---|---|---|
rsid | ||||
rs4477212 | rs4477212 | 1 | 72017 | AA |
rs3094315 | rs3094315 | 1 | 742429 | AA |
rs3131972 | rs3131972 | 1 | 742584 | GG |
rs12124819 | rs12124819 | 1 | 766409 | AG |
rs11240777 | rs11240777 | 1 | 788822 | GG |
rs6681049 | rs6681049 | 1 | 789870 | CC |
rs4970383 | rs4970383 | 1 | 828418 | CC |
rs4475691 | rs4475691 | 1 | 836671 | CC |
rs7537756 | rs7537756 | 1 | 844113 | AA |
#Preview genotype data from SNP array: up to the first 10 rows, all columns.
#Positional slicing via .iloc; .ix was removed in pandas 1.0.
s.geno.iloc[0:10, :]
| | genotype |
---|---|
rsid | |
rs4477212 | 0 |
rs3094315 | 0 |
rs3131972 | 0 |
rs12124819 | 1 |
rs11240777 | 2 |
rs6681049 | 0 |
rs4970383 | 2 |
rs4475691 | 2 |
rs7537756 | 0 |
#Load in data from a SNP array with two columns representing the alleles of an individual.
#`encoding` maps each rsID to its allele pair string.
t = SNP_array(
    'tests/data/two-column-test-data',
    fileformat='two column',
    delim='\t',
    encoding={
        'rs4477212': 'A/G',
        'rs3094315': 'A/G',
        'rs3131972': 'G/A',
        'rs12124819': 'A/G',
        'rs11240777': 'A/G',
        'rs6681049': 'C/T',
        'rs4970383': 'T/C',
        'rs4475691': 'T/C',
        'rs7537756': 'A/T',
    })
#Preview data: up to the first 10 rows, first 4 columns.
#Positional slicing via .iloc; .ix was removed in pandas 1.0.
t.df.iloc[0:10, 0:4]
| | chromosome | Snp.ID | genetic.position | bp.position |
---|---|---|---|---|
Snp.ID | ||||
rs4477212 | 1 | rs4477212 | 0 | 72017 |
rs3094315 | 1 | rs3094315 | 0 | 742429 |
rs3131972 | 1 | rs3131972 | 0 | 742584 |
rs12124819 | 1 | rs12124819 | 0 | 766409 |
rs11240777 | 1 | rs11240777 | 0 | 788822 |
rs6681049 | 1 | rs6681049 | 0 | 789870 |
rs4970383 | 1 | rs4970383 | 0 | 828418 |
rs4475691 | 1 | rs4475691 | 0 | 836671 |
rs7537756 | 1 | rs7537756 | 0 | 844113 |
#Preview genotype matrix: up to the first 10 rows and up to the first 7 columns.
#Positional slicing via .iloc (.ix was removed in pandas 1.0); slice bounds
#past the end of an axis are clamped, so narrower frames are returned whole.
t.geno.iloc[0:10, 0:7]
| | person1_alle | person2_alle |
---|---|---|
Snp.ID | ||
rs4477212 | 0 | 0 |
rs3094315 | 0 | 1 |
rs3131972 | 0 | 1 |
rs12124819 | 1 | 1 |
rs11240777 | 2 | 2 |
rs6681049 | 0 | 1 |
rs4970383 | 2 | 2 |
rs4475691 | 2 | 0 |
rs7537756 | 0 | 1 |
#Load a PED file together with its MAP file; the third argument is a dict
#mapping each SNP name to its allele pair string.
p = PED(
    'tests/data/test.ped',
    'tests/data/test.map',
    {
        'snp1': 'A/C',
        'snp2': 'A/C',
        'snp3': 'C/A',
        'snp4': 'T/G',
        'snp5': 'C/A',
    })
#Show the parsed PED table
p.PED
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | sample1 | 0 | 0 | 1 | 1 | A | A | A | A | A | A | T | G | A | A |
1 | 2 | sample2 | 0 | 0 | 1 | 1 | A | C | A | C | A | C | T | G | A | C |
2 | 3 | sample3 | 0 | 0 | 2 | 1 | A | A | A | A | A | A | G | G | A | A |
3 | 4 | sample4 | 0 | 0 | 2 | 1 | A | 0 | A | C | A | C | G | G | A | C |
#See the parsed MAP file (loaded from the second path passed to PED)
p.MAP
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 1 | snp1 | 0 | 1000 |
1 | X | snp2 | 0 | 1000 |
2 | Y | snp3 | 0 | 1000 |
3 | XY | snp4 | 0 | 1000 |
4 | MT | snp5 | 0 | 1000 |
#Viewing the genotype matrix
p.geno
1 | sample1 | sample2 | sample3 | sample4 |
---|---|---|---|---|
1 | ||||
snp1 | 0 | 1 | 0 | NaN |
snp2 | 0 | 1 | 0 | 1 |
snp3 | 2 | 1 | 2 | 1 |
snp4 | 1 | 1 | 2 | 2 |
snp5 | 2 | 1 | 2 | 1 |