from __future__ import print_function, division
PyPlink
¶PyPlink
is a Python module to read and write binary Plink files. Here are small examples for PyPlink
.
from pyplink import PyPlink
The Plink
softwares provides a testing dataset on the resources page. It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18.
import zipfile
try:
from urllib.request import urlretrieve
except ImportError:
from urllib import urlretrieve
# Downloading the demo data from Plink webset
urlretrieve(
"http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip",
"hapmap_r23a.zip",
)
# Extracting the archive content
with zipfile.ZipFile("hapmap_r23a.zip", "r") as z:
z.extractall(".")
To read a binary file, PyPlink
only requires the prefix of the files.
pedfile = PyPlink("hapmap_r23a")
print("{:,d} samples and {:,d} markers".format(
pedfile.get_nb_samples(),
pedfile.get_nb_markers(),
))
270 samples and 4,098,136 markers
all_samples = pedfile.get_fam()
all_samples.head()
fid | iid | father | mother | gender | status | |
---|---|---|---|---|---|---|
0 | NA18524 | NA18524 | 0 | 0 | 1 | -9 |
1 | NA18526 | NA18526 | 0 | 0 | 2 | -9 |
2 | NA18529 | NA18529 | 0 | 0 | 2 | -9 |
3 | NA18532 | NA18532 | 0 | 0 | 2 | -9 |
4 | NA18537 | NA18537 | 0 | 0 | 2 | -9 |
all_markers = pedfile.get_bim()
all_markers.head()
chrom | pos | cm | a1 | a2 | |
---|---|---|---|---|---|
snp | |||||
rs10399749 | 1 | 45162 | 0 | 0 | C |
rs2949420 | 1 | 45257 | 0 | 0 | T |
rs4030303 | 1 | 72434 | 0 | A | G |
rs4030300 | 1 | 72515 | 0 | 0 | C |
rs3855952 | 1 | 77689 | 0 | 0 | A |
for marker_id, genotypes in pedfile:
print(marker_id)
print(genotypes)
break
rs10399749 [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
for marker_id, genotypes in pedfile.iter_geno():
print(marker_id)
print(genotypes)
break
rs10399749 [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
for marker_id, genotypes in pedfile.iter_acgt_geno():
print(marker_id)
print(genotypes)
break
rs10399749 ['CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC']
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
rs7092431 [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] rs9943770 [ 0 0 0 2 0 2 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 -1 0 0 0 1 2 1 1 0 1 1 2 0 0 0 1 0 0 0 0 1 1 0 2 1 1 1 0 1 1 -1 1 0 0 1 0 0 2 0 1 2 0 1 1 1 2 0 1 0 0 0 0 0 0 0 2 1 2 1 1 2 1 1 1 1 0 1 1 2 1 0 -1 0 1 1 1 0 1 0 1 -1 2 1 0 0 0 1 1 1 2 0 0 1 1] rs1587483 [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
rs7092431 ['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'] rs9943770 ['AA' 'AA' 'AA' 'GG' 'AA' 'GG' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' '00' 'AA' 'AA' 'AA' 'GA' 'GG' 'GA' 'GA' 'AA' 'GA' 'GA' 'GG' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'GG' 'GA' 'GA' 'GA' 'AA' 'GA' 'GA' '00' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GG' 'AA' 'GA' 'GG' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GG' 'GA' 'GG' 'GA' 'GA' 'GG' 'GA' 'GA' 'GA' 'GA' 'AA' 'GA' 'GA' 'GG' 'GA' 'AA' '00' 'AA' 'GA' 'GA' 'GA' 'AA' 'GA' 'AA' 'GA' '00' 'GG' 'GA' 'AA' 'AA' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'AA' 'GA' 'GA'] rs1587483 ['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'CG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']
pedfile.get_geno_marker("rs7619974")
array([ 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int8)
pedfile.get_acgt_geno_marker("rs7619974")
array(['CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', '00', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00'], dtype='<U2')
# Getting the Y markers
y_markers = all_markers[all_markers.chrom == 24].index.values
# Getting the males
males = all_samples.gender == 1
# Cycling through the Y markers
for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):
male_genotypes = genotypes[males.values]
print("{:,d} total genotypes".format(len(genotypes)))
print("{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})".format(
len(male_genotypes),
males.sum(),
marker_id,
all_markers.loc[marker_id, "chrom"],
all_markers.loc[marker_id, "pos"],
))
break
270 total genotypes 142 genotypes for 142 males (rs1140798 on chr24 and position 169,542)
To count the minor allele frequency of a subset of markers (only for founders).
# Getting the founders
founders = (all_samples.father == "0") & (all_samples.mother == "0")
# Computing the MAF
markers = ["rs7619974", "rs2949048", "rs16941434"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
valid_genotypes = genotypes[founders.values & (genotypes != -1)]
maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)
print(marker_id, round(maf, 6), sep="\t")
rs7619974 0.0 rs2949048 0.02381 rs16941434 0.357143
The following examples shows how to write a binary file using the PyPlink
module. The SNP-major format is the default. It means that the binary file is written one marker at a time.
Note that
PyPlink
only writes theBED
file. The user is required to create theFAM
andBIM
files.
# The genotypes for 3 markers and 10 samples
all_genotypes = [
[0, 0, 0, 1, 0, 0, -1, 2, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 1, 2, 0],
[0, 0, 0, 0, 1, 1, 0, 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output", "w") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
# Checking the content of the newly created binary files
pedfile = PyPlink("test_output")
pedfile.get_fam()
fid | iid | father | mother | gender | status | |
---|---|---|---|---|---|---|
0 | family_1 | sample_1 | 0 | 0 | 0 | -9 |
1 | family_2 | sample_2 | 0 | 0 | 0 | -9 |
2 | family_3 | sample_3 | 0 | 0 | 0 | -9 |
3 | family_4 | sample_4 | 0 | 0 | 0 | -9 |
4 | family_5 | sample_5 | 0 | 0 | 0 | -9 |
5 | family_6 | sample_6 | 0 | 0 | 0 | -9 |
6 | family_7 | sample_7 | 0 | 0 | 0 | -9 |
7 | family_8 | sample_8 | 0 | 0 | 0 | -9 |
8 | family_9 | sample_9 | 0 | 0 | 0 | -9 |
9 | family_10 | sample_10 | 0 | 0 | 0 | -9 |
pedfile.get_bim()
chrom | pos | cm | a1 | a2 | |
---|---|---|---|---|---|
snp | |||||
marker_1 | 1 | 1 | 0 | A | T |
marker_2 | 1 | 2 | 0 | A | T |
marker_3 | 1 | 3 | 0 | A | T |
for marker, genotypes in pedfile:
print(marker, genotypes)
marker_1 [ 0 0 0 1 0 0 -1 2 1 0] marker_2 [0 0 1 1 0 0 0 1 2 0] marker_3 [0 0 0 0 1 1 0 0 0 1]
The newly created binary files are compatible with Plink.
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output", "--freq"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
@----------------------------------------------------------@ | PLINK! | v1.07 | 10/Aug/2009 | |----------------------------------------------------------| | (C) 2009 Shaun Purcell, GNU General Public License, v2 | |----------------------------------------------------------| | For documentation, citation & bug-report instructions: | | http://pngu.mgh.harvard.edu/purcell/plink/ | @----------------------------------------------------------@ Skipping web check... [ --noweb ] Writing this text to log file [ plink.log ] Analysis started: Fri Jan 20 09:20:55 2017 Options in effect: --noweb --bfile test_output --freq Reading map (extended format) from [ test_output.bim ] 3 markers to be included from [ test_output.bim ] Reading pedigree information from [ test_output.fam ] 10 individuals read from [ test_output.fam ] 0 individuals with nonmissing phenotypes Assuming a disease phenotype (1=unaff, 2=aff, 0=miss) Missing phenotype value is also -9 0 cases, 0 controls and 10 missing 0 males, 0 females, and 10 of unspecified sex Warning, found 10 individuals with ambiguous sex codes These individuals will be set to missing ( or use --allow-no-sex ) Writing list of these individuals to [ plink.nosex ] Reading genotype bitfile from [ test_output.bed ] Detected that binary PED file is v1.00 SNP-major mode Before frequency and genotyping pruning, there are 3 SNPs 10 founders and 0 non-founders found Writing allele frequencies (founders-only) to [ plink.frq ] Analysis finished: Fri Jan 20 09:20:55 2017
with open("plink.frq", "r") as i_file:
print(i_file.read(), end="")
CHR SNP A1 A2 MAF NCHROBS 1 marker_1 A T 0.2222 18 1 marker_2 A T 0.25 20 1 marker_3 A T 0.15 20
The following examples shows how to write a binary file using the PyPlink
module. The INDIVIDUAL-major format means that the binary file is written one sample at a time.
Files in INDIVIDUAL-major format is not readable by PyPlink
. You need to convert it using Plink.
Note that
PyPlink
only writes theBED
file. The user is required to create theFAM
andBIM
files.
# The genotypes for 3 markers and 10 samples (INDIVIDUAL-major)
all_genotypes = [
[ 0, 0, 0],
[ 0, 0, 0],
[ 0, 1, 0],
[ 1, 1, 0],
[ 0, 0, 1],
[ 0, 0, 1],
[-1, 0, 0],
[ 2, 1, 0],
[ 1, 2, 0],
[ 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output_2", "w", bed_format="INDIVIDUAL-major") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output_2.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output_2.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output_2", "--freq", "--out", "plink_2"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
@----------------------------------------------------------@ | PLINK! | v1.07 | 10/Aug/2009 | |----------------------------------------------------------| | (C) 2009 Shaun Purcell, GNU General Public License, v2 | |----------------------------------------------------------| | For documentation, citation & bug-report instructions: | | http://pngu.mgh.harvard.edu/purcell/plink/ | @----------------------------------------------------------@ Skipping web check... [ --noweb ] Writing this text to log file [ plink_2.log ] Analysis started: Fri Jan 20 09:21:10 2017 Options in effect: --noweb --bfile test_output_2 --freq --out plink_2 Reading map (extended format) from [ test_output_2.bim ] 3 markers to be included from [ test_output_2.bim ] Reading pedigree information from [ test_output_2.fam ] 10 individuals read from [ test_output_2.fam ] 0 individuals with nonmissing phenotypes Assuming a disease phenotype (1=unaff, 2=aff, 0=miss) Missing phenotype value is also -9 0 cases, 0 controls and 10 missing 0 males, 0 females, and 10 of unspecified sex Warning, found 10 individuals with ambiguous sex codes These individuals will be set to missing ( or use --allow-no-sex ) Writing list of these individuals to [ plink_2.nosex ] Reading genotype bitfile from [ test_output_2.bed ] Detected that binary PED file is v1.00 individual-major mode Before frequency and genotyping pruning, there are 3 SNPs Converting data to SNP-major format 10 founders and 0 non-founders found Writing allele frequencies (founders-only) to [ plink_2.frq ] Analysis finished: Fri Jan 20 09:21:10 2017
with open("plink_2.frq", "r") as i_file:
print(i_file.read(), end="")
CHR SNP A1 A2 MAF NCHROBS 1 marker_1 A T 0.2222 18 1 marker_2 A T 0.25 20 1 marker_3 A T 0.15 20