#!/usr/bin/env python
# coding: utf-8
# In[1]:
from __future__ import print_function, division
# # `PyPlink`
#
# `PyPlink` is a Python module to read and write binary Plink files. Here are small examples for `PyPlink`.
# In[2]:
from pyplink import PyPlink
# ## Table of contents
#
# * [**Reading binary pedfile**](#Reading-binary-pedfile)
# * [Getting the demo data](#Getting-the-demo-data)
# * [Reading the binary file](#Reading-the-binary-file)
# * [Getting dataset information](#Getting-dataset-information)
# * [Iterating over all markers](#Iterating-over-all-markers)
# * [*Additive format*](#iterating_over_all_additive)
# * [*Nucleotide format*](#iterating_over_all_nuc)
# * [Iterating over selected markers](#Iterating-over-selected-markers)
# * [*Additive format*](#iterating_over_selected_additive)
# * [*Nucleotide format*](#iterating_over_selected_nuc)
# * [Extracting a single marker](#Extracting-a-single-marker)
# * [*Additive format*](#extracting_additive)
# * [*Nucleotide format*](#extracting_nuc)
# * [Misc example](#Misc-example)
# * [*Extracting a subset of markers and samples*](#Extracting-a-subset-of-markers-and-samples)
# * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)
#
#
# * [**Writing binary pedfile**](#Writing-binary-pedfile)
# * [SNP-major format](#SNP-major-format)
# * [INDIVIDUAL-major-format](#INDIVIDUAL-major-format)
# ## Reading binary pedfile
#
# ### Getting the demo data
#
# The [`Plink`](http://pngu.mgh.harvard.edu/~purcell/plink/) softwares provides a testing dataset on the [resources page](http://pngu.mgh.harvard.edu/~purcell/plink/res.shtml). It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18.
# In[3]:
import zipfile
try:
from urllib.request import urlretrieve
except ImportError:
from urllib import urlretrieve
# Downloading the demo data from Plink webset
urlretrieve(
"http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip",
"hapmap_r23a.zip",
)
# Extracting the archive content
with zipfile.ZipFile("hapmap_r23a.zip", "r") as z:
z.extractall(".")
# ### Reading the binary file
#
# To read a binary file, `PyPlink` only requires the prefix of the files.
# In[4]:
pedfile = PyPlink("hapmap_r23a")
# ### Getting dataset information
# In[5]:
print("{:,d} samples and {:,d} markers".format(
pedfile.get_nb_samples(),
pedfile.get_nb_markers(),
))
# In[6]:
all_samples = pedfile.get_fam()
all_samples.head()
# In[7]:
all_markers = pedfile.get_bim()
all_markers.head()
# ### Iterating over all markers
#
#
# #### Additive format
#
# Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele).
# In[8]:
for marker_id, genotypes in pedfile:
print(marker_id)
print(genotypes)
break
# In[9]:
for marker_id, genotypes in pedfile.iter_geno():
print(marker_id)
print(genotypes)
break
#
# #### Nucleotide format
#
# Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown).
# In[10]:
for marker_id, genotypes in pedfile.iter_acgt_geno():
print(marker_id)
print(genotypes)
break
# ### Iterating over selected markers
#
#
# #### Additive format
#
# Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele).
# In[11]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
#
# #### Nucleotide format
#
# Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown).
# In[12]:
markers = ["rs7092431", "rs9943770", "rs1587483"]
for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):
print(marker_id)
print(genotypes, end="\n\n")
# ### Extracting a single marker
#
#
# #### Additive format
#
# Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele).
# In[13]:
pedfile.get_geno_marker("rs7619974")
#
# #### Nucleotide format
#
# Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown).
# In[14]:
pedfile.get_acgt_geno_marker("rs7619974")
# ### Misc example
#
# #### Extracting a subset of markers and samples
#
# To get all markers on the Y chromosomes for the males.
# In[15]:
# Getting the Y markers
y_markers = all_markers[all_markers.chrom == 24].index.values
# Getting the males
males = all_samples.gender == 1
# Cycling through the Y markers
for marker_id, genotypes in pedfile.iter_geno_marker(y_markers):
male_genotypes = genotypes[males.values]
print("{:,d} total genotypes".format(len(genotypes)))
print("{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})".format(
len(male_genotypes),
males.sum(),
marker_id,
all_markers.loc[marker_id, "chrom"],
all_markers.loc[marker_id, "pos"],
))
break
# #### Counting the allele frequency of markers
#
# To count the minor allele frequency of a subset of markers (only for founders).
# In[16]:
# Getting the founders
founders = (all_samples.father == "0") & (all_samples.mother == "0")
# Computing the MAF
markers = ["rs7619974", "rs2949048", "rs16941434"]
for marker_id, genotypes in pedfile.iter_geno_marker(markers):
valid_genotypes = genotypes[founders.values & (genotypes != -1)]
maf = valid_genotypes.sum() / (len(valid_genotypes) * 2)
print(marker_id, round(maf, 6), sep="\t")
# ## Writing binary pedfile
#
# ### *SNP-major* format
#
# The following examples shows how to write a binary file using the `PyPlink` module. The *SNP-major* format is the default. It means that the binary file is written one marker at a time.
#
# > Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files.
# In[17]:
# The genotypes for 3 markers and 10 samples
all_genotypes = [
[0, 0, 0, 1, 0, 0, -1, 2, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 1, 2, 0],
[0, 0, 0, 0, 1, 1, 0, 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output", "w") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
# In[18]:
# Checking the content of the newly created binary files
pedfile = PyPlink("test_output")
# In[19]:
pedfile.get_fam()
# In[20]:
pedfile.get_bim()
# In[21]:
for marker, genotypes in pedfile:
print(marker, genotypes)
# The newly created binary files are compatible with Plink.
# In[22]:
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output", "--freq"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
# In[23]:
with open("plink.frq", "r") as i_file:
print(i_file.read(), end="")
# ### *INDIVIDUAL-major* format
#
# The following examples shows how to write a binary file using the `PyPlink` module. The *INDIVIDUAL-major* format means that the binary file is written one sample at a time.
#
# **Files in *INDIVIDUAL-major* format is not readable by `PyPlink`.** You need to convert it using *Plink*.
#
# > Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files.
# In[24]:
# The genotypes for 3 markers and 10 samples (INDIVIDUAL-major)
all_genotypes = [
[ 0, 0, 0],
[ 0, 0, 0],
[ 0, 1, 0],
[ 1, 1, 0],
[ 0, 0, 1],
[ 0, 0, 1],
[-1, 0, 0],
[ 2, 1, 0],
[ 1, 2, 0],
[ 0, 0, 1],
]
# Writing the BED file using PyPlink
with PyPlink("test_output_2", "w", bed_format="INDIVIDUAL-major") as pedfile:
for genotypes in all_genotypes:
pedfile.write_genotypes(genotypes)
# Writing a dummy FAM file
with open("test_output_2.fam", "w") as fam_file:
for i in range(10):
print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9",
sep=" ", file=fam_file)
# Writing a dummy BIM file
with open("test_output_2.bim", "w") as bim_file:
for i in range(3):
print("1", "marker_{}".format(i+1), "0", i+1, "A", "T",
sep="\t", file=bim_file)
# In[25]:
from subprocess import Popen, PIPE
# Computing frequencies
proc = Popen(["plink", "--noweb", "--bfile", "test_output_2", "--freq", "--out", "plink_2"],
stdout=PIPE, stderr=PIPE)
outs, errs = proc.communicate()
print(outs.decode(), end="")
# In[26]:
with open("plink_2.frq", "r") as i_file:
print(i_file.read(), end="")