#!/usr/bin/env python # coding: utf-8 # In[1]: from __future__ import print_function, division # # `PyPlink` # # `PyPlink` is a Python module to read and write binary Plink files. Here are small examples for `PyPlink`. # In[2]: from pyplink import PyPlink # ## Table of contents # # * [**Reading binary pedfile**](#Reading-binary-pedfile) # * [Getting the demo data](#Getting-the-demo-data) # * [Reading the binary file](#Reading-the-binary-file) # * [Getting dataset information](#Getting-dataset-information) # * [Iterating over all markers](#Iterating-over-all-markers) # * [*Additive format*](#iterating_over_all_additive) # * [*Nucleotide format*](#iterating_over_all_nuc) # * [Iterating over selected markers](#Iterating-over-selected-markers) # * [*Additive format*](#iterating_over_selected_additive) # * [*Nucleotide format*](#iterating_over_selected_nuc) # * [Extracting a single marker](#Extracting-a-single-marker) # * [*Additive format*](#extracting_additive) # * [*Nucleotide format*](#extracting_nuc) # * [Misc example](#Misc-example) # * [*Extracting a subset of markers and samples*](#Extracting-a-subset-of-markers-and-samples) # * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers) # # # * [**Writing binary pedfile**](#Writing-binary-pedfile) # * [SNP-major format](#SNP-major-format) # * [INDIVIDUAL-major-format](#INDIVIDUAL-major-format) # ## Reading binary pedfile # # ### Getting the demo data # # The [`Plink`](http://pngu.mgh.harvard.edu/~purcell/plink/) softwares provides a testing dataset on the [resources page](http://pngu.mgh.harvard.edu/~purcell/plink/res.shtml). It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18. # In[3]: import zipfile try: from urllib.request import urlretrieve except ImportError: from urllib import urlretrieve # Downloading the demo data from Plink webset urlretrieve( "http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip", "hapmap_r23a.zip", ) # Extracting the archive content with zipfile.ZipFile("hapmap_r23a.zip", "r") as z: z.extractall(".") # ### Reading the binary file # # To read a binary file, `PyPlink` only requires the prefix of the files. # In[4]: pedfile = PyPlink("hapmap_r23a") # ### Getting dataset information # In[5]: print("{:,d} samples and {:,d} markers".format( pedfile.get_nb_samples(), pedfile.get_nb_markers(), )) # In[6]: all_samples = pedfile.get_fam() all_samples.head() # In[7]: all_markers = pedfile.get_bim() all_markers.head() # ### Iterating over all markers # # # #### Additive format # # Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele). # In[8]: for marker_id, genotypes in pedfile: print(marker_id) print(genotypes) break # In[9]: for marker_id, genotypes in pedfile.iter_geno(): print(marker_id) print(genotypes) break # # #### Nucleotide format # # Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown). # In[10]: for marker_id, genotypes in pedfile.iter_acgt_geno(): print(marker_id) print(genotypes) break # ### Iterating over selected markers # # # #### Additive format # # Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele). # In[11]: markers = ["rs7092431", "rs9943770", "rs1587483"] for marker_id, genotypes in pedfile.iter_geno_marker(markers): print(marker_id) print(genotypes, end="\n\n") # # #### Nucleotide format # # Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown). # In[12]: markers = ["rs7092431", "rs9943770", "rs1587483"] for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers): print(marker_id) print(genotypes, end="\n\n") # ### Extracting a single marker # # # #### Additive format # # Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele). # In[13]: pedfile.get_geno_marker("rs7619974") # # #### Nucleotide format # # Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown). # In[14]: pedfile.get_acgt_geno_marker("rs7619974") # ### Misc example # # #### Extracting a subset of markers and samples # # To get all markers on the Y chromosomes for the males. # In[15]: # Getting the Y markers y_markers = all_markers[all_markers.chrom == 24].index.values # Getting the males males = all_samples.gender == 1 # Cycling through the Y markers for marker_id, genotypes in pedfile.iter_geno_marker(y_markers): male_genotypes = genotypes[males.values] print("{:,d} total genotypes".format(len(genotypes))) print("{:,d} genotypes for {:,d} males ({} on chr{} and position {:,d})".format( len(male_genotypes), males.sum(), marker_id, all_markers.loc[marker_id, "chrom"], all_markers.loc[marker_id, "pos"], )) break # #### Counting the allele frequency of markers # # To count the minor allele frequency of a subset of markers (only for founders). # In[16]: # Getting the founders founders = (all_samples.father == "0") & (all_samples.mother == "0") # Computing the MAF markers = ["rs7619974", "rs2949048", "rs16941434"] for marker_id, genotypes in pedfile.iter_geno_marker(markers): valid_genotypes = genotypes[founders.values & (genotypes != -1)] maf = valid_genotypes.sum() / (len(valid_genotypes) * 2) print(marker_id, round(maf, 6), sep="\t") # ## Writing binary pedfile # # ### *SNP-major* format # # The following examples shows how to write a binary file using the `PyPlink` module. The *SNP-major* format is the default. It means that the binary file is written one marker at a time. # # > Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files. # In[17]: # The genotypes for 3 markers and 10 samples all_genotypes = [ [0, 0, 0, 1, 0, 0, -1, 2, 1, 0], [0, 0, 1, 1, 0, 0, 0, 1, 2, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 1], ] # Writing the BED file using PyPlink with PyPlink("test_output", "w") as pedfile: for genotypes in all_genotypes: pedfile.write_genotypes(genotypes) # Writing a dummy FAM file with open("test_output.fam", "w") as fam_file: for i in range(10): print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9", sep=" ", file=fam_file) # Writing a dummy BIM file with open("test_output.bim", "w") as bim_file: for i in range(3): print("1", "marker_{}".format(i+1), "0", i+1, "A", "T", sep="\t", file=bim_file) # In[18]: # Checking the content of the newly created binary files pedfile = PyPlink("test_output") # In[19]: pedfile.get_fam() # In[20]: pedfile.get_bim() # In[21]: for marker, genotypes in pedfile: print(marker, genotypes) # The newly created binary files are compatible with Plink. # In[22]: from subprocess import Popen, PIPE # Computing frequencies proc = Popen(["plink", "--noweb", "--bfile", "test_output", "--freq"], stdout=PIPE, stderr=PIPE) outs, errs = proc.communicate() print(outs.decode(), end="") # In[23]: with open("plink.frq", "r") as i_file: print(i_file.read(), end="") # ### *INDIVIDUAL-major* format # # The following examples shows how to write a binary file using the `PyPlink` module. The *INDIVIDUAL-major* format means that the binary file is written one sample at a time. # # **Files in *INDIVIDUAL-major* format is not readable by `PyPlink`.** You need to convert it using *Plink*. # # > Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files. # In[24]: # The genotypes for 3 markers and 10 samples (INDIVIDUAL-major) all_genotypes = [ [ 0, 0, 0], [ 0, 0, 0], [ 0, 1, 0], [ 1, 1, 0], [ 0, 0, 1], [ 0, 0, 1], [-1, 0, 0], [ 2, 1, 0], [ 1, 2, 0], [ 0, 0, 1], ] # Writing the BED file using PyPlink with PyPlink("test_output_2", "w", bed_format="INDIVIDUAL-major") as pedfile: for genotypes in all_genotypes: pedfile.write_genotypes(genotypes) # Writing a dummy FAM file with open("test_output_2.fam", "w") as fam_file: for i in range(10): print("family_{}".format(i+1), "sample_{}".format(i+1), "0", "0", "0", "-9", sep=" ", file=fam_file) # Writing a dummy BIM file with open("test_output_2.bim", "w") as bim_file: for i in range(3): print("1", "marker_{}".format(i+1), "0", i+1, "A", "T", sep="\t", file=bim_file) # In[25]: from subprocess import Popen, PIPE # Computing frequencies proc = Popen(["plink", "--noweb", "--bfile", "test_output_2", "--freq", "--out", "plink_2"], stdout=PIPE, stderr=PIPE) outs, errs = proc.communicate() print(outs.decode(), end="") # In[26]: with open("plink_2.frq", "r") as i_file: print(i_file.read(), end="")