#!/usr/bin/env python # coding: utf-8 # # Live oak introgression manuscript # ## Notebook 1: RADseq Data Assembly # # # #### D. Eaton, A. Hipp, A. Gonzalez-Rodriguez & J. Cavender-Bares # ##### contact: deren.eaton@yale.edu # # ----------------------- # ### This notebook # This is an IPython notebook, a tool for reproducible science. It can be downloaded and executed to run all scripts in the notebook, or viewed as 'read-only' through a browser. The default language in each cell is IPython except where indicated with "%%bash" or "!", indicating bash scripts. # # ------------------------- # In[ ]: get_ipython().run_cell_magic('bash', '', '## create directories for data assembly\nmkdir -p analysis_pyrad/fastq/ \nmkdir -p figures/\n') # ## Download the sequence data # # Sequence data for this study is archived on the NCBI sequence read archive (SRA). The data were run in two separate Illumina runs, but are combined under a single project id number. # # Project SRA: SRP055977 # Project number: PRJNA277574 # Biosample numbers: SAMN03394519 -- SAMN03394561 # Runs: SRR1915524 -- SRR1915566 # The first data set contains 30 samples sequenced on an Illumina GAIIx for 100bp single-end reads, and all sample names from this data set end with "\_v". The second data set includes 15 samples sequenced on an Illumina HiSEQ2000 for 100bp single-end reads, and samples from this data set end with "\_re". The quality scores are offset by 64 on the first data and by 33 on the second. Of the 15 samples in the second data set, 7 are re-sequenced individuals from the first, and thus after being filtered through step 2 of the _pyRAD_ analysis are combined with the replicates from the first data set, and clustered together in step 3. This yields a total of 37 samples in the data set. The sample "VI1" from the Hipp data set does not have voucher information and was thus excluded from the analysis. The re-sequenced sample "CRL0001" from the Hipp data set was suspected to be contaminated and was also excluded. The total number of samples is 37. # # You can download the data set using the script below: # In[77]: get_ipython().run_cell_magic('bash', '', '## get the SRA format files\nfor run in $(seq 24 28);\n\n do\nwget -q -r -nH --cut-dirs=9 \\\nftp://ftp-trace.ncbi.nlm.nih.gov/\\\nsra/sra-instant/reads/ByRun/sra/SRR/\\\nSRR191/SRR19155$run/SRR19155$run".sra";\n\ndone\n') # In[78]: get_ipython().run_cell_magic('bash', '', '## convert sra files to fastq using fastq-dump tool\nfastq-dump *.sra\n') # Rename fastq files to have sample ID names instead of SRR run IDs. The following script will replace the numbers with the correct names. # In[81]: ## This reads in a table mapping sample names to SRA numbers ## that is hosted on github import pandas import urllib2 import glob import os ## open table from github url url = "https://raw.githubusercontent.com/"+\ "dereneaton/virentes/master/SraRunTable.txt" intable = urllib2.urlopen(url) ## make name xfer dictionary DF = pandas.read_table(intable, sep="\t") D = {DF.Run_s[i]:DF.Library_Name_s[i] for i in DF.index} ## change file names and move to fastq dir/ for fname in glob.glob("*.fastq"): os.rename(fname, "analysis_pyrad/fastq/"+\ D[fname.replace(".fastq",".fq")]) # # _pyRAD_ assembly of _de novo_ RADseq loci # # RADseq data were filtered and clustered using the program _pyRAD_ v.2.13. Below are the scripts for generating the params file and entering the parameters used in the study. Because the data are already de-multiplexed (sorted by barcodes), a barcodes files is not needed, and we begin the analysis from step 2 of _pyRAD_. # #### Creating a params input file # In[ ]: get_ipython().run_cell_magic('bash', '', '## create template params file\n~/pyrad_v.2.13/pyRAD -n\n') # In[ ]: get_ipython().run_cell_magic('bash', '', "## substitute new parameters into file\nsed -i '/## 1. /c\\analysis_pyrad ## 1. working directory ' params.txt\nsed -i '/## 2. /c\\ ## 2. data loc ' params.txt\nsed -i '/## 3. /c\\ ## 3. barcode loc ' params.txt\nsed -i '/## 7. /c\\12 ## 7. N processors ' params.txt\nsed -i '/## 9. /c\\6 ## 9. NQual ' params.txt\nsed -i '/## 10./c\\.85 ## 10. clust threshold ' params.txt\nsed -i '/## 13./c\\5 ## 13. maxSH ' params.txt\nsed -i '/## 14./c\\virentes_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 21./c\\1 ## 21. Filter type ' params.txt\nsed -i '/## 26./c\\20 ## 26. MaxSNPs ' params.txt\nsed -i '/## 29./c\\2,2 ## 29. trim overhang ' params.txt\nsed -i '/## 30./c\\u,n,k ## 30. output formats ' params.txt\n") # In[ ]: ## my modified template params file get_ipython().system(' cat params.txt') # ### Filtering the data # # I set the location of the de-multiplexed data to select only the '\_re' data set first, then I run step 2 of _pyRAD_ to filter these data based on quality scores with the default offset score of 33, and also to remove reads with Illumina adapters detected. After this I change the data location to select only the '\_v' data set files, and run step 2 with the offset score changed to 64. # In[ ]: get_ipython().run_cell_magic('bash', '', "sed -i '/## 18./c\\analysis_pyrad/fastq/*_re.fastq ## 18. select re data files ' params.txt \nsed -i '/## 20./c\\33 ## 20. using default 33 ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 2\n\nsed -i '/## 18./c\\analysis_pyrad/fastq/*_v.fastq ## 18. select v data files ' params.txt\nsed -i '/## 20./c\\64 ## 20. change offset to 64 ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 2\n") # In[ ]: get_ipython().run_cell_magic('bash', '', '## remove the two samples that we are excluding\nrm analysis_pyrad/edits/VI1_re.edit\nrm analysis_pyrad/edits/CRL0001_re.edit\n') # In[ ]: ## IPython code to concatenate data of re-sequenced individuals import glob edits = glob.glob("analysis_pyrad/edits/*.edit") names = ["_".join(i.split("_")[:-1]) for i in edits] for name in set(names): if names.count(name) == 2: cmd = "cat %s_re.edit %s_v.edit > %s.edit" % (name,name,name) stderr = get_ipython().getoutput(' $cmd') cmd = "rm %s_re.edit %s_v.edit" % (name,name) stderr = get_ipython().getoutput(' $cmd') else: cmd = "mv %s_re.edit %s.edit" % (name,name) stderr = get_ipython().getoutput(' $cmd') cmd = "mv %s_v.edit %s.edit" % (name,name) stderr = get_ipython().getoutput(' $cmd') # In[ ]: ## remove _re and _v endings from sequence names ## this makes it cleaner for combining re-sequenced samples for editfile in glob.glob("analysis_pyrad/edits/*.edit"): cmd = "sed -i s/%s/_/g %s" % ("_re_",editfile) get_ipython().system(' $cmd') cmd = "sed -i s/%s/_/g %s" % ("_v_",editfile) get_ipython().system(' $cmd') # ### Clustering within samples # In[ ]: get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 3\n') # ### Calling consensus sequences # In[ ]: get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 45\n') # #### Exclude small samples # Because the two samples TXWV2 and CUMM5 have considerably less data than all other samples we chose to exclude them at this step before clustering across samples. # In[ ]: get_ipython().run_cell_magic('bash', '', 'rm analysis_pyrad/clust.85/TXWV2.consens.gz\nrm analysis_pyrad/clust.85/CUMM5.consens.gz\n') # ### Clustering across samples # This is the most time-consuming step. # In[ ]: get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 6\n') # ### Outputting assembled data sets # # By changing the parameters on lines 13,14, and by subselecting taxa using line 19 of the params file, I created several data sets that include/exclude different samples, and vary in their proportions of missing data. # In[ ]: get_ipython().run_cell_magic('bash', '', "## The largest, all inclusive, and most missing data, data set (All_min4)\nsed -i '/## 12./c\\4 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\virentes_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\ ## 17. exclude samples ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## Smaller, all inclusive, less missing data, data set (All_min20)\nsed -i '/## 12./c\\20 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\virentes_c85d6m20p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\ ## 17. exclude samples ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## excluding outgroups for using in structure\nsed -i '/## 12./c\\20 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\virentes_c85d6m20p5noutg ## 14. output name ' params.txt\nsed -i '/## 17./c\\AR,EN,DO,DU,CH,NI,HE,EN ## 17. exclude samples ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## Only MGV clade and outgroups, low missing and high missing data sets (MGV_min4, MGV_min16)\nsed -i '/## 12./c\\4 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\MGV_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,BZBB1,CRL0001,CRL0030,CUCA4,CUSV6,CUVN10,HNDA09,MXED8,MXGT4,MXSA3017,TXGR3,TXMD3 ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\16 ## 12. MinCov ' params.txt\nsed -i '/## 14./c\\MGV_c85d6m16p5 ## 14. output name ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## Only OS clade and outgroups, low missing and high missing data sets (OS_min4, OS_min14)\nsed -i '/## 12./c\\4 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\OS_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,MXED8,MXGT4,TXGR3,TXMD3,FLAB109,FLBA140,FLCK18,FLCK216,FLMO62,FLSA185,FLSF33,FLSF47,FLSF54,FLWO6,LALC2,SCCU3 ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\13 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\OS_c85d6m13p5 ## 14. output name ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## Only OSMGV clade and outgroups, low missing and high missing data sets (OSMGV_min4, OSMGV_min16)\nsed -i '/## 12./c\\4 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\OSMGV_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,MXED8,MXGT4,TXGR3,TXMD3 ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\20 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\OSMGV_c85d6m20p5 ## 14. output name ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n") # In[ ]: get_ipython().run_cell_magic('bash', '', "## Only FB clade and outgroups, low missing and high missing data sets (FB_min4, FB_min16)\nsed -i '/## 12./c\\4 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\FB_c85d6m4p5 ## 14. output name ' params.txt\nsed -i '/## 17./c\\FLAB109,FLBA140,FLCK18,FLCK216,FLMO62,FLSA185,FLSF33,FLSF47,FLSF54,FLWO6,LALC2,SCCU3,BZBB1,CRL0001,CRL0030,CUCA4,CUSV6,CUVN10,HNDA09,MXSA3017 ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\12 ## 12. MinCov ' params.txt \nsed -i '/## 14./c\\FB_c85d6m12p5 ## 14. output name ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")