#!/usr/bin/env python
# coding: utf-8

# # Live oak introgression manuscript
# ## Notebook 1: RADseq Data Assembly  
# 
# 
# #### D. Eaton, A. Hipp, A. Gonzalez-Rodriguez & J. Cavender-Bares
# ##### contact: deren.eaton@yale.edu  
# 
# -----------------------   

# ### This notebook  
# This is an IPython notebook, a tool for reproducible science. It can be downloaded and executed to run all scripts in the notebook, or viewed as 'read-only' through a browser. The default language in each cell is IPython except where indicated with "%%bash" or "!", indicating bash scripts. 
# 
# -------------------------  

# In[ ]:


get_ipython().run_cell_magic('bash', '', '## create directories for data assembly\nmkdir -p analysis_pyrad/fastq/  \nmkdir -p figures/\n')


# ## Download the sequence data
# 
# Sequence data for this study is archived on the NCBI sequence read archive (SRA). The data were run in two separate Illumina runs, but are combined under a single project id number. 
# 
# Project SRA: SRP055977  
# Project number: PRJNA277574  
# Biosample numbers: SAMN03394519 -- SAMN03394561  
# Runs: SRR1915524 -- SRR1915566

# The first data set contains 30 samples sequenced on an Illumina GAIIx for 100bp single-end reads, and all sample names from this data set end with "\_v". The second data set includes 15 samples sequenced on an Illumina HiSEQ2000 for 100bp single-end reads, and samples from this data set end with "\_re". The quality scores are offset by 64 on the first data and by 33 on the second. Of the 15 samples in the second data set, 7 are re-sequenced individuals from the first, and thus after being filtered through step 2 of the _pyRAD_ analysis are combined with the replicates from the first data set, and clustered together in step 3. This yields a total of 37 samples in the data set. The sample "VI1" from the Hipp data set does not have voucher information and was thus excluded from the analysis. The re-sequenced sample "CRL0001" from the Hipp data set was suspected to be contaminated and was also excluded. The total number of samples is 37. 
# 
# You can download the data set using the script below:

# In[77]:


get_ipython().run_cell_magic('bash', '', '## get the SRA format files\nfor run in $(seq 24 28);\n\n  do\nwget -q -r -nH --cut-dirs=9 \\\nftp://ftp-trace.ncbi.nlm.nih.gov/\\\nsra/sra-instant/reads/ByRun/sra/SRR/\\\nSRR191/SRR19155$run/SRR19155$run".sra";\n\ndone\n')


# In[78]:


get_ipython().run_cell_magic('bash', '', '## convert sra files to fastq using fastq-dump tool\nfastq-dump *.sra\n')


# Rename fastq files to have sample ID names instead of SRR run IDs. The following script will replace the numbers with the correct names.

# In[81]:


## This reads in a table mapping sample names to SRA numbers
## that is hosted on github
import pandas
import urllib2
import glob
import os

## open table from github url
url = "https://raw.githubusercontent.com/"+\
      "dereneaton/virentes/master/SraRunTable.txt"
intable = urllib2.urlopen(url)

## make name xfer dictionary
DF = pandas.read_table(intable, sep="\t")
D = {DF.Run_s[i]:DF.Library_Name_s[i] for i in DF.index}

## change file names and move to fastq dir/
for fname in glob.glob("*.fastq"):
    os.rename(fname, "analysis_pyrad/fastq/"+\
                     D[fname.replace(".fastq",".fq")])


# # _pyRAD_ assembly of _de novo_ RADseq loci
# 
# RADseq data were filtered and clustered using the program _pyRAD_ v.2.13. Below are the scripts for generating the params file and entering the parameters used in the study. Because the data are already de-multiplexed (sorted by barcodes), a barcodes files is not needed, and we begin the analysis from step 2 of _pyRAD_. 

# #### Creating a params input file

# In[ ]:


get_ipython().run_cell_magic('bash', '', '## create template params file\n~/pyrad_v.2.13/pyRAD -n\n')


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## substitute new parameters into file\nsed -i '/## 1. /c\\analysis_pyrad         ## 1. working directory ' params.txt\nsed -i '/## 2. /c\\                       ## 2. data loc          ' params.txt\nsed -i '/## 3. /c\\                       ## 3. barcode loc       ' params.txt\nsed -i '/## 7. /c\\12                     ## 7. N processors      ' params.txt\nsed -i '/## 9. /c\\6                      ## 9. NQual             ' params.txt\nsed -i '/## 10./c\\.85                    ## 10. clust threshold  ' params.txt\nsed -i '/## 13./c\\5                      ## 13. maxSH            ' params.txt\nsed -i '/## 14./c\\virentes_c85d6m4p5     ## 14. output name      ' params.txt\nsed -i '/## 21./c\\1                      ## 21. Filter type      ' params.txt\nsed -i '/## 26./c\\20                     ## 26. MaxSNPs          ' params.txt\nsed -i '/## 29./c\\2,2                    ## 29. trim overhang    ' params.txt\nsed -i '/## 30./c\\u,n,k                  ## 30. output formats   ' params.txt\n")


# In[ ]:


## my modified template params file
get_ipython().system(' cat params.txt')


# ### Filtering the data
# 
# I set the location of the de-multiplexed data to select only the '\_re' data set first, then I run step 2 of _pyRAD_ to filter these data based on quality scores with the default offset score of 33, and also to remove reads with Illumina adapters detected. After this I change the data location to select only the '\_v' data set files, and run step 2 with the offset score changed to 64.   

# In[ ]:


get_ipython().run_cell_magic('bash', '', "sed -i '/## 18./c\\analysis_pyrad/fastq/*_re.fastq  ## 18. select re data files ' params.txt  \nsed -i '/## 20./c\\33              ## 20. using default 33       ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 2\n\nsed -i '/## 18./c\\analysis_pyrad/fastq/*_v.fastq   ## 18. select v data files ' params.txt\nsed -i '/## 20./c\\64              ## 20. change offset to 64       ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 2\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', '## remove the two samples that we are excluding\nrm analysis_pyrad/edits/VI1_re.edit\nrm analysis_pyrad/edits/CRL0001_re.edit\n')


# In[ ]:


## IPython code to concatenate data of re-sequenced individuals

import glob

edits = glob.glob("analysis_pyrad/edits/*.edit")
names = ["_".join(i.split("_")[:-1]) for i in edits]

for name in set(names): 
    if names.count(name) == 2:
        cmd = "cat %s_re.edit %s_v.edit > %s.edit" % (name,name,name)
        stderr = get_ipython().getoutput(' $cmd')
        cmd = "rm %s_re.edit %s_v.edit" % (name,name)
        stderr = get_ipython().getoutput(' $cmd')
    else:
        cmd = "mv %s_re.edit %s.edit" % (name,name)
        stderr = get_ipython().getoutput(' $cmd')
        cmd = "mv %s_v.edit %s.edit" % (name,name)
        stderr = get_ipython().getoutput(' $cmd')


# In[ ]:


## remove _re and _v endings from sequence names
## this makes it cleaner for combining re-sequenced samples
for editfile in glob.glob("analysis_pyrad/edits/*.edit"):
    cmd = "sed -i s/%s/_/g %s" % ("_re_",editfile)
    get_ipython().system(' $cmd')
    cmd = "sed -i s/%s/_/g %s" % ("_v_",editfile)
    get_ipython().system(' $cmd')


# ### Clustering within samples

# In[ ]:


get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 3\n')


# ### Calling consensus sequences

# In[ ]:


get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 45\n')


# #### Exclude small samples
# Because the two samples TXWV2 and CUMM5 have considerably less data than all other samples we chose to exclude them at this step before clustering across samples.

# In[ ]:


get_ipython().run_cell_magic('bash', '', 'rm analysis_pyrad/clust.85/TXWV2.consens.gz\nrm analysis_pyrad/clust.85/CUMM5.consens.gz\n')


# ### Clustering across samples 
# This is the most time-consuming step. 

# In[ ]:


get_ipython().run_cell_magic('bash', '', '~/pyrad_v.2.13/pyRAD -p params.txt -s 6\n')


# ### Outputting assembled data sets
# 
# By changing the parameters on lines 13,14, and by subselecting taxa using line 19 of the params file, I created several data sets that include/exclude different samples, and vary in their proportions of missing data.  

# In[ ]:


get_ipython().run_cell_magic('bash', '', "## The largest, all inclusive, and most missing data, data set (All_min4)\nsed -i '/## 12./c\\4                        ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\virentes_c85d6m4p5       ## 14. output name      ' params.txt\nsed -i '/## 17./c\\                         ## 17. exclude samples  ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## Smaller, all inclusive, less missing data, data set (All_min20)\nsed -i '/## 12./c\\20                        ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\virentes_c85d6m20p5       ## 14. output name      ' params.txt\nsed -i '/## 17./c\\                          ## 17. exclude samples  ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## excluding outgroups for using in structure\nsed -i '/## 12./c\\20                             ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\virentes_c85d6m20p5noutg       ## 14. output name      ' params.txt\nsed -i '/## 17./c\\AR,EN,DO,DU,CH,NI,HE,EN        ## 17. exclude samples  ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## Only MGV clade and outgroups, low missing and high missing data sets (MGV_min4, MGV_min16)\nsed -i '/## 12./c\\4                         ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\MGV_c85d6m4p5             ## 14. output name      ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,BZBB1,CRL0001,CRL0030,CUCA4,CUSV6,CUVN10,HNDA09,MXED8,MXGT4,MXSA3017,TXGR3,TXMD3    ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\16                        ## 12. MinCov           ' params.txt\nsed -i '/## 14./c\\MGV_c85d6m16p5            ## 14. output name      ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## Only OS clade and outgroups, low missing and high missing data sets (OS_min4, OS_min14)\nsed -i '/## 12./c\\4                        ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\OS_c85d6m4p5             ## 14. output name      ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,MXED8,MXGT4,TXGR3,TXMD3,FLAB109,FLBA140,FLCK18,FLCK216,FLMO62,FLSA185,FLSF33,FLSF47,FLSF54,FLWO6,LALC2,SCCU3     ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\13                        ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\OS_c85d6m13p5             ## 14. output name      ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## Only OSMGV clade and outgroups, low missing and high missing data sets (OSMGV_min4, OSMGV_min16)\nsed -i '/## 12./c\\4                           ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\OSMGV_c85d6m4p5             ## 14. output name      ' params.txt\nsed -i '/## 17./c\\BJSB3,BJSL25,BJVL19,MXED8,MXGT4,TXGR3,TXMD3     ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\20                           ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\OSMGV_c85d6m20p5             ## 14. output name      ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")


# In[ ]:


get_ipython().run_cell_magic('bash', '', "## Only FB clade and outgroups, low missing and high missing data sets (FB_min4, FB_min16)\nsed -i '/## 12./c\\4                           ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\FB_c85d6m4p5                ## 14. output name      ' params.txt\nsed -i '/## 17./c\\FLAB109,FLBA140,FLCK18,FLCK216,FLMO62,FLSA185,FLSF33,FLSF47,FLSF54,FLWO6,LALC2,SCCU3,BZBB1,CRL0001,CRL0030,CUCA4,CUSV6,CUVN10,HNDA09,MXSA3017    ## 17. exclude samples' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n\nsed -i '/## 12./c\\12                           ## 12. MinCov           ' params.txt  \nsed -i '/## 14./c\\FB_c85d6m12p5                ## 14. output name      ' params.txt\n~/pyrad_v.2.13/pyRAD -p params.txt -s 7\n")