This is an IPython Notebook, best viewed through a web browser at the following permanent link: nbviewer. Each cell in the notebook will process Python code by default, but can also execute other languages when directed. Here we use primarily bash scripts (cells headed with `%%bash` or lines starting with `!`), but also a few R scripts (cells headed with `%%R`). By executing all cells in this notebook our entire analysis should be easily reproducible.
Executing the code will require the following software (we use the version listed):
%%bash
# Create one output directory per analysis stage (mkdir -p is idempotent).
for stage in pyrad dtests raxml mrbayes; do
    mkdir -p "analysis_${stage}/"
done
First we create a template params file for pyRAD (v.2.13) using the `-n` argument, then we substitute in the relevant parameters for our analysis using the bash command `sed`.
%%bash
# Generate a fresh pyRAD params template (-n writes ./params.txt), then edit
# individual parameters in place. Each `sed -i '/## N. /c\TEXT'` replaces the
# entire line whose comment tag matches "## N." with TEXT, so the replacement
# must keep pyRAD's "value  ## N. description" layout intact.
## create template file
~/Dropbox/pyrad-git/pyRAD -n
mv params.txt analysis_pyrad/params.txt
## substitute in parameters
# NOTE(review): trailing spaces in the replacement text appear to be layout
# padding — presumably pyRAD only parses the value before "##"; confirm.
sed -i '/## 1. /c\analysis_pyrad/_c90d5 ## 1. working directory ' analysis_pyrad/params.txt
sed -i '/## 2. /c\/home/deren/Documents/RADSEQ_DATA/Carex/*.fastq ## 2. data file ' analysis_pyrad/params.txt
sed -i '/## 3. /c\/home/deren/Documents/RADSEQ_DATA/barcodes/Carex1.barcodes ## 3. barcodes file ' analysis_pyrad/params.txt
sed -i '/## 7. /c\10 ## 7. use multiple cores ' analysis_pyrad/params.txt
sed -i '/## 8. /c\5 ## 8. set mindepth ' analysis_pyrad/params.txt
sed -i '/## 9. /c\3 ## 9. max lowQual ' analysis_pyrad/params.txt
sed -i '/## 10. /c\.90 ## 10. decreased clust threshold ' analysis_pyrad/params.txt
sed -i '/## 11. /c\gbs ## 11. changed data type ' analysis_pyrad/params.txt
sed -i '/## 14. /c\c90d5m4p3 ## 14. output name ' analysis_pyrad/params.txt
sed -i '/## 21./c\2 ## 21. Filter type ' analysis_pyrad/params.txt
sed -i '/## 23./c\3 ## 23. maxH consens ' analysis_pyrad/params.txt
sed -i '/## 24./c\3 ## 24. maxN consens ' analysis_pyrad/params.txt
sed -i '/## 29./c\2,2 ## 29. trim overhang edges ' analysis_pyrad/params.txt
sed -i '/## 30./c\n ## 30. add output formats ' analysis_pyrad/params.txt
sed -i '/## 34./c\99999 ## 34. no max stack size ' analysis_pyrad/params.txt
Calling step 1 of pyRAD de-multiplexes the data set, here allowing for one base difference in the barcodes. This creates a FASTQ file for only the samples that are indicated in our barcodes file. For this study this includes 9 sampled individuals.
%%bash
## create sub working directory
mkdir -p analysis_pyrad/_c90d5/
## demultiplex data files
# FIX: the original `2&> /dev/null` tokenizes as `... 2 &> /dev/null`,
# passing a stray literal "2" argument to pyRAD. Use `> /dev/null 2>&1`
# to silence both stdout and stderr.
~/Dropbox/pyrad-git/pyRAD -p analysis_pyrad/params.txt -s 1 > /dev/null 2>&1
%%bash
## assemble data sets (pyRAD steps 2-7: filter, cluster, call consensus, output)
# FIX: `2&> /dev/null` passed a stray literal "2" argument to pyRAD;
# replaced with a proper redirection of stdout and stderr.
~/Dropbox/pyrad-git/pyRAD -p analysis_pyrad/params.txt -s 234567 > /dev/null 2>&1
## print summary of results
cat analysis_pyrad/_c90d5/stats/c90d5m4p3.stats
%%bash
## create sub working directory
mkdir -p analysis_pyrad/_c85d5/
## set parameters in params file
# Re-use the same params.txt, lowering the clustering threshold to .85 and
# pointing parameter 18 at the already de-multiplexed reads from the _c90d5
# run so step 1 (de-multiplexing) does not need to be repeated.
sed -i '/## 1. /c\analysis_pyrad/_c85d5 ## 1. working directory ' analysis_pyrad/params.txt
sed -i '/## 10./c\.85 ## 10. decreased clust threshold ' analysis_pyrad/params.txt
sed -i '/## 14./c\c85d5m4p3 ## 14. output name ' analysis_pyrad/params.txt
sed -i '/## 18./c\analysis_pyrad/_c90d5/fastq/*.gz ## 18. use de-multiplexed data ' analysis_pyrad/params.txt
%%bash
# Assemble the .85-clustering data set (steps 2-7) and print its summary.
# FIX: `2&> /dev/null` passed a stray literal "2" argument to pyRAD;
# replaced with a proper redirection of stdout and stderr.
~/Dropbox/pyrad-git/pyRAD -p analysis_pyrad/params.txt -s 234567 > /dev/null 2>&1
cat analysis_pyrad/_c85d5/stats/c85d5m4p3.stats
%%bash
## create sub working directory
mkdir -p analysis_pyrad/_c85d2/
## copy re-usable files from _c85d5/
# Clustering at .85 is identical regardless of mindepth, so the clusters from
# the previous run are copied rather than recomputed.
cp -r analysis_pyrad/_c85d5/stats/ analysis_pyrad/_c85d2/
cp -r analysis_pyrad/_c85d5/clust.85/ analysis_pyrad/_c85d2/
## remove consens files b/c they will be remade at lower mindepth
rm analysis_pyrad/_c85d2/clust.85/*consens*
## set new parameters in params file
# mindepth 2 with lowdepth base calls (param 31) enabled; only steps 5-7
# need to be re-run for this variant.
sed -i '/## 1. /c\analysis_pyrad/_c85d2 ## 1. working directory ' analysis_pyrad/params.txt
sed -i '/## 8. /c\2 ## 8. set mindepth ' analysis_pyrad/params.txt
sed -i '/## 14./c\c85d2m4p3 ## 14. output name ' analysis_pyrad/params.txt
sed -i '/## 31./c\1 ## 31. allow lowdepth base calls ' analysis_pyrad/params.txt
%%bash
# Re-run only consensus calling and output steps (5-7) at mindepth 2.
# FIX: `2&> /dev/null` passed a stray literal "2" argument to pyRAD;
# replaced with a proper redirection of stdout and stderr.
~/Dropbox/pyrad-git/pyRAD -p analysis_pyrad/params.txt -s 567 > /dev/null 2>&1
cat analysis_pyrad/_c85d2/stats/c85d2m4p3.stats
%%bash
# RAxML rapid bootstrap + best ML tree search (-f a), GTRGAMMA model,
# 100 bootstrap replicates, outgroup 'stramin', 5 threads.
# FIX: `2&> /dev/null` tokenizes as `... 2 &> /dev/null`, handing RAxML a
# stray "2" argument; replaced with `> /dev/null 2>&1`.
raxmlHPC-PTHREADS-AVX -f a \
-p 12345 \
-s analysis_pyrad/_c90d5/outfiles/c90d5m4p3.phy \
-x 12345 -N 100 \
-m GTRGAMMA \
-n c90d5m4p3 \
-o 'stramin' \
-w /home/deren/Documents/Marcial/Carex/analysis_raxml/ \
-T 5 > /dev/null 2>&1
%%bash
# Same RAxML settings as the c90d5 run, applied to the c85d5 alignment.
# FIX: `2&> /dev/null` handed RAxML a stray "2" argument; replaced with
# `> /dev/null 2>&1`.
raxmlHPC-PTHREADS-AVX -f a \
-p 12345 \
-s analysis_pyrad/_c85d5/outfiles/c85d5m4p3.phy \
-x 12345 -N 100 \
-m GTRGAMMA \
-n c85d5m4p3 \
-o 'stramin' \
-w /home/deren/Documents/Marcial/Carex/analysis_raxml/ \
-T 5 > /dev/null 2>&1
%%bash
# Same RAxML settings as the c90d5 run, applied to the c85d2 alignment.
# FIX: `2&> /dev/null` handed RAxML a stray "2" argument; replaced with
# `> /dev/null 2>&1`.
raxmlHPC-PTHREADS-AVX -f a \
-p 12345 \
-s analysis_pyrad/_c85d2/outfiles/c85d2m4p3.phy \
-x 12345 -N 100 \
-m GTRGAMMA \
-n c85d2m4p3 \
-o 'stramin' \
-w /home/deren/Documents/Marcial/Carex/analysis_raxml/ \
-T 5 > /dev/null 2>&1
We ran two independent MrBayes (v.3.2.2) analyses for each data set, each running for 10M generations, sampling trees every 1,000 generations and using four Metropolis-coupled MCMC chains per run.
## create a nexus string with parameters for mrbayes run
# GTR+I+G model (nst=6 rates=invgamma); 10M generations, sampling every 1000;
# four chains; autoclose so MrBayes exits without prompting.
nexinfo = """
\nbegin mrbayes;
lset nst=6 rates=invgamma;
set autoclose=yes;
mcmc ngen=10000000 printfreq=1000 samplefreq=1000 nchains=4 savebrlens=yes;
sump;
sumt;
end;
"""
## append mrbayes nexus info to the end of nexus sequence files
# NOTE(review): IPython expands the Python variable $nexinfo inside the
# ! shell lines before the shell sees the quotes — confirm this multiline
# expansion behaves on the IPython version in use.
! echo '$nexinfo' >> analysis_pyrad/_c90d5/outfiles/c90d5m4p3.nex
! echo '$nexinfo' >> analysis_pyrad/_c85d5/outfiles/c85d5m4p3.nex
! echo '$nexinfo' >> analysis_pyrad/_c85d2/outfiles/c85d2m4p3.nex
## call mrbayes on each nexus file data set
## Do two runs for each data set
# Assigning the ! command to `quiet` captures mb's stdout as a Python list
# instead of printing it in the notebook.
quiet = ! mb analysis_pyrad/_c90d5/outfiles/c90d5m4p3.nex
quiet = ! mb analysis_pyrad/_c85d5/outfiles/c85d5m4p3.nex
quiet = ! mb analysis_pyrad/_c85d2/outfiles/c85d2m4p3.nex
#quiet = ! mpirun -np 6 analysis_pyrad/_c85d2/outfiles/c85d2m4p3.nex
## clean up, move files to mrbayes directory
# MrBayes writes its output (.run*.t, .run*.p, .con.tre, ...) next to the
# input nexus file; collect everything into analysis_mrbayes/.
! mv /home/deren/Documents/Marcial/Carex/analysis_pyrad/_c90d5/outfiles/c90d5m4p3.nex.* analysis_mrbayes/
! mv /home/deren/Documents/Marcial/Carex/analysis_pyrad/_c85d5/outfiles/c85d5m4p3.nex.* analysis_mrbayes/
! mv /home/deren/Documents/Marcial/Carex/analysis_pyrad/_c85d2/outfiles/c85d2m4p3.nex.* analysis_mrbayes/
%%bash
## create Dtest template file
# -D prints a template D-statistic (ABBA-BABA) input file to stdout;
# capture it as the base input for the 4-taxon tests below.
~/Dropbox/pyrad-git/pyRAD -D > analysis_dtests/input.D4
%%bash
## substitute in new parameters
# Edit the D4 template header in place: 200 bootstrap replicates, the
# c85d2 .loci file as input, output prefix analysis_dtests/test1, 4 cores.
sed -i '/## N bootstrap/c\200 ## N bootstrap ' analysis_dtests/input.D4
sed -i '/## loc/c\analysis_pyrad/_c85d2/outfiles/c85d2m4p3.loci ## loc/path ' analysis_dtests/input.D4
sed -i '/## output file /c\analysis_dtests/test1 ## output file ' analysis_dtests/input.D4
sed -i '/## N cores/c\4 ## N cores ' analysis_dtests/input.D4
Append the test combinations to the input file by iterating over sample names with the following Python script:
## Sample groups used as taxon positions in the D-statistic tests below.
CS  = ['2816', '2830D', '2893G1']  # C. scoparia individuals
W   = ['waponah']
CSU = ['suberec']
CL  = ['longii']
CV  = ['vexans']
CA  = ['albolut']
S   = ['stramin']                  # outgroup used for rooting
# makeD4: append 4-taxon D-statistic test rows to a pyRAD -d input file.
#   P1, P2, P3, O : lists of sample names for the four tree positions
#   poly          : if truthy, write one pooled test per unique (p1,p2,p3)
#                   with all outgroups combined (bracket "[...]" syntax);
#                   otherwise one row per unique (p1,p2,p3,o) combination
#   infile        : D4 input file; only its first 8 lines (the parameter
#                   header) are kept — previously appended tests are discarded
# NOTE(review): indentation was lost in this notebook dump, and the echo
# lines append to the hard-coded path 'analysis_dtests/input.D4' rather
# than $infile — confirm that is intended.
def makeD4(P1,P2,P3,O,poly,infile):
# truncate infile back to its 8-line parameter header
! head -n 8 $infile > temp
! mv temp $infile
tests = []
if not poly:
for o in O:
for p3 in P3:
for p2 in P2:
for p1 in P1:
if p1 != p2:
# sets make the duplicate check order-insensitive
if set((p1,p2,p3,o)) not in tests:
tests.append(set((p1,p2,p3,o)))
test = '%s\t%s\t%s\t%s\t##' % (p1,p2,p3,o)
## appends tests to input file
! echo '$test' >> analysis_dtests/input.D4
else:
for p3 in P3:
for p2 in P2:
for p1 in P1:
if p1 != p2:
if set((p1,p2,p3)) not in tests:
tests.append(set((p1,p2,p3)))
# pooled outgroup: all O samples joined into one bracketed taxon
test = '[%s]\t[%s]\t[%s]\t[%s]\t##' % (p1,p2,p3,",".join(O))
## appends tests to input file
! echo '$test' >> analysis_dtests/input.D4
# make and print input file
# Test 1: P1 = CS, P2 = CS+W, P3 = the four related species, O = stramin.
makeD4(CS,CS+W,CSU+CL+CV+CA,S,0,"analysis_dtests/input.D4")
! cat analysis_dtests/input.D4
# Run test 1; output is captured in `quiet` to keep the notebook clean.
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
## change output file name
! sed -i '/## output file /c\analysis_dtests/test1.H ## output file ' analysis_dtests/input.D4
## make new input file
# Test 1.H: same taxa but poly=1, pooling the outgroup samples per test.
makeD4(CS,CS,CSU+CL+CV+CA,S,1,"analysis_dtests/input.D4")
## run the test
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
! cat analysis_dtests/test1.D4.txt
## change output file name
! sed -i '/## output file /c\analysis_dtests/test2 ## output file ' analysis_dtests/input.D4
## make new input file
# Test 2: P3 = waponah, with stramin and the other species as outgroups.
makeD4(CS,CS,W,S+CL+CV+CA,0,"analysis_dtests/input.D4")
## run the test
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
! cat analysis_dtests/test2.D4.txt
This uses the SNP frequency weighted across multiple outgroups as the outgroup allele.
## change output file name
! sed -i '/## output file /c\analysis_dtests/test2.H ## output file ' analysis_dtests/input.D4
## make new input file
# Test 2.H: pooled-outgroup (poly=1) variant of test 2.
makeD4(CS,CS,W,S+CL+CV+CA,1,"analysis_dtests/input.D4")
## run the test
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
## change output file name
! sed -i '/## output file /c\analysis_dtests/test3 ## output file ' analysis_dtests/input.D4
## make new input file
# Test 3: P1 = longii, P2 = vexans, P3 = albolut/suberec, O = stramin + CS.
makeD4(CL,CV,CA+CSU,S+CS,0,"analysis_dtests/input.D4")
## run the test
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
! cat analysis_dtests/test3.D4.txt
## change output file name
! sed -i '/## output file /c\analysis_dtests/test3.H ## output file ' analysis_dtests/input.D4
## make new input file
# Test 3.H: pooled-outgroup (poly=1) variant of test 3.
makeD4(CL,CV,CA+CSU,S+CS,1,"analysis_dtests/input.D4")
## run the test
quiet = ! ~/Dropbox/pyrad-git/pyRAD -d analysis_dtests/input.D4
## If executing this code in the IPython notebook
## you will need to have the python library python-Rpy2 installed
## otherwise you can execute the code below in a separate R shell
# NOTE(review): `rmagic` is the legacy extension name; newer IPython/rpy2
# versions use `%load_ext rpy2.ipython` instead.
%load_ext rmagic
%%R
# ape: tree I/O and plotting; RADami: partitioned analysis of RAD loci.
library(ape)
library(RADami)
%%R
# Plot the best ML tree for the c90d5 assembly, ladderized, with the
# bootstrap support values from the bipartitions file at the nodes.
best_tree <- read.tree("analysis_raxml/RAxML_bestTree.c90d5m4p3")
boot_tree <- read.tree("analysis_raxml/RAxML_bipartitions.c90d5m4p3")
ladderized <- ladderize(best_tree)
plot(ladderized)
nodelabels(boot_tree$node.label, bg = 'grey')
%%R
# Plot the best ML tree for the c85d5 assembly; node 11 is rotated before
# ladderizing purely for display, with bootstrap values at the nodes.
best_tree <- read.tree("analysis_raxml/RAxML_bestTree.c85d5m4p3")
boot_tree <- read.tree("analysis_raxml/RAxML_bipartitions.c85d5m4p3")
best_tree <- ladderize(rotate(best_tree, 11))
plot(best_tree)
nodelabels(boot_tree$node.label, bg = 'grey')
%%R
# Plot the best ML tree for the c85d2 assembly, ladderized, with the
# bootstrap support values from the bipartitions file at the nodes.
best_tree <- read.tree("analysis_raxml/RAxML_bestTree.c85d2m4p3")
boot_tree <- read.tree("analysis_raxml/RAxML_bipartitions.c85d2m4p3")
ladderized <- ladderize(best_tree)
plot(ladderized)
nodelabels(boot_tree$node.label, bg = 'grey')
Examine the number of loci supporting each tree using the R package 'RADami' (v.1.0-3)
%%R
# Partitioned RAD analysis of the Carex scoparia trees
# 2014-06-06
# First, prune to the ingroup taxa of relevance to your question.
# Then, generate a treeset using NNI:
scop.tre.c85d2 <- read.tree('../R1.pyRAD/analysis_raxml/RAxML_bestTree.c85d2m4p3')
scop.tre.c85d5 <- read.tree('../R1.pyRAD/analysis_raxml/RAxML_bestTree.c85d5m4p3')
scop.tre.c90d5 <- read.tree('../R1.pyRAD/analysis_raxml/RAxML_bestTree.c90d5m4p3')
# Prefix tip labels with '>' to match the sample names in the pyRAD .loci
# files read by read.pyRAD() in the next cell.
scop.tre.c85d2$tip.label <- paste('>', scop.tre.c85d2$tip.label, sep = '')
scop.tre.c85d5$tip.label <- paste('>', scop.tre.c85d5$tip.label, sep = '')
scop.tre.c90d5$tip.label <- paste('>', scop.tre.c90d5$tip.label, sep = '')
# Drop the outgroup so the treesets cover only the ingroup question.
scop.tre.c85d2 <- drop.tip(scop.tre.c85d2, '>stramin')
scop.tre.c85d5 <- drop.tip(scop.tre.c85d5, '>stramin')
scop.tre.c90d5 <- drop.tip(scop.tre.c90d5, '>stramin')
# genTrees (RADami) builds a treeset around each best tree: all one-move NNI
# neighbors, then 150 trees each at 2 and 3 NNI moves, up to N = 200 total.
scop.nni.c85d2 <- genTrees(scop.tre.c85d2, maxmoves=3, N = 200, filebase = 'scop.nni.c85d2', perms = c(length(nni(scop.tre.c85d2)), 150, 150))
scop.nni.c85d5 <- genTrees(scop.tre.c85d5, maxmoves=3, N = 200, filebase = 'scop.nni.c85d5', perms = c(length(nni(scop.tre.c85d5)), 150, 150))
scop.nni.c90d5 <- genTrees(scop.tre.c90d5, maxmoves=3, N = 200, filebase = 'scop.nni.c90d5', perms = c(length(nni(scop.tre.c90d5)), 150, 150))
%%R
# then, generate datasets for each of your treesets:
# read.pyRAD() parses each pyRAD .loci output into a RADami locus object;
# gen.RAD.loci.datasets() then writes per-tree RAxML inputs (and shell
# scripts) for each NNI treeset, restricted to the shared taxon set.
scop.rad.c85d2 <- read.pyRAD('../R1.pyRAD/analysis_pyrad/_c85d2/outfiles/c85d2m4p3.loci')
scop.rad.c85d5 <- read.pyRAD('../R1.pyRAD/analysis_pyrad/_c85d5/outfiles/c85d5m4p3.loci')
scop.rad.c90d5 <- read.pyRAD('../R1.pyRAD/analysis_pyrad/_c90d5/outfiles/c90d5m4p3.loci')
# NOTE(review): all three calls take `taxa` from scop.nni.c85d2 — presumably
# the tip labels are identical across the three treesets; confirm.
gen.RAD.loci.datasets(scop.rad.c85d2, scop.nni.c85d2, fileBase = 'c85d2', cores = 10, taxa = scop.nni.c85d2[[1]]$tip.label)
gen.RAD.loci.datasets(scop.rad.c85d5, scop.nni.c85d5, fileBase = 'c85d5', cores = 10, taxa = scop.nni.c85d2[[1]]$tip.label)
gen.RAD.loci.datasets(scop.rad.c90d5, scop.nni.c90d5, fileBase = 'c90d5', cores = 10, taxa = scop.nni.c85d2[[1]]$tip.label)
%%R
# Each resulting dataset will have a shell script associated with it in the folder numbered '.0', and this should be executed
# outside of R to get the RAxML output needed for the next steps of analysis. You can close or open R as you see fit, but if
# you close it, save the workspace (it will make your life easier in the next steps, and I have assumed in the next lines that
# you are working in the still-open workspace, or loaded the saved workspace and re-attached RADami). Read in the likelihood files,
# ranking the trees by likelihood:
scop.matched.c85d5 <- match.lnL.to.trees('c85d5.0')
scop.matched.c85d2 <- match.lnL.to.trees('c85d2.0')
scop.matched.c90d5 <- match.lnL.to.trees('c90d5.0')
# Then, plot them:
plot(scop.matched.c85d2, fileprefix='c85d2', panels = c('bestMat', 'worstMat', 'bestMat'), ci = rep(0.95, 3), regression = c(F, F, T),
lnL.break = c(-1E10, -1E10, -374000), pch = 3)
plot(scop.matched.c90d5, fileprefix='c90d5', panels = c('bestMat', 'worstMat', 'bestMat'), ci = rep(0.95, 3), regression = c(F, F, T),
lnL.break = c(-1E10, -1E10, -253350), pch = 3)
plot(scop.matched.c85d5, fileprefix='c85d5', panels = c('bestMat', 'worstMat'), ci = rep(0.95, 2), regression = c(T, F),
lnL.break = c(-1E10, -1E10), pch = 3)
# and look at the outliers:
pdf('c90d5.trees.pdf', width = 8.5, height = 11)
layout(matrix(1:6, 3, 2))
# FIX: the original referenced `scop.ranks.c90d5`, which is never defined in
# this notebook; the bestMat matrix belongs to the match.lnL.to.trees()
# result `scop.matched.c90d5` created above. Tree 1 (the ML tree) is always
# plotted, plus any tree "best" for more than 59 loci.
for(i in c(1, which(colSums(scop.matched.c90d5$bestMat) > 59))) {
plot(scop.nni.c90d5[[i]])
title(main = paste('c90d5, tree',i))
}
dev.off()