#!/usr/bin/env python # coding: utf-8 # # Live oaks introgression manuscript # ## Notebook 4: Population analyses # # #### D. Eaton, A. Hipp, A. Gonzalez-Rodriguez & J. Cavender-Bares # ##### contact: deren.eaton@yale.edu # # ----------------------- # In[110]: get_ipython().run_cell_magic('bash', '', 'mkdir -p analysis_treemix/\nmkdir -p analysis_structure/\n') # In[2]: import numpy as np import gzip from collections import OrderedDict, Counter import matplotlib.pyplot as plt import seaborn as sns get_ipython().run_line_magic('matplotlib', 'inline') # ------------------ # # ## Treemix analysis # # I run a custom python code below to convert the unlinked snps file from the pyrad output into a Treemix input file. This requires grouping sampled individuals into populations, and calling biallelic SNP frequencies for them. I grouped the four non-Virentes white oaks into the outgroup sample. Each ingroup taxon is its own population, except for _Q. fusiformis_ for which I split up the Texas and Mexico populations, for a total of 9 populations. # In[1]: ## group sample names into taxon names ## again I exclude the South Carolina sample of virginiana, which has less data. 
taxa = { 'virginiana' : ["LALC2","FLSF33","FLBA140"], 'minima' : ["FLCK216","FLSA185","FLMO62","FLSF47"], 'geminata' : ["FLCK18","FLWO6","FLSF54","FLAB109"], 'sagraeana' : ["CUSV6","CUVN10","CUCA4"], 'oleoides' : ["MXSA3017","BZBB1","HNDA09","CRL0001","CRL0030"], 'brandegeei' : ["BJSB3","BJSL25","BJVL19"], 'fusiformis.m' : ["MXED8","MXGT4"], 'fusiformis.t' : ["TXMD3","TXGR3"], 'outgroup' : ["EN","AR","DO","DU"], } ## used to require that all populations have data for at least ## one individual in each locus that we use minhits = [1]*9 # In[3]: ## resolve ambiguity characters def resolve(base): D = {'R':['G','A'], 'K':['G','T'], 'S':['G','C'], 'Y':['T','C'], 'W':['T','A'], 'M':['C','A'],} if base in D: return D[base] else: return [base,base] # In[4]: ## print groups and minhits for i,j in zip(taxa,minhits): print "\t ",i, taxa[i], 'minimum=',j ## read in data to sample names #infile = open("outfiles/c85d6m4p3.unlinked_snps",'r') infile = open("analysis_pyrad/outfiles/virentes_c85d6m4p5.unlinked_snps",'r') dat = infile.readlines() nsamp,nsnps = dat[0].strip().split(" ") nsamp = int(nsamp) nsnps = int(nsnps) NDATA = np.empty([int(nsamp),int(nsnps)],dtype='object') ## read SNP matrix into a numpy.array for line in range(len(dat[1:])): a,b = dat[1:][line].split() NDATA[line] = list(b) sites = np.transpose(NDATA) ## unpack ambiguity bases and find two most common alleles ## at every SNP site, save to a list alleles = [] for site in sites: ds = [] for s in site: if s in list("RKSYWM"): ds.append(resolve(s)[0]) ds.append(resolve(s)[1]) else: ds.append(s) ds.append(s) snp = [s for s in ds if s not in ["N",'-']] a = Counter(snp).most_common(3) alleles.append([a[0][0],a[1][0]]) ## create a dictionary mapping sample names to SNPs SNPS = OrderedDict() for line in dat[1:]: a,b = line.split() SNPS[a] = b ## create a dictionary with empty lists for each taxon FREQ = OrderedDict() for tax in taxa: FREQ[tax] = [] ## fill the FREQ dictionary with SNPs for all ## samples in that taxon 
## Select the SNP sites usable for Treemix: keep a site only if every
## population has non-missing data for at least `minhits` individuals.
keeps = []
for snp in range(int(nsnps)):
    GG = []
    for tax,mins in zip(taxa,minhits):
        GG.append( sum([SNPS[i][snp] not in ["N","-"] for i in taxa[tax]]) >= int(mins))
    if all(GG):
        keeps.append(snp)

## For every kept site, collect the two allele copies from each sample of
## each population into one string per population (missing data included;
## it is simply not counted when allele frequencies are tallied below).
for keep in keeps:
    for tax in FREQ:
        bunch = []
        for i in taxa[tax]:
            #print tax, i, SNPS[i][keep]
            bunch.append(resolve(SNPS[i][keep])[0])
            bunch.append(resolve(SNPS[i][keep])[1])
        FREQ[tax].append("".join(bunch))

## output files
outfile = gzip.open("analysis_treemix/treemix_SNPs.gz",'w')

## print header: population order comes from FREQ insertion order and is
## the same order used for the count columns below, so header and data agree
print >>outfile, " ".join(FREQ.keys())

## print data: one "countA,countB" pair per population per SNP
for i,j in enumerate(keeps):
    a1 = alleles[j][0]
    a2 = alleles[j][1]
    H = [str(FREQ[tax][i].count(a1))+","+str(FREQ[tax][i].count(a2)) for tax in FREQ]
    ## exclude non-biallelic SNPs (some population has neither allele)
    if "0,0" not in " ".join(H):
        ## exclude if not variable in subsampling
        ## FIX: the original comprehension reused the loop variable `i`
        ## here; Python-2 list comprehensions leak their variable into the
        ## enclosing scope and clobbered the enumerate counter (harmless
        ## only by accident) -- renamed to `hh`.
        if not all([hh.split(",")[1] == '0' for hh in H]):
            print >>outfile, " ".join(H)
outfile.close()

# In[5]:

## sanity check: how many SNP rows made it into the Treemix input
get_ipython().system(' zcat analysis_treemix/treemix_SNPs.gz | wc')

# In[9]:

## Run Treemix rooted on the outgroup with 0-7 migration edges (-m);
## -k 1 treats every SNP as unlinked (the input is the unlinked_snps set).
get_ipython().run_cell_magic('bash', '', 'treemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -o analysis_treemix/SNPs_m0 > analysis_treemix/logSm0\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 1 -o analysis_treemix/SNPs_m1 > analysis_treemix/logSm1\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 2 -o analysis_treemix/SNPs_m2 > analysis_treemix/logSm2\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 3 -o analysis_treemix/SNPs_m3 > analysis_treemix/logSm3\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 4 -o analysis_treemix/SNPs_m4 > analysis_treemix/logSm4\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 5 -o analysis_treemix/SNPs_m5 > analysis_treemix/logSm5\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 6 -o analysis_treemix/SNPs_m6 > analysis_treemix/logSm6\ntreemix -i analysis_treemix/treemix_SNPs.gz -root outgroup -k 1 -m 7 -o analysis_treemix/SNPs_m7 > analysis_treemix/logSm7\n')

# ## Plot the treemix results
#

# In[7]:

get_ipython().run_line_magic('load_ext', 'rmagic')
## this requires that you have the python module rpy2 installed

# In[8]:

## load Treemix's own R plotting functions
get_ipython().run_cell_magic('R', '', 'source("~/local/src/treemix/src/plotting_funcs.R")\n')

# In[12]:

## population order used by plot_resid for the residual matrix
get_ipython().run_cell_magic('bash', '', 'echo -e "virginiana\\nminima\\ngeminata\\nsagraeana\\noleoides\\nfusiformis.m\\nfusiformis.t\\nbrandegeei\\noutgroup" >analysis_treemix/poporder\n')

# In[25]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\n#svg("figures/treemix_m0.svg", width=8, height=4)\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m0", lwd=3)\nplot_resid("analysis_treemix/SNPs_m0","analysis_treemix/poporder")\n#dev.off()\n')

# In[27]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\n#svg("figures/treemix_m1.svg", width=8, height=4)\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m1", lwd=3)\nplot_resid("analysis_treemix/SNPs_m1","analysis_treemix/poporder")\n#dev.off()\n')

# In[29]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\n#svg("figures/treemix_m2.svg", width=8, height=4)\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m2", lwd=3)\nplot_resid("analysis_treemix/SNPs_m2","analysis_treemix/poporder")\n#dev.off()\n')

# In[19]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m3", lwd=2)\nplot_resid("analysis_treemix/SNPs_m3","analysis_treemix/poporder")\n')

# In[20]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m4", lwd=2)\nplot_resid("analysis_treemix/SNPs_m4","analysis_treemix/poporder")\n')

# In[21]:

get_ipython().run_cell_magic('R', '-h 400 -w 800', '## save figure\npar(mfrow=c(1,2));\nplot_tree("analysis_treemix/SNPs_m5", lwd=2)\nplot_resid("analysis_treemix/SNPs_m5","analysis_treemix/poporder")\n')

# --------------------
#
# ## _Structure_ analysis of RADseq SNPs
#
# Structure was run on the structure formatted ('k' option) SNP output from pyRAD in the file "virentes\_c85d6m20p5noutg.str". Clustering assignment was measured for K values between 2-8, and the optimal K value was inferred by Evanno's K, as described in the text. The optimal K value is 7.
# NOTE(review): the Harvester section at the end of this notebook reports K=3 as
# the highest Delta K -- confirm which value the manuscript text actually uses.
# Here we run 10 replicate analyses at each value of K, and estimate average cluster assignment values by permuting across replicates using _CLUMPP_, and finally plot the results as a stacked barplot.

# In[ ]:

## download fresh (default) params files (or you could get them elsewhere)
dl = get_ipython().getoutput(' wget www.dereneaton.com/downloads/struct.main -q -O analysis_structure/struct.main')
dl = get_ipython().getoutput(' wget www.dereneaton.com/downloads/struct.extras -q -O analysis_structure/struct.extras')

# In[ ]:

## set the file format to match pyRAD structure format (6 blank columns)
## meaning no populations, locations, names, or phenotypes are indicated.
stderr = get_ipython().getoutput(" sed -i '/LABEL /c\\#define LABEL 1 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/POPDATA /c\\#define POPDATA 0 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/POPFLAG /c\\#define POPFLAG 0 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/LOCDATA /c\\#define LOCDATA 0 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/PHENOTYPE /c\\#define PHENOTYPE 0 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/EXTRACOLS /c\\#define EXTRACOLS 0 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/BURNIN /c\\#define BURNIN 50000 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/NUMREPS /c\\#define NUMREPS 500000 //' analysis_structure/struct.main") stderr = get_ipython().getoutput(" sed -i '/RANDOMIZE /c\\#define RANDOMIZE 0 //' analysis_structure/struct.extras") stderr = get_ipython().getoutput(" sed -i '/LOCISPOP /c\\#define LOCISPOP 0 //' analysis_structure/struct.extras") stderr = get_ipython().getoutput(" sed -i '/USEPOPINFO /c\\#define USEPOPINFO 0 //' analysis_structure/struct.extras") # In[ ]: ## create a Parallel computing client ## can map a function across how ever many ## parallel engines are turned on in the notebook ## under the Clusters tab from IPython.parallel import Client PAR = Client()[:] LOADBALANCE = Client().load_balanced_view() # In[ ]: ## decorator adds parallel capability if run in IPython notebook @LOADBALANCE.parallel(block=True) def structure(arg): ff = arg[1] NUMINDS, NUMLOCI = (27,14011) cmd = """ -m analysis_structure/struct.main \ -e analysis_structure/struct.extras \ -K %d \ -N %s \ -L %s \ -i %s.str \ -D %s \ -o %s """ % (arg[0], NUMINDS, NUMLOCI, ff, str(arg[3]), arg[2]+".K"+str(arg[0])) stderr = get_ipython().getoutput(' structure $cmd') # In[ ]: args = [[K, 
"analysis_pyrad/outfiles/virentes_c85d6m20p5noutg", "analysis_structure/RADrep"+str(reps), np.random.randint(0,1e9,1)[0]] for K in range(2,9) \ for reps in range(1,11)] for i in args[:]: print i # In[ ]: with open("analysis_pyrad/outfiles/virentes_c85d6m20p5noutg.str",'r') as ifile: print len(ifile.readline().strip().split()), 'SNPs in structure data set' # In[ ]: ## run these jobs in parallel quiet = structure.map(args) # ### Use _CLUMPP_ to get means across permuted replicates # First I create concatenated indfiles by cutting out the relevant lines of the _STRUCTURE_ output and appending to a single file for all replicates with the same value of $K$. Then I create and edit a _CLUMPP_ params file, entering settings to perform permutations of all iterations using the greedy algorithm. # In[ ]: def run_CLUMPP(K): repfiles = glob.glob("analysis_structure/RADrep*.K"+str(K)+"_f") outfile = open("analysis_structure/RADr"+str(K)+".indfile",'w') for ff in repfiles: i = 1 for line in open(ff).readlines(): if ") : " in line: ll = line.strip().split() print >>outfile, " ".join([ll[0],ll[0],ll[2],ll[0].split('.')[0]])+" : "+" ".join(ll[4:]) i += 1 print >>outfile, "" outfile.close() ## get template CLUMPP params file, I host a template file at this download get_ipython().system(' wget -q www.dereneaton.com/downloads/clumpp.paramfile -O analysis_structure/clumpp.paramsfile') ## fill in the template: tell it the format to use individuals stderr = get_ipython().getoutput(" sed -i '/type of data /c\\DATATYPE 0 # individuals //' analysis_structure/clumpp.paramsfile") stderr = get_ipython().getoutput(" sed -i '/individuals or populations /c\\C 27 # N individuals //' analysis_structure/clumpp.paramsfile") stderr = get_ipython().getoutput(" sed -i '/Print the permuted /c\\PRINT_PERMUTED_DATA 0 # no print //' analysis_structure/clumpp.paramsfile") ## call CLUMPP NREPS = 10 NINDS = 27 args = "-i analysis_structure/RADr%s.indfile \ -o analysis_structure/RADr%s.outfile \ -j 
analysis_structure/RADr%s.miscfile \ -k %s -r %s -c %s " % (K,K,K,K,NREPS,NINDS) stderr = get_ipython().getoutput(' ~/local/src/CLUMPP_Linux64.1.1.2/CLUMPP analysis_structure/clumpp.paramsfile $args') # In[ ]: ## run CLUMPP over 10 iterations at each level of K for i in range(2,9): run_CLUMPP(i) # ## Make a stacked barplot # In[3]: def structplot(infile, K, colorder=[], barorder=[]): dat = open(infile).readlines() D = OrderedDict() for line in dat: ll = line.strip().split() vals = [float(i)*100 for i in ll[5:]] ## order of the bars if not barorder: D[ll[1]] = [vals[i] for i in range(K)] else: D[ll[1]] = [vals[i] for i in barorder] ## sort into the same order of individuals ## as in the microsatellite data set of Cavender-Bares sorteds = [11,16,21,25, 19,10,12,18, 17,14,15,13, 8,7,9, 24,4,20,5,6, 1,3,2, 22,23, 26,27] ## ordered color map cmap = sns.color_palette("Set3", 8) #cmap = sns.dark_palette("skyblue", 8, reverse=True) if not colorder: ncmap = [cmap[i] for i in range(K)] else: ncmap = [cmap[i] for i in colorder] fig, ax = plt.subplots(figsize=(10.,1.0)) ## normalize values to 100 for i in D: D[i] = [100*(j/sum(D[i])) for j in D[i]] ## make plot for col in range(len(D.values()[0])): if col > 0: ax.bar(range(len(D.keys())), [D[str(i)][col] for i in sorteds], bottom = [sum(D[str(i)][:col]) for i in sorteds], color=ncmap[col], width=0.95, edgecolor='') else: ax.bar(range(len(D.keys())), [D[str(i)][col] for i in sorteds], bottom = 0, color=ncmap[col], width=0.95, edgecolor="") ax.set_xticks([]) ax.set_yticks([]) ll = [0, 3.95, 7.95, 11.95, 14.95, 19.95, 22.95, 26.95] for l in ll: ax.axvline(x=l, linewidth=1.5, color="#262626") ax.axhline(y=0.05, linewidth=1.3, color="#262626") ax.axhline(y=100, linewidth=1.3, color="#262626") ax.set_xlim(-0.,27.) ax.set_ylim(-0.,100.) 
#ax.set_ylabel("Percent ancestry") ax.grid(b=0) plt.savefig("figures/Fig2.RADstruct_"+str(K)+".svg") plt.savefig("figures/Fig2.RADstruct_"+str(K)+".png", dpi=300) plt.show() # In[115]: structplot("analysis_structure/RADr2.outfile", 2, colorder=[0,3], barorder = [1,0]) # In[116]: structplot("analysis_structure/RADr3.outfile", 3, colorder=[0,3,1], barorder = [0,1,2]) # In[117]: structplot("analysis_structure/RADr4.outfile", 4, colorder=[0,3,1,5], barorder = [3,2,0,1]) # In[118]: structplot("analysis_structure/RADr5.outfile", 5, colorder=[0,2,3,5,1], barorder = [1,0,2,3,4]) # In[119]: structplot("analysis_structure/RADr6.outfile", 6, colorder=[0,4,2,5,3,1], barorder = [0,1,2,3,4,5]) # In[120]: structplot("analysis_structure/RADr7.outfile", 7, colorder=[0,5,6,3,2,4,1], barorder = [0,1,2,3,6,5,4]) # In[4]: structplot("analysis_structure/RADr8.outfile", 8, colorder=[0,6,3,5,2,7,4,1], barorder = [7,6,4,5,2,0,1,3]) # ### Calculating optimal K # The "_f" output files were zipped into the directory as analysis_structure/struct.zip which is in the `git` repository. This file was uploaded to the online resource [Structue Harvester](http://taylor0.biology.ucla.edu/structureHarvester/) which created the output files that are saved in the `git` repository in analysis_structure/Harvester.tar.gz. This analysis found K=3 as the optimal K value (highest Delta K) using the Evanno's K method, including the output table below. # In[ ]: ## K Reps Mean LnP(K) Stdev LnP(K) Ln'(K) |Ln''(K)| Delta K ## 2 10 -150180.520000 758.978748 — — — ## 3 10 -132621.090000 35.788218 17559.430000 26700.270000 746.063129 ## 4 10 -141761.930000 904.565336 -9140.840000 298432.520000 329.918148 ## 5 10 -449335.290000 240461.093921 -307573.360000 10843.300000 0.045094 ## 6 10 -767751.950000 1281470.488619 -318416.660000 2029640.650000 1.583837 ## 7 10 -3115809.260000 2635226.639916 -2348057.310000 344387.090000 0.130686 ## 8 10 -5808253.660000 3569473.653017 -2692444.400000 — —