#!/usr/bin/env python
# coding: utf-8

# In[1]:


import csv
import gzip
import io
import re

import pandas
import requests


# In[ ]:


# In[53]:


# Source of the matrix: http://csb.pitt.edu/erc_analysis/Methods.php
# Peek at the raw file: print the first eight tab-separated fields of each of
# the first ten lines.
with gzip.open('download/erc_mam33.gz', 'rt') as text:
    for i, line in enumerate(text):
        row = line.strip('\n').split('\t')
        print(row[:8])
        # Lines are numbered from 0, so index 9 is the tenth and last one shown.
        if i >= 9:
            break


# In[ ]:


# In[26]:


# 17487 lines in erc_mam33.gz
# NOTE(review): `header` was not assigned anywhere in this file, so this cell
# raised NameError when run top to bottom. It is taken here from the first row
# of the matrix, which is presumably what the cell inspected -- TODO confirm.
with gzip.open('download/erc_mam33.gz', 'rt', newline='') as read_file:
    header = next(csv.reader(read_file, delimiter='\t'))
len(header)


# In[2]:


def read_erc_mat(path):
    """Yield (source, target, correlation) triples from a gzipped matrix file.

    The file at ``path`` is read as gzip-compressed, tab-delimited text. Its
    first row supplies the gene identifiers. Every following row is paired, in
    order, with the identifiers starting from the second one (the source), and
    the fields of that row are paired positionally with the identifiers
    starting from the first one (the targets). Empty fields are skipped.

    Args:
        path: Location of the gzip-compressed matrix file.

    Yields:
        Tuples ``(source, target, correlation)`` of strings; the correlation
        is passed through exactly as it appears in the file.
    """
    # The context manager closes the file even when the consumer stops
    # iterating early or an error interrupts the generator.
    # newline='' is the stream mode the csv module asks for.
    with gzip.open(path, 'rt', newline='') as read_file:
        reader = csv.reader(read_file, delimiter='\t')
        genes = next(reader, None)
        if genes is None:
            # Empty file: nothing to yield (avoids StopIteration escaping
            # from inside a generator, which Python turns into RuntimeError).
            return
        for source, correlations in zip(genes[1:], reader):
            targets = genes[:len(correlations)]
            for target, correlation in zip(targets, correlations):
                if correlation == '':
                    continue
                yield source, target, correlation


# In[ ]:


# In[3]:


def read_ucsc_map(path):
    """Load a UCSC-to-Entrez mapping table into a dictionary.

    The file at ``path`` is read as a headerless, tab-delimited table with the
    columns ``ucsc_id``, ``symbol`` and ``entrez_id``; lines starting with
    ``#`` are ignored. Rows whose ``entrez_id`` is the literal ``n/a`` are
    dropped.

    Args:
        path: Location of the tab-delimited mapping file.

    Returns:
        dict mapping each UCSC ID (str) to its Entrez ID (str).

    Raises:
        ValueError: If a UCSC ID occurs on more than one retained row.
    """
    map_df = pandas.read_table(
        path,
        names=['ucsc_id', 'symbol', 'entrez_id'],
        comment='#',
        # Keep every field as the text found in the file. pandas' default
        # missing-value markers include 'n/a', which would otherwise be turned
        # into NaN before the filter below sees it, leaving NaN entries (and
        # float-typed IDs) in the returned mapping.
        dtype=str,
        keep_default_na=False,
    )
    map_df = map_df[map_df.entrez_id != 'n/a']
    ucsc_to_entrez = {}
    for ucsc_id, entrez_id in zip(map_df.ucsc_id, map_df.entrez_id):
        # Explicit raise instead of assert so the check survives `python -O`.
        if ucsc_id in ucsc_to_entrez:
            raise ValueError('duplicate UCSC ID in {}: {}'.format(path, ucsc_id))
        ucsc_to_entrez[ucsc_id] = entrez_id
    return ucsc_to_entrez


# In[11]:


# Captures the digits that follow "list_uids=" in the "Entrez Gene" link
# markup of the page fetched by query_ucsc_gene.
pattern = re.compile('list_uids=([0-9]+)" TARGET=_blank class="toc">Entrez Gene')
import sys

def query_ucsc_gene(ucsc_id, timeout=60):
    """Look up the Entrez Gene ID for a UCSC gene ID via the UCSC website.

    Fetches the hgGene page for ``ucsc_id`` and searches its text for the
    module-level ``pattern``. Progress messages are printed to stdout.

    Args:
        ucsc_id: UCSC gene identifier inserted into the query URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the run forever.

    Returns:
        The Entrez Gene ID as an int, or None when the page text does not
        match ``pattern``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    print('webquery for', ucsc_id)
    # Flush so the message is visible before the (possibly slow) request.
    sys.stdout.flush()
    url = 'https://genome.ucsc.edu/cgi-bin/hgGene?hgg_gene={}&org=human'.format(ucsc_id)
    response = requests.get(url, timeout=timeout)
    match = pattern.search(response.text)
    if match is None:
        return None
    entrez_id = int(match.group(1))
    print('webquery matched {} to {}'.format(ucsc_id, entrez_id))
    return entrez_id


# In[ ]:


# In[14]:


# One UCSC -> Entrez mapping per genome build, in the order they are consulted.
map_list = []
for genome_build in (38, 19, 18):
    path = 'download/ucsc-gene-map-hg{}.tsv'.format(genome_build)
    map_list.append(read_ucsc_map(path))

# Remembers every answer already produced by get_entrez, including None.
cache = {}
def get_entrez(ucsc_id):
    """Return the Entrez ID for ``ucsc_id``, or None when none is found.

    Answers are served from ``cache`` when available. Otherwise the mappings
    in ``map_list`` are tried in order and the first truthy hit wins; if none
    of them has one, ``query_ucsc_gene`` is asked. The outcome is stored in
    ``cache`` before it is returned.
    """
    try:
        return cache[ucsc_id]
    except KeyError:
        pass
    for lookup in map_list:
        found = lookup.get(ucsc_id)
        if found:
            break
    else:
        # No local mapping had a usable entry: fall back to the web query.
        found = query_ucsc_gene(ucsc_id)
    cache[ucsc_id] = found
    return found


# In[15]:


# Write the matrix in long format, one row per (source, target) pair, with the
# Entrez ID of each gene next to its UCSC ID.
# The with-block flushes and closes the output even if a lookup or write
# fails, so no per-line flushing is needed. newline='' is the stream mode the
# csv module asks for, letting the writer control line endings itself.
with gzip.open('data/erc_mam33.tsv.gz', 'wt', newline='') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(['source_ucsc', 'source_entrez', 'target_ucsc', 'target_entrez', 'correlation'])
    erc_gen = read_erc_mat('download/erc_mam33.gz')
    for source_ucsc, target_ucsc, correlation in erc_gen:
        source_entrez = get_entrez(source_ucsc)
        target_entrez = get_entrez(target_ucsc)
        row = source_ucsc, source_entrez, target_ucsc, target_entrez, correlation
        writer.writerow(row)


# In[ ]: