#!/usr/bin/env python
# coding: utf-8

# In[1]:

import csv
import gzip
import io
import re
import sys

import pandas
import requests


# In[ ]:


# In[53]:

# http://csb.pitt.edu/erc_analysis/Methods.php
# Preview the raw matrix: print the first 8 fields of the first 10 lines.
with gzip.open('download/erc_mam33.gz', 'rb') as read_file:
    text = io.TextIOWrapper(read_file)
    for i, line in enumerate(text):
        row = line.strip('\n').split('\t')
        print(row[:8])
        if i > 8:
            break


# In[ ]:


# In[26]:

# 17487 lines in erc_mam33.gz
# NOTE: this cell originally evaluated `len(header)`, but `header` is never
# defined in this file, so running the export as a script raised NameError
# before any output was written. The statement is omitted for that reason.


# In[2]:

def read_erc_mat(path):
    """
    Yield (source, target, correlation) triples from a gzipped TSV matrix.

    The first row of the file is read as the list of gene identifiers. Each
    following row is paired, in order, with the identifiers starting from the
    second one (the "source"); the k values on that row are paired with the
    first k identifiers (the "targets"). Presumably this corresponds to a
    lower-triangular matrix without its diagonal -- verify against the data.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed, tab-delimited matrix file.

    Yields
    ------
    tuple of (str, str, str)
        Source identifier, target identifier and the correlation exactly as
        it appears in the file. Empty cells are skipped.
    """
    # The `with` block guarantees the file is closed even when the consumer
    # stops iterating early or an exception propagates through the generator.
    with gzip.open(path, 'rb') as read_file:
        text = io.TextIOWrapper(read_file)
        reader = csv.reader(text, delimiter='\t')
        genes = next(reader, None)
        if genes is None:
            # Empty file: nothing to yield. A bare next() here would surface
            # as RuntimeError because StopIteration cannot leave a generator.
            return
        for source, correlations in zip(genes[1:], reader):
            targets = genes[:len(correlations)]
            for target, correlation in zip(targets, correlations):
                if correlation == '':
                    continue
                yield source, target, correlation


# In[ ]:


# In[3]:

def read_ucsc_map(path):
    """
    Read a UCSC gene mapping table and return a dict of ucsc_id -> entrez_id.

    The file is parsed with `pandas.read_table` as three columns named
    `ucsc_id`, `symbol` and `entrez_id`; lines starting with '#' are treated
    as comments. Rows whose `entrez_id` equals the string 'n/a' are dropped.

    Raises
    ------
    AssertionError
        If the same `ucsc_id` occurs more than once among the kept rows.
    """
    map_df = pandas.read_table(
        path,
        names=['ucsc_id', 'symbol', 'entrez_id'],
        comment='#',
    )
    map_df = map_df.query("entrez_id != 'n/a'")
    ucsc_to_entrez = dict()
    for ucsc_id, entrez_id in zip(map_df.ucsc_id, map_df.entrez_id):
        # Each UCSC identifier is expected to map to exactly one Entrez gene.
        assert ucsc_id not in ucsc_to_entrez
        ucsc_to_entrez[ucsc_id] = entrez_id
    return ucsc_to_entrez


# In[11]:

# Captures the numeric Entrez Gene id from the link on a UCSC hgGene page.
pattern = re.compile('list_uids=([0-9]+)" TARGET=_blank class="toc">Entrez Gene')


def query_ucsc_gene(ucsc_id):
    """
    Look up the Entrez Gene id for `ucsc_id` on the UCSC Genome Browser site.

    Fetches the hgGene page for the identifier and searches the returned HTML
    with `pattern`.

    Returns
    -------
    int or None
        The Entrez Gene id when the page contains a matching link, otherwise
        None.
    """
    print('webquery for', ucsc_id)
    # Flush so progress is visible immediately while the request is pending.
    sys.stdout.flush()
    url = 'https://genome.ucsc.edu/cgi-bin/hgGene?hgg_gene={}&org=human'.format(ucsc_id)
    response = requests.get(url)
    match = pattern.search(response.text)
    if not match:
        return None
    entrez_id = int(match.group(1))
    print('webquery matched {} to {}'.format(ucsc_id, entrez_id))
    return entrez_id


# In[ ]:


# In[14]:

# Mapping tables in lookup order: hg38 first, then hg19, then hg18.
map_list = list()
for genome_build in [38, 19, 18]:
    path = 'download/ucsc-gene-map-hg{}.tsv'.format(genome_build)
    map_list.append(read_ucsc_map(path))

# Memoizes every lookup result, including None for identifiers that could
# not be resolved, so each identifier triggers at most one web query.
cache = dict()


def get_entrez(ucsc_id):
    """
    Return the Entrez Gene id for `ucsc_id`, or None if it cannot be found.

    Results are taken from `cache` when available. Otherwise the tables in
    `map_list` are consulted in order and the first truthy hit wins; if none
    of them has the identifier, `query_ucsc_gene` is used as a fallback.
    Whatever is found (including None) is stored in `cache`.
    """
    if ucsc_id in cache:
        return cache[ucsc_id]
    for ucsc_to_entrez in map_list:
        entrez_id = ucsc_to_entrez.get(ucsc_id)
        if entrez_id:
            cache[ucsc_id] = entrez_id
            return entrez_id
    entrez_id = query_ucsc_gene(ucsc_id)
    cache[ucsc_id] = entrez_id
    return entrez_id


# In[15]:

# Write one row per gene pair with both UCSC and Entrez identifiers.
# Context managers close (and thereby flush) the text wrapper and the gzip
# stream even if a lookup raises part-way through. newline='' leaves the csv
# writer's own line terminator untouched on every platform.
with gzip.open('data/erc_mam33.tsv.gz', 'wb') as write_file, \
        io.TextIOWrapper(write_file, line_buffering=True, newline='') as text:
    writer = csv.writer(text, delimiter='\t')
    writer.writerow(['source_ucsc', 'source_entrez', 'target_ucsc', 'target_entrez', 'correlation'])

    erc_gen = read_erc_mat('download/erc_mam33.gz')
    for source_ucsc, target_ucsc, correlation in erc_gen:
        source_entrez = get_entrez(source_ucsc)
        target_entrez = get_entrez(target_ucsc)
        row = source_ucsc, source_entrez, target_ucsc, target_entrez, correlation
        writer.writerow(row)


# In[ ]: