import os
import gzip
import re
import pandas
# Download human Entrez gene information
# Source: NCBI FTP, gzip-compressed gene_info dump for Homo sapiens.
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
# IPython shell escape (not plain Python); $url is interpolated from the variable above.
# --timestamping skips the transfer when the remote file is no newer than the local copy,
# and --directory-prefix stores the file under download/.
! wget --timestamping --directory-prefix download/ $url
--2015-06-07 14:50:22-- ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz => ‘download/.listing’ Resolving ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)... 130.14.250.7, 2607:f220:41e:250::10 Connecting to ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)|130.14.250.7|:21... connected. Logging in as anonymous ... Logged in! ==> SYST ... done. ==> PWD ... done. ==> TYPE I ... done. ==> CWD (1) /gene/DATA/GENE_INFO/Mammalia ... done. ==> PASV ... done. ==> LIST ... done. [ <=> ] 672 --.-K/s in 0.01s 2015-06-07 14:50:23 (56.9 KB/s) - ‘download/.listing’ saved [672] Removed ‘download/.listing’. Remote file no newer than local file ‘download/Homo_sapiens.gene_info.gz’ -- not retrieving.
# Read Entrez info dataset
#
# The first line of the file is a '#'-prefixed header naming the columns; it
# is consumed here so that pandas only sees data rows. Values of '-' are
# read as missing (NaN).
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')
with gzip.open(path, 'rt') as read_file:
    # Default of '' turns an empty file into the ValueError below instead of
    # a bare StopIteration.
    header = next(read_file, '')
    # Header layout this notebook was written for:
    #   '#Format: col1 col2 ... (explanatory note)'
    matches = re.match(r'#Format: (.+) \(', header)
    if matches:
        columns = matches.group(1).split(' ')
    elif header.startswith('#') and '\t' in header:
        # NOTE(review): newer NCBI dumps appear to use a plain tab-separated
        # '#tax_id<TAB>GeneID<TAB>...' header -- confirm against the README.
        columns = header[1:].rstrip('\r\n').split('\t')
    else:
        raise ValueError(
            'Unrecognized gene_info header in {}: {!r}'.format(path, header))
    # Must run while the file is still open: pandas continues from the
    # current position, i.e. the first data row.
    gene_df = pandas.read_table(read_file, names=columns, na_values=['-'])
len(gene_df)
56425
# Extract symbols and xrefs into long-format tables (one row per value)
#
# symbol_rows collects (GeneID, 'symbol' | 'synonym', value) tuples and
# xref_rows collects (GeneID, resource, identifier) tuples.
xref_rows = list()
symbol_rows = list()
# Iterate only the four columns that are read, in lockstep. This avoids
# building a full Series per row, which DataFrame.iterrows() does.
needed = (gene_df['GeneID'], gene_df['Symbol'],
          gene_df['Synonyms'], gene_df['dbXrefs'])
for gene_id, symbol, synonyms, db_xrefs in zip(*needed):
    # symbols
    if pandas.notnull(symbol):
        symbol_rows.append((gene_id, 'symbol', symbol))
    # synonyms: '|'-delimited list in a single field
    if pandas.notnull(synonyms):
        for synonym in synonyms.split('|'):
            symbol_rows.append((gene_id, 'synonym', synonym))
    # xrefs: '|'-delimited list of 'resource:identifier' entries
    if pandas.notnull(db_xrefs):
        for xref in db_xrefs.split('|'):
            # Split on the first colon only; identifiers may themselves
            # contain colons (e.g. 'HGNC:HGNC:5' -> 'HGNC', 'HGNC:5').
            db, ref = xref.split(':', 1)
            xref_rows.append((gene_id, db, ref))

# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)

xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)
symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)
# Preview the first rows of the xref table
xref_df.head()
GeneID | resource | identifier | |
---|---|---|---|
0 | 1 | MIM | 138670 |
1 | 1 | HGNC | HGNC:5 |
2 | 1 | Ensembl | ENSG00000121410 |
3 | 1 | HPRD | 00726 |
4 | 1 | Vega | OTTHUMG00000183507 |
# Preview the first rows of the symbol/synonym table
symbol_df.head()
GeneID | type | symbol | |
---|---|---|---|
0 | 1 | symbol | A1BG |
1 | 1 | synonym | A1B |
2 | 1 | synonym | ABG |
3 | 1 | synonym | GAB |
4 | 1 | synonym | HYST2477 |
# Save a selected subset of columns
#
# Writes the chosen gene_df columns to data/genes-human.tsv as tab-separated
# text without the index.
# NOTE: this rebinds `columns`, which the loading cell used for the full
# list of header names.
columns = ['tax_id', 'GeneID', 'Symbol', 'chromosome', 'map_location', 'type_of_gene', 'description']
select_df = gene_df[columns]
# to_csv does not create missing directories, so make sure data/ exists.
os.makedirs('data', exist_ok=True)
select_df.to_csv('data/genes-human.tsv', sep='\t', index=False)
select_df.head()
tax_id | GeneID | Symbol | chromosome | map_location | type_of_gene | description | |
---|---|---|---|---|---|---|---|
0 | 9606 | 1 | A1BG | 19 | 19q13.4 | protein-coding | alpha-1-B glycoprotein |
1 | 9606 | 2 | A2M | 12 | 12p13.31 | protein-coding | alpha-2-macroglobulin |
2 | 9606 | 3 | A2MP1 | 12 | 12p13.31 | pseudo | alpha-2-macroglobulin pseudogene 1 |
3 | 9606 | 9 | NAT1 | 8 | 8p22 | protein-coding | N-acetyltransferase 1 (arylamine N-acetyltrans... |
4 | 9606 | 10 | NAT2 | 8 | 8p22 | protein-coding | N-acetyltransferase 2 (arylamine N-acetyltrans... |