In [1]:

import os
import gzip
import re
import pandas

In [2]:

# Download human entrez gene information
# `$url` is expanded by IPython from the Python variable above. With
# --timestamping, wget skips the transfer when the remote file is no newer
# than the local copy, so re-running this cell is cheap. The file is stored
# under download/ (--directory-prefix).
url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
! wget --timestamping --directory-prefix download/ $url

--2015-06-07 14:50:22--  ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
           => ‘download/.listing’
Resolving ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)... 130.14.250.7, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nih.gov (ftp.ncbi.nih.gov)|130.14.250.7|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /gene/DATA/GENE_INFO/Mammalia ... done.
==> PASV ... done.    ==> LIST ... done.

    [ <=>                                   ] 672         --.-K/s   in 0.01s   

2015-06-07 14:50:23 (56.9 KB/s) - ‘download/.listing’ saved [672]

Removed ‘download/.listing’.
Remote file no newer than local file ‘download/Homo_sapiens.gene_info.gz’ -- not retrieving.

In [3]:

# Read Entrez info dataset
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')

with gzip.open(path, 'rt') as read_file:
    # The first line is expected to look like
    # "#Format: <space-separated column names> (<free text>)".
    header = next(read_file)
    matches = re.match(r'#Format: (.+) \(', header)
    if matches is None:
        # Fail with an actionable message rather than an AttributeError on None
        raise ValueError(
            'Unrecognized header line in {}: {!r}'.format(path, header))
    columns = matches.group(1).split(' ')
    # '-' is the only missing-value marker requested here. Disable pandas'
    # built-in NA strings ("NA", "NULL", "nan", ...) so that a field whose
    # text is literally one of those is kept as a string instead of silently
    # becoming NaN and being dropped by the later notnull() checks.
    gene_df = pandas.read_table(
        read_file,
        names=columns,
        na_values=['-'],
        keep_default_na=False,
    )

# Number of gene records read
len(gene_df)

Out[3]:

In [4]:

# extract symbols and xrefs
# Builds two long-format tables keyed by GeneID:
#   symbol_rows: (GeneID, 'symbol' | 'synonym', text)
#   xref_rows:   (GeneID, resource, identifier)
xref_rows = list()
symbol_rows = list()

# Walk only the four columns that are needed. Zipping the columns yields the
# same values as iterrows() without building a Series object for every row.
for gene_id, symbol, synonyms, dbXrefs in zip(
        gene_df.GeneID, gene_df.Symbol, gene_df.Synonyms, gene_df.dbXrefs):

    # symbols
    if pandas.notnull(symbol):
        symbol_rows.append((gene_id, 'symbol', symbol))

    # synonyms: pipe-separated list in a single field
    if pandas.notnull(synonyms):
        for synonym in synonyms.split('|'):
            symbol_rows.append((gene_id, 'synonym', synonym))

    # xrefs: pipe-separated "resource:identifier" entries
    if pandas.notnull(dbXrefs):
        for xref in dbXrefs.split('|'):
            # Split on the first colon only, since identifiers may themselves
            # contain colons (e.g. "HGNC:HGNC:5" -> ('HGNC', 'HGNC:5')).
            db, ref = xref.split(':', 1)
            xref_rows.append((gene_id, db, ref))

# Make sure the output directory exists so the cell also runs on a fresh checkout
os.makedirs('data', exist_ok=True)

xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)

symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)

In [6]:

# Preview the first rows of the cross-reference table (GeneID, resource, identifier)
xref_df.head()

Out[6]:

	GeneID	resource	identifier
0	1	MIM	138670
1	1	HGNC	HGNC:5
2	1	Ensembl	ENSG00000121410
3	1	HPRD	00726
4	1	Vega	OTTHUMG00000183507

In [7]:

# Preview the first rows of the symbol/synonym table (GeneID, type, symbol)
symbol_df.head()

Out[7]:

	GeneID	type	symbol
0	1	symbol	A1BG
1	1	synonym	A1B
2	1	synonym	ABG
3	1	synonym	GAB
4	1	synonym	HYST2477

In [5]:

# save a select columnset
# NOTE: this rebinds `columns`, which the dataset-reading cell also assigns
# (there it holds the full list of column names parsed from the file header).
columns = ['tax_id', 'GeneID', 'Symbol', 'chromosome', 'map_location', 'type_of_gene', 'description']
select_df = gene_df[columns]

# Make sure the output directory exists so the cell also runs on a fresh checkout
os.makedirs('data', exist_ok=True)
select_df.to_csv('data/genes-human.tsv', sep='\t', index=False)
select_df.head()

Out[5]:

	tax_id	GeneID	Symbol	chromosome	map_location	type_of_gene	description
0	9606	1	A1BG	19	19q13.4	protein-coding	alpha-1-B glycoprotein
1	9606	2	A2M	12	12p13.31	protein-coding	alpha-2-macroglobulin
2	9606	3	A2MP1	12	12p13.31	pseudo	alpha-2-macroglobulin pseudogene 1
3	9606	9	NAT1	8	8p22	protein-coding	N-acetyltransferase 1 (arylamine N-acetyltrans...
4	9606	10	NAT2	8	8p22	protein-coding	N-acetyltransferase 2 (arylamine N-acetyltrans...