#!/usr/bin/env python
# coding: utf-8

"""Download the human Entrez Gene ``gene_info`` file and export TSV tables.

Running this file as a script writes three files:

* ``data/xrefs-human.tsv``   -- one row per (GeneID, resource, identifier)
* ``data/symbols-human.tsv`` -- one row per (GeneID, type, symbol), where
  type is ``symbol`` or ``synonym``
* ``data/genes-human.tsv``   -- a selected subset of the gene_info columns

The download step shells out to ``wget``, which must be on ``PATH``.
"""

# In[1]:

import gzip
import os
import re
import subprocess

import pandas


def parse_column_names(header_line):
    """Return the list of column names encoded in the first line of the file.

    Two layouts are accepted:

    * ``#Format: col1 col2 ... (free-text note)`` -- the layout the original
      regular expression was written for; names are separated by single
      spaces and the parenthesised note is discarded.
    * ``#col1<TAB>col2...`` -- a bare commented header row.
      NOTE(review): newer NCBI releases appear to use this layout; confirm
      against the gene_info README.

    Raises:
        ValueError: if the line matches neither layout.
    """
    match = re.match(r'#Format: (.+) \(', header_line)
    if match:
        return match.group(1).split(' ')
    if header_line.startswith('#'):
        names = header_line[1:].split()
        if names:
            return names
    raise ValueError(
        'Unrecognized gene_info header line: {!r}'.format(header_line))


def extract_symbol_and_xref_rows(gene_df):
    """Flatten the pipe-delimited symbol and cross-reference columns.

    Args:
        gene_df: DataFrame with ``GeneID``, ``Symbol``, ``Synonyms`` and
            ``dbXrefs`` columns; null cells are skipped.

    Returns:
        A ``(symbol_rows, xref_rows)`` pair of lists of tuples.
        ``symbol_rows`` holds ``(gene_id, 'symbol' | 'synonym', name)`` and
        ``xref_rows`` holds ``(gene_id, resource, identifier)``, both in the
        row order of ``gene_df``.

    Raises:
        ValueError: if a cross-reference entry contains no ``:`` separator.
    """
    symbol_rows = []
    xref_rows = []
    # Iterate over just the four needed columns in lockstep; this avoids
    # building a Series object for every row as DataFrame.iterrows() does.
    needed = (gene_df[name] for name in
              ('GeneID', 'Symbol', 'Synonyms', 'dbXrefs'))
    for gene_id, symbol, synonyms, db_xrefs in zip(*needed):
        if pandas.notnull(symbol):
            symbol_rows.append((gene_id, 'symbol', symbol))
        if pandas.notnull(synonyms):
            symbol_rows.extend(
                (gene_id, 'synonym', synonym)
                for synonym in synonyms.split('|'))
        if pandas.notnull(db_xrefs):
            for xref in db_xrefs.split('|'):
                # Split on the first colon only, so an identifier that
                # itself contains colons is kept intact.
                resource, identifier = xref.split(':', 1)
                xref_rows.append((gene_id, resource, identifier))
    return symbol_rows, xref_rows


if __name__ == '__main__':

    # In[2]:

    # Download human entrez gene information
    url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
    path = os.path.join('download', 'Homo_sapiens.gene_info.gz')
    # An argument list (no shell) works under plain Python as well as IPython.
    status = subprocess.call(
        ['wget', '--timestamping', '--directory-prefix', 'download/', url])
    # A failed download is tolerated when an earlier copy is on disk;
    # without one there is nothing to parse, so fail with a clear message.
    if status != 0 and not os.path.exists(path):
        raise RuntimeError(
            'wget exited with status {} and no local copy exists at {}'
            .format(status, path))

    # In[3]:

    # Read Entrez info dataset
    with gzip.open(path, 'rt') as read_file:
        # The first line carries the column names; the rest is the table.
        columns = parse_column_names(next(read_file))
        gene_df = pandas.read_table(read_file, names=columns, na_values=['-'])
    len(gene_df)

    # In[4]:

    # extract symbols and xrefs
    symbol_rows, xref_rows = extract_symbol_and_xref_rows(gene_df)

    # to_csv does not create missing directories.
    os.makedirs('data', exist_ok=True)

    xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
    xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)

    symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
    symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)

    # In[6]:

    xref_df.head()

    # In[7]:

    symbol_df.head()

    # In[5]:

    # save a select columnset
    columns = ['tax_id', 'GeneID', 'Symbol', 'chromosome', 'map_location', 'type_of_gene', 'description']
    select_df = gene_df[columns]
    select_df.to_csv('data/genes-human.tsv', sep='\t', index=False)
    select_df.head()