#!/usr/bin/env python
# coding: utf-8

# # converting the ehrlink pdf data supplement to a tab separated text
#
# This notebook converts the [pdf of indications](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3422843/bin/amiajnl-2012-000852-s1.pdf) from:
#
# > McCoy et al. (2012) **Development and evaluation of a crowdsourcing methodology for knowledge base construction: identifying relationships between clinical problems and medications**. *Journal of the American Medical Informatics Association* [doi:10.1136/amiajnl-2012-000852](//dx.doi.org/10.1136/amiajnl-2012-000852)

# In[3]:

import csv
import subprocess

# Input pdf, the intermediate text dump produced by pdftotext, and the final tsv.
PDF_PATH = 'download/amiajnl-2012-000852-s1.pdf'
TXT_PATH = 'download/amiajnl-2012-000852-s1.txt'
TSV_PATH = 'download/amiajnl-2012-000852-s1.tsv'

# Number of data rows (header excluded) the converted table must contain.
EXPECTED_ROWS = 11166


# In[4]:

# convert pdf to text
# Run through subprocess rather than get_ipython().system so the script also
# works outside IPython; check=True stops here if the conversion fails
# instead of continuing with a missing or stale text file.
subprocess.run(['pdftotext', PDF_PATH, TXT_PATH], check=True)


# In[5]:

def generate_lines(path):
    """Yield the content lines of the pdftotext output at *path*.

    Leading form-feed characters (``\\x0c``) are stripped from each line.
    Lines that then start with ``file://`` and lines that contain only
    whitespace are skipped. Yielded lines keep their trailing newline, so
    the csv reader can still join quoted fields that span several lines.
    """
    # NOTE(review): pdftotext is expected to emit UTF-8 by default; confirm
    # if a non-default -enc option is ever passed to it.
    with open(path, encoding='utf-8') as read_file:
        for line in read_file:
            line = line.lstrip('\x0c')
            if line.startswith('file://') or not line.strip():
                continue
            yield line


# In[6]:

n_lines = 0
# newline='' lets the csv writer control row terminators itself, which
# avoids doubled carriage returns on platforms that translate '\n'.
with open(TSV_PATH, 'w', newline='', encoding='utf-8') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    line_gen = generate_lines(TXT_PATH)
    reader = csv.reader(line_gen, delimiter='|', quotechar='"', quoting=csv.QUOTE_ALL)
    for row in reader:
        # Quoted fields may span several source lines; flatten them so each
        # record occupies exactly one line of the tsv.
        row = [elem.replace('\n', ' ') for elem in row]
        writer.writerow(row)
        n_lines += 1

# Sanity check on the conversion: every record plus 1 for header.
assert n_lines == EXPECTED_ROWS + 1, (
    'expected {} rows including the header, got {}'.format(EXPECTED_ROWS + 1, n_lines)
)


# In[ ]:




# In[7]:

import pandas


# In[8]:

indication_df = pandas.read_table(TSV_PATH)
indication_df[:3]


# In[28]:

# Count the indications (problem rows) recorded for each medication.
medication_group = indication_df.groupby(['medication_definition_id', 'medication'], as_index=False)
medication_df = medication_group.agg({'problem': 'count'}).rename(columns={'problem': 'indications'})
# sort_values replaces DataFrame.sort, which was removed in pandas 0.20.
medication_df = medication_df.sort_values('indications', ascending=False)
medication_df.to_csv('data/medications.tsv', sep='\t', index=False)


# In[29]:

# Count the indications (medication rows) recorded for each problem.
problem_group = indication_df.groupby(['problem_definition_id', 'problem'], as_index=False)
problem_df = problem_group.agg({'medication': 'count'}).rename(columns={'medication': 'indications'})
problem_df = problem_df.sort_values('indications', ascending=False)
problem_df.to_csv('data/problems.tsv', sep='\t', index=False)


# In[ ]: