#!/usr/bin/env python
# coding: utf-8

# # Converting the ehrlink PDF data supplement to tab-separated text
# 
# This notebook converts the [pdf of indications](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3422843/bin/amiajnl-2012-000852-s1.pdf) from:
# 
# > McCoy et al. (2012) **Development and evaluation of a crowdsourcing methodology for knowledge base construction: identifying relationships between clinical problems and medications**. *Journal of the American Medical Informatics Association* [doi:10.1136/amiajnl-2012-000852](//dx.doi.org/10.1136/amiajnl-2012-000852)

# In[3]:


import csv


# In[4]:


# Convert the PDF supplement to plain text with the external `pdftotext`
# command; the parsing step further down reads the resulting
# download/amiajnl-2012-000852-s1.txt.
#
# subprocess is used rather than IPython's get_ipython().system() so the
# script also runs under a plain Python interpreter, and check_call raises
# CalledProcessError when the conversion fails instead of carrying on with a
# missing or stale text file.
import subprocess

subprocess.check_call(['pdftotext', 'download/amiajnl-2012-000852-s1.pdf'])


# In[5]:


def generate_lines(path):
    """Yield the content-bearing lines of the text file at *path*.

    Leading form-feed characters (``\\x0c``) are removed from every line
    (presumably page-break markers left by the PDF conversion -- confirm).
    Lines that then start with ``file://`` and lines containing only
    whitespace are dropped. Surviving lines are yielded with their trailing
    newline intact.
    """
    with open(path) as handle:
        for raw in handle:
            text = raw.lstrip('\x0c')
            # Keep only non-blank lines that are not 'file://' lines.
            if text.strip() and not text.startswith('file://'):
                yield text


# In[6]:


# Re-parse the pipe-delimited, double-quoted text extracted from the PDF and
# write it back out as a tab-separated file, counting the records written.
n_lines = 0
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' on write would turn the
# writer's '\r\n' row terminator into '\r\r\n'.
with open('download/amiajnl-2012-000852-s1.tsv', 'w', newline='') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    line_gen = generate_lines('download/amiajnl-2012-000852-s1.txt')
    reader = csv.reader(line_gen, delimiter='|', quotechar='"', quoting=csv.QUOTE_ALL)
    for row in reader:
        # A parsed field can contain embedded newlines (a quoted value that
        # spans several input lines); flatten them to spaces.
        row = [elem.replace('\n', ' ') for elem in row]
        writer.writerow(row)
        n_lines += 1

# Sanity check on the number of records parsed from the supplement.
assert n_lines == 11166 + 1 # 1 for header


# In[ ]:


# In[7]:


import pandas


# In[8]:


# Load the converted tab-separated file into a DataFrame and preview the
# first three rows (the bare expression only displays in a notebook).
indication_df = pandas.read_csv('download/amiajnl-2012-000852-s1.tsv', sep='\t')
indication_df.head(3)


# In[28]:


# For every (medication_definition_id, medication) pair, count the non-missing
# 'problem' entries, store that count as 'indications', and save the table
# ordered from the highest count to the lowest.
medication_group = indication_df.groupby(['medication_definition_id', 'medication'], as_index=False)
medication_df = medication_group.agg({'problem': 'count'}).rename(columns={'problem': 'indications'})
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
medication_df = medication_df.sort_values('indications', ascending=False)
medication_df.to_csv('data/medications.tsv', sep='\t', index=False)


# In[29]:


# For every (problem_definition_id, problem) pair, count the non-missing
# 'medication' entries, store that count as 'indications', and save the table
# ordered from the highest count to the lowest.
problem_group = indication_df.groupby(['problem_definition_id', 'problem'], as_index=False)
# Kept in its own variable so the per-medication table is not overwritten
# with per-problem data.
problem_df = problem_group.agg({'medication': 'count'}).rename(columns={'medication': 'indications'})
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
problem_df = problem_df.sort_values('indications', ascending=False)
problem_df.to_csv('data/problems.tsv', sep='\t', index=False)


# In[ ]: