This notebook converts the PDF of indications from the following study into tab-separated files:
McCoy et al. (2012) Development and evaluation of a crowdsourcing methodology for knowledge base construction: identifying relationships between clinical problems and medications. Journal of the American Medical Informatics Association. doi:10.1136/amiajnl-2012-000852
import csv
# Convert the PDF to plain text with the pdftotext command-line tool
# (IPython shell escape). No output path is given; the next step reads
# download/amiajnl-2012-000852-s1.txt, i.e. the text file next to the PDF.
!pdftotext download/amiajnl-2012-000852-s1.pdf
def generate_lines(path):
    """Yield the content lines of the text file at `path`.

    Leading form-feed characters ('\\x0c') are stripped from every line.
    Lines that then start with 'file://' and lines containing only
    whitespace are dropped. Surviving lines keep their trailing newline.
    """
    with open(path) as text_file:
        # Strip form feeds first so the filters below see the real line start.
        cleaned_lines = (raw_line.lstrip('\x0c') for raw_line in text_file)
        for text in cleaned_lines:
            keep = bool(text.strip()) and not text.startswith('file://')
            if keep:
                yield text
# Re-parse the pipe-delimited, double-quoted text extracted from the PDF and
# save it as a tab-separated file, counting the rows written along the way.
n_lines = 0
# newline='' is what the csv module asks of any file handed to csv.writer;
# without it, platforms that translate '\n' emit '\r\r\n' row endings.
with open('download/amiajnl-2012-000852-s1.tsv', 'w', newline='') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    line_gen = generate_lines('download/amiajnl-2012-000852-s1.txt')
    reader = csv.reader(line_gen, delimiter='|', quotechar='"', quoting=csv.QUOTE_ALL)
    for row in reader:
        # A quoted field may span several input lines; flatten it so each
        # record occupies exactly one line of the TSV.
        row = [elem.replace('\n', ' ') for elem in row]
        writer.writerow(row)
        n_lines += 1
# Sanity check on the expected number of rows written.
assert n_lines == 11166 + 1 # 1 for header
import pandas
# Load the tab-separated indications into a DataFrame and preview the
# first three rows.
indication_df = pandas.read_table('download/amiajnl-2012-000852-s1.tsv')
indication_df.head(3)
| | medication_definition_id | medication | problem_definition_id | problem | patient_link_frequency | link_ratio |
|---|---|---|---|---|---|---|
| 0 | 70 | Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne... | 64181 | Asthma | 6 | 0.600000 |
| 1 | 70 | Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne... | 64205 | Chronic Obstructive Pulmonary Disease | 3 | 0.428571 |
| 2 | 72 | Albuterol Sulfate 2 MG/5ML Oral Syrup | 77891 | Acute Upper Respiratory Infection | 2 | 0.666667 |
# For each medication (id + name), count its rows with a non-missing
# problem, and save the medications ordered from most to fewest indications.
medication_group = indication_df.groupby(['medication_definition_id', 'medication'], as_index=False)
medication_df = medication_group.agg({'problem': 'count'}).rename(columns={'problem': 'indications'})
# DataFrame.sort was removed from pandas; sort_values is its replacement.
medication_df = medication_df.sort_values('indications', ascending=False)
medication_df.to_csv('data/medications.tsv', sep='\t', index=False)
# For each problem (id + name), count its rows with a non-missing
# medication, and save the problems ordered from most to fewest indications.
problem_group = indication_df.groupby(['problem_definition_id', 'problem'], as_index=False)
# NOTE(review): the result is stored in `medication_df` although it holds
# problems. The name is kept so any later cell relying on this binding
# still works; consider renaming it to `problem_df`.
medication_df = problem_group.agg({'medication': 'count'}).rename(columns={'medication': 'indications'})
# DataFrame.sort was removed from pandas; sort_values is its replacement.
medication_df = medication_df.sort_values('indications', ascending=False)
medication_df.to_csv('data/problems.tsv', sep='\t', index=False)