#!/usr/bin/env python
# coding: utf-8

# ## Mechanisms notebook
#
# Exploratory script exported from a notebook: it queries PubMed, extracts
# the "mechanism" sentences, prints a few summary statistics about them and
# finally runs the tweeting entry point.

# In[25]:

import numpy as np
import pdb
from Mechanisms_bot.src import pubmed_io
import Mechanisms_bot.tweet_mechanism as tweet_mechanism
from Mechanisms_bot.src import abstract_parser
import importlib
import pickle
import re


# ### Query PubMed for sentences

# Get PMIDs for sentences containing mechanisms and lack of clarity.

# In[3]:

pubmed_ids = pubmed_io.get_pubmed_ids(99000)
len(pubmed_ids)


# Get the abstracts for those articles.

# In[4]:

# Reload first so that edits made to pubmed_io during the session are used.
importlib.reload(pubmed_io)
pubmed_abstracts = pubmed_io.get_pubmed_abstracts(pubmed_ids)
len(pubmed_abstracts)


# ### All of the above in one function

# In[ ]:

tweet_mechanism.get_recent_mech_sentences(90000)


# PubMed returns XML. It is structured as
# PubmedArticleList / PubmedArticle / MedlineCitation.
# MedlineCitation has children: PMID and Article.
#
# Article has children: Journal, ArticleTitle and Abstract.
#
# Abstract has child: AbstractText.

# Make sure that the mechanism and the lack of clarity are in the same
# sentence.

# In[5]:

# Abstracts and PMIDs are paired positionally; pairing stops at the shorter
# of the two sequences.
parsed_sentences = [
    abstract_parser.get_mech_sent(abstract, pmid)
    for abstract, pmid in zip(pubmed_abstracts, pubmed_ids)
]
# Keep only the entries whose 'mech_sent' value is truthy.
good_sent = [entry for entry in parsed_sentences if entry['mech_sent']]
len(good_sent)


# Save and load, as necessary.

# In[8]:

with open('good_sent.pickle', 'wb') as pickle_file:
    pickle.dump(good_sent, pickle_file)


# ### Quick stats on sentences

# In[2]:

with open('good_sent.pickle', 'rb') as pickle_file:
    good_sent = pickle.load(pickle_file)


# In[54]:

unknown = {'understood', 'unclear', 'unknown'}
# Take the last space-separated token of each sentence and drop its final
# character (presumably the closing punctuation) before the lookup.
last_words = (entry['mech_sent'].split(' ')[-1][:-1] for entry in good_sent)
end_unknown = sum(word in unknown for word in last_words)
print(
    'Number of sentences ending with variant of "unknown": {}'.format(
        end_unknown
    )
)


# In[55]:

# Case-insensitive substring count: the sentence is lowercased before the
# literal pattern is searched for.
however = sum(
    1
    for entry in good_sent
    if re.search('however', entry['mech_sent'].lower())
)
print('Number of sentences containing "however": {}'.format(however))


# In[56]:

although = sum(
    1
    for entry in good_sent
    if re.search('although', entry['mech_sent'].lower())
)
print('Number of sentences containing "although": {}'.format(although))


# In[57]:

while_count = sum(
    1
    for entry in good_sent
    if re.search('while', entry['mech_sent'].lower())
)
print('Number of sentences containing "while": {}'.format(while_count))


# In[61]:

sent_length = [len(entry['mech_sent']) for entry in good_sent]
sent_min = good_sent[np.argmin(sent_length)]
print(
    'Shortest sentence: {0}, PMID: {1}'.format(
        sent_min['mech_sent'], sent_min['PMID']
    )
)


# In[63]:

# These are long sentences due to punctuation; print where they sit in
# good_sent.
long_sentence_pmids = {'26526306', '25454993', '26351365', '26648182'}
for i, cur_sent in enumerate(good_sent):
    if cur_sent['PMID'] in long_sentence_pmids:
        print(i)


# A fairly long sentence:

# In[47]:

# Only the first 1255 entries are considered; an index into that prefix is
# also a valid index into good_sent itself.
sent_length = [len(entry['mech_sent']) for entry in good_sent[:1255]]
sent_max = good_sent[np.argmax(sent_length)]
sent_max


# A sample of mechanism sentences:

# In[18]:

good_sent[:100]


# # Tweeting!

# In[10]:

# Reload first so that edits made to tweet_mechanism during the session are
# used.
importlib.reload(tweet_mechanism)
new_sent, tweeted_sentences = tweet_mechanism.main()


# In[ ]: