#!/usr/bin/env python
# coding: utf-8

# ## Mechanisms notebook

# In[25]:


import numpy as np
import pdb
from Mechanisms_bot.src import pubmed_io
import Mechanisms_bot.tweet_mechanism as tweet_mechanism
from Mechanisms_bot.src import abstract_parser 
import importlib
import pickle
import re


# ### Query Pubmed for sentences
# Get PMIDs for sentences containing mechanisms and lack of clarity

# In[3]:


# Ask the project's pubmed_io helper for PubMed IDs.
# NOTE(review): 99000 is presumably an upper bound on the number of IDs
# requested -- confirm against pubmed_io.get_pubmed_ids.
pubmed_ids = pubmed_io.get_pubmed_ids(99000)
# Bare expression kept from the notebook export: it displays the count in a
# notebook cell and has no effect when this file is run as a script.
len(pubmed_ids)


# Get the abstracts for those articles

# In[4]:


# Re-execute the pubmed_io module so that edits made to it since the initial
# import take effect in this session.
importlib.reload(pubmed_io)
# Fetch the abstracts for the IDs collected in the previous step.
pubmed_abstracts = pubmed_io.get_pubmed_abstracts( pubmed_ids)
# Bare expression kept from the notebook export (displays the count there).
len(pubmed_abstracts)


# ### All of the above in one function

# In[ ]:


# The return value is not stored; in a notebook it would be displayed as the
# cell output.
# NOTE(review): this passes 90000 while the step-by-step query earlier in the
# file passes 99000 -- confirm whether the difference is intentional.
tweet_mechanism.get_recent_mech_sentences(90000)


# PubMed returns an XML document structured as PubmedArticleList / PubmedArticle / MedlineCitation. MedlineCitation has children PMID and Article.
# 
# Article has children: Journal, ArticleTitle, and Abstract
# 
# Abstract has child: AbstractText

# Make sure that the mechanism and lack of clarity are in the same sentence

# In[5]:


# Run the sentence extractor over each (abstract, PMID) pair. zip() stops at
# the shorter of the two inputs, exactly as the two-iterable map() call does.
good_sent = [
    abstract_parser.get_mech_sent(abstract, pmid)
    for abstract, pmid in zip(pubmed_abstracts, pubmed_ids)
]
# Keep only the results whose 'mech_sent' entry is truthy.
good_sent = list(filter(lambda entry: entry['mech_sent'], good_sent))
# Bare expression kept from the notebook export (displays the count there).
len(good_sent)


# Save and load, as necessary

# In[8]:


# Persist the filtered sentences to good_sent.pickle in the current working
# directory so the later cells can be re-run without querying PubMed again.
with open('good_sent.pickle', 'wb') as pickle_file:
    pickle.dump(good_sent, pickle_file )


# ### Quick stats on sentences

# In[2]:


# Restore the sentences written by the save step from good_sent.pickle in the
# current working directory.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# a file this notebook produced itself -- never one from an untrusted source.
with open('good_sent.pickle', 'rb') as pickle_file:
    good_sent = pickle.load( pickle_file )


# In[54]:


# Count sentences whose final word is one of the "unknown" variants.
# The final word is lower-cased and stripped of trailing punctuation instead of
# blindly dropping its last character, so sentences that lack a terminal
# period, end in several punctuation marks, or carry trailing whitespace are
# still counted correctly.
unknown = {'understood', 'unclear', 'unknown'}
end_unknown = sum(
    1
    for x in good_sent
    # split() with no argument ignores trailing whitespace; the `or ['']`
    # fallback keeps a blank sentence from raising IndexError.
    if (x['mech_sent'].split() or [''])[-1].rstrip('.,;:!?').lower() in unknown
)
print('Number of sentences ending with variant of "unknown": {}'.format(end_unknown))


# In[55]:


# Tally the sentences that mention "however" anywhere, ignoring case.
# A plain substring test is equivalent to a regex search for a literal pattern.
however = sum(1 for sent in good_sent if 'however' in sent['mech_sent'].lower())
print('Number of sentences containing "however": {}'.format(however))


# In[56]:


# Tally the sentences that mention "although" anywhere, ignoring case.
# A plain substring test is equivalent to a regex search for a literal pattern.
although = len([sent for sent in good_sent if 'although' in sent['mech_sent'].lower()])
print('Number of sentences containing "although": {}'.format(although))


# In[57]:


# Count the sentences that use "while" as a standalone word, ignoring case.
# The \b word boundaries stop words that merely contain it ("meanwhile",
# "worthwhile") from inflating the count, which a bare substring search did.
while_count = sum(
    1 for x in good_sent if re.search(r'\bwhile\b', x['mech_sent'].lower())
)
print('Number of sentences containing "while": {}'.format(while_count))


# In[61]:


# Find the shortest mechanism sentence (by character count) and report it
# together with its PMID.
sent_length = [len(entry['mech_sent']) for entry in good_sent]
# list.index returns the first position of the minimum, which is the same
# tie-breaking rule np.argmin applies.
sent_min = good_sent[sent_length.index(min(sent_length))]
print('Shortest sentence: {0}, PMID: {1}'.format(sent_min['mech_sent'], sent_min['PMID']))


# In[63]:


# These are long sentences due to punctuation.
# Print the position in good_sent of every entry belonging to one of them.
punctuation_outliers = {'26526306', '25454993', '26351365', '26648182'}
for i, cur_sent in enumerate(good_sent):
    if cur_sent['PMID'] not in punctuation_outliers:
        continue
    print(i)


# A fairly long sentence:

# In[47]:


# Find the longest mechanism sentence among the first 1255 entries.
# NOTE(review): the 1255 cutoff presumably excludes the punctuation-inflated
# outliers located in the previous cell -- confirm before reusing on new data.
sent_length = [len(entry['mech_sent']) for entry in good_sent[:1255]]
# The slice starts at 0, so positions in sent_length line up with good_sent;
# list.index picks the first maximum, matching np.argmax's tie-breaking.
sent_max = good_sent[sent_length.index(max(sent_length))]
# Bare expression kept from the notebook export (displays the entry there).
sent_max


# A sample of mechanism sentences:

# In[18]:


# Bare expression kept from the notebook export: shows the first 100 entries
# as the cell output in a notebook and has no effect when run as a script.
good_sent[:100]


# # Tweeting!

# In[10]:


# Re-execute tweet_mechanism so that edits made to the module since the initial
# import take effect, then run its main entry point.
# NOTE(review): main() presumably posts to Twitter -- confirm before re-running.
importlib.reload(tweet_mechanism)
new_sent, tweeted_sentences = tweet_mechanism.main()


# In[ ]: