from sqlalchemy import create_engine
from gensim.models import Word2Vec, KeyedVectors, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from gensim.similarities.annoy import AnnoyIndexer
from gensim.utils import save_as_line_sentence
from gensim.models.word2vec import LineSentence
from gensim import utils
class StreamCorpus(object):
    def __init__(self, table, limit=100):
        """
        Iterable over a corpus of abstracts stored in a PostgreSQL table.

        Each pass over the object yields one preprocessed document
        (list of lowercase tokens) per non-empty abstract. A fresh
        connection is opened for every pass so the corpus can be
        iterated multiple times — gensim models scan a corpus once to
        build the vocabulary and again for each training epoch, which
        the original single-connection version could not support
        (it closed the connection after the first pass).

        :param table: name of the table holding an ``abstract`` column
        :param limit: maximum number of rows to fetch per pass
        """
        self.engine = create_engine('postgresql://postgres:eueueu@localhost/pubmed',
                                    echo=False)
        self.limit = limit
        self.table = table
        # Kept for backward compatibility with code that used sc.conn /
        # the text_gen property; __iter__ no longer depends on it.
        self.conn = self.engine.connect()

    @property
    def text_gen(self):
        """Generator over the non-empty abstract strings (single pass).

        NOTE(review): table/limit are interpolated into the SQL string;
        acceptable only because both come from trusted local code —
        never pass user input here.
        """
        rows = self.conn.execute(
            f"SELECT abstract FROM {self.table} limit {self.limit}").fetchall()
        # Guard against NULL abstracts: row[0] may be None, and
        # None.strip() would raise AttributeError.
        return (str(row[0]) for row in rows
                if row[0] is not None and str(row[0]).strip() != '')

    def __iter__(self):
        # Open (and always close) a dedicated connection per pass so the
        # corpus is re-iterable.
        with self.engine.connect() as conn:
            rows = conn.execute(
                f"SELECT abstract FROM {self.table} limit {self.limit}").fetchall()
            for row in rows:
                if row[0] is not None and str(row[0]).strip() != '':
                    yield utils.simple_preprocess(str(row[0]))
def dump_to_corpus_file(stream_corpus, path='articles.txt'):
    """
    Serialize an iterable of token lists to a plain-text corpus file,
    one space-separated sentence per line (gensim LineSentence format).

    :param stream_corpus: iterable yielding lists of tokens (e.g. StreamCorpus)
    :param path: destination file; defaults to 'articles.txt' to keep
        the original hard-coded behaviour backward-compatible
    """
    save_as_line_sentence(stream_corpus, path)
# Stream up to 10k abstracts from the 'articles' table and dump them to a
# plain-text corpus file so the models below can re-read it cheaply from disk.
sc = StreamCorpus(table='articles',limit=10000)
dump_to_corpus_file(sc)
# Train three embedding models on the same corpus file (8 worker threads each),
# persisting each model to disk after training.
model_sent = Word2Vec(sentences=LineSentence('articles.txt'), workers=8)
model_sent.save('articles.w2v')
model_sent2 = FastText(sentences=LineSentence('articles.txt'), workers=8)
model_sent2.save('articles.ft')
# Doc2Vec needs tagged documents; TaggedLineDocument tags each line with its
# line number, so document ids below correspond to line positions in the file.
model_sent3 = Doc2Vec(documents=TaggedLineDocument('articles.txt'), workers=8)
model_sent3.save('articles.d2v')
# Evaluate the Word2Vec vectors on the Google word-analogy test set bundled
# with gensim; evaluate_word_analogies returns (overall_accuracy, sections),
# so [0] is the overall accuracy as a fraction in [0, 1].
from gensim.test.utils import datapath
model_sent_accuracy = model_sent.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]
print("Word analogy accuracy with `sentences`: {:.1f}%".format(100.0 * model_sent_accuracy))
Word analogy accuracy with `sentences`: 3.5%
model_sent.wv.most_similar('zika')
[('acquired', 0.48520341515541077), ('hepatitis', 0.4786548614501953), ('ongoing', 0.46461182832717896), ('covid', 0.46365872025489807), ('possible', 0.4460738003253937), ('suspected', 0.4410059154033661), ('usutu', 0.42246460914611816), ('rubella', 0.4187973737716675), ('chikv', 0.4147232472896576), ('brazil', 0.4095695912837982)]
Now we are going to explore the clustering of documents using various distance metrics.
The Word Mover's Distance is based on the minimum cumulative distance that the word vectors of document A need to travel to exactly match the points in vector space occupied by document B. The word vectors are generated by a Word2Vec embedding.
import pandas as pd
import seaborn as sns
import numpy as np

# Reload the dumped corpus as a list of token lists, one document per line.
with open('articles.txt') as corpus_file:
    corpus = [row.split() for row in corpus_file]

# First document, re-assembled into readable text for display.
doc1 = corpus[0]
' '.join(doc1)
'zika virus is novel teratogenic agent associated with cerebral anomalies because of the challenges associated with assessment of antenatal diagnosis and prognosis in fetuses screening for other congenital infections mostly relies on ultrasound we aimed to assess whether similar approach might be adequate for zika virus congenital syndrome provided that early markers of infection and adequate timing for screening are established'
# Word Mover's Distance from doc1 to every document in the corpus.
dists = []
for doc_id, doc in enumerate(corpus):
    dists.append((doc_id, model_sent.wv.wmdistance(doc1, doc)))
# Ten closest documents, ascending distance (doc1 itself is first at 0.0).
sorted(dists, key=lambda pair: pair[1])[:10]
[(0, 0.0), (7580, 0.6528682052528333), (7551, 0.6565560323984608), (2398, 0.6584101245021644), (5878, 0.6623878270496776), (1521, 0.66726562150337), (5291, 0.6695458647308221), (4784, 0.6737582588318226), (5922, 0.6760980985188448), (7201, 0.6764060969844079)]
# Inspect the closest non-identical document (id 7580) as plain text.
' '.join(corpus[7580])
'cdc has updated its interim guidance for health care providers caring for infants born to mothers with possible zika virus infection during pregnancy laboratory testing is recommended for infants born to mothers with laboratory evidence of zika virus infection during pregnancy and infants who have abnormal clinical or neuroimaging findings suggestive of congenital zika syndrome and maternal epidemiologic link suggesting possible transmission regardless of maternal zika virus test results congenital zika syndrome is recently recognized pattern of congenital anomalies associated with zika virus infection during pregnancy that includes microcephaly intracranial calcifications or other brain anomalies or eye anomalies among others recommended infant laboratory evaluation includes both molecular real time reverse transcription polymerase chain reaction rrt pcr and serologic immunoglobulin igm testing initial samples should be collected directly from the infant in the first days of life if possible testing of cord blood is not recommended positive infant serum or urine rrt pcr test result confirms congenital zika virus infection positive zika virus igm testing with negative rrt pcr result indicates probable congenital zika virus infection in addition to infant zika virus testing initial evaluation of all infants born to mothers with laboratory evidence of zika virus infection during pregnancy should include comprehensive physical examination including neurologic examination postnatal head ultrasound and standard newborn hearing screen infants with laboratory evidence of congenital zika virus infection should have comprehensive ophthalmologic exam and hearing assessment by auditory brainstem response abr testing before month of age recommendations for follow up of infants with laboratory evidence of congenital zika virus infection depend on whether abnormalities consistent with congenital zika syndrome are present infants with abnormalities consistent with congenital 
zika syndrome should have coordinated evaluation by multiple specialists within the first month of life additional evaluations will be needed within the first year of life including assessments of vision hearing feeding growth and and endocrine function families and caregivers will also need ongoing psychosocial support and assistance with coordination of care infants with laboratory evidence of congenital zika virus infection without apparent abnormalities should have ongoing developmental monitoring and screening by the primary care provider repeat hearing testing is recommended this guidance will be updated when additional information becomes available'
# Infer a Doc2Vec embedding for each of the first 100 documents.
# The original looped over range(len(corpus[:100])), materializing a
# throwaway slice just to count it; iterating the slice directly is the
# idiomatic (and equivalent) form.
# NOTE(review): infer_vector is stochastic unless the model seed is
# fixed, so repeated runs give slightly different vectors.
vectors = [model_sent3.infer_vector(doc) for doc in corpus[:100]]
# One column per document ('doc0'..'doc99'); rows are embedding dimensions.
data = {'doc%s' % i: vec for i, vec in enumerate(vectors)}
dfdoc = pd.DataFrame(data)
dfdoc
doc0 | doc1 | doc2 | doc3 | doc4 | doc5 | doc6 | doc7 | doc8 | doc9 | ... | doc90 | doc91 | doc92 | doc93 | doc94 | doc95 | doc96 | doc97 | doc98 | doc99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.021169 | -0.406225 | 0.306174 | 0.017754 | -0.152882 | 0.177285 | -0.223070 | -0.230813 | -0.267274 | -0.191737 | ... | -0.271154 | -0.146777 | 0.436955 | -0.171762 | -0.161091 | -0.629327 | -0.603055 | -0.153522 | -0.133518 | -0.078563 |
1 | 0.075238 | -0.205888 | 0.034458 | -0.178951 | 0.093324 | 0.059353 | 0.302494 | -0.027226 | 0.164545 | 0.059933 | ... | -0.183173 | 0.144572 | -0.010087 | -0.003517 | 0.008553 | -0.212541 | -0.234060 | -0.118086 | -0.146674 | 0.086892 |
2 | 0.164354 | 0.097246 | 0.291730 | 0.268030 | 0.462329 | 0.248288 | 0.360440 | 0.296292 | 0.007848 | 0.143199 | ... | -0.220392 | 0.139836 | -0.001665 | 0.307158 | 0.095971 | 0.427869 | 0.276152 | 0.178075 | 0.172050 | 0.076469 |
3 | -0.113075 | 0.048212 | -0.050844 | -0.138734 | -0.162175 | -0.407979 | -0.068506 | -0.100371 | -0.238094 | -0.048398 | ... | -0.211948 | -0.246509 | -0.104687 | -0.249376 | -0.247753 | -0.075362 | -0.065870 | -0.178658 | 0.228032 | -0.074528 |
4 | -0.233938 | 0.207447 | 0.165686 | -0.201778 | 0.059278 | 0.169261 | -0.084627 | -0.328158 | 0.303713 | -0.046885 | ... | 0.098107 | -0.013739 | 0.060953 | -0.064543 | 0.026283 | 0.049826 | 0.165965 | 0.008358 | -0.429361 | 0.081811 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | 0.019896 | 0.182583 | 0.356824 | 0.037255 | 0.231595 | -0.092559 | -0.073817 | 0.351921 | -0.136604 | 0.428351 | ... | 0.379594 | 0.162574 | -0.162180 | 0.208834 | 0.088523 | 0.674506 | 0.608945 | 0.263370 | 0.318956 | 0.057133 |
96 | 0.050456 | -0.067742 | -0.149736 | -0.015624 | -0.162374 | -0.219187 | -0.414413 | -0.123712 | 0.028145 | 0.135236 | ... | 0.323174 | 0.025379 | -0.508923 | 0.029388 | -0.255933 | 0.334791 | -0.775076 | 0.353506 | 0.249756 | 0.007594 |
97 | -0.114198 | -0.284532 | 0.150257 | -0.279301 | -0.052956 | -0.582650 | 0.191783 | -0.565030 | 0.262155 | -0.208502 | ... | 0.216867 | -0.128202 | 0.247126 | -0.234699 | -0.104273 | -0.205464 | 0.143319 | -0.278691 | -0.544962 | -0.031964 |
98 | 0.222263 | 0.444537 | 0.248003 | -0.050038 | 0.166298 | 0.223055 | -0.080637 | -0.010301 | 0.415710 | 0.453056 | ... | 1.042968 | 0.190344 | -0.383757 | -0.170256 | 0.152362 | -0.050922 | 0.226971 | 0.435638 | -0.199251 | 0.165298 |
99 | 0.171983 | -0.378073 | -0.415923 | 0.071291 | -0.273732 | -0.092025 | -0.131373 | -0.449564 | -0.513998 | -0.348724 | ... | 0.072837 | -0.233522 | 0.030361 | -0.168347 | -0.114523 | -0.160217 | -0.365731 | -0.355383 | -0.597088 | -0.114362 |
100 rows × 100 columns
# Pairwise cosine similarity between document columns (.T makes documents the
# rows), yielding a 100 x 100 symmetric similarity matrix with 1.0 diagonal.
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dfdoc.values.T)
array([[1.0000001 , 0.348492 , 0.33918974, ..., 0.521447 , 0.29241976, 0.62724406], [0.348492 , 0.99999994, 0.48146072, ..., 0.6756368 , 0.24174899, 0.5417718 ], [0.33918974, 0.48146072, 1.0000001 , ..., 0.520628 , 0.33048475, 0.6469104 ], ..., [0.521447 , 0.6756368 , 0.520628 , ..., 1.0000001 , 0.23646055, 0.6988205 ], [0.29241976, 0.24174899, 0.33048475, ..., 0.23646055, 1.0000001 , 0.3761577 ], [0.62724406, 0.5417718 , 0.6469104 , ..., 0.6988205 , 0.3761577 , 0.99999994]], dtype=float32)
# Hierarchically clustered heatmap of the document-similarity matrix;
# rows/columns are reordered so similar documents appear in contiguous blocks.
sns.clustermap(cosine_similarity(dfdoc.values.T), center=0, cmap="vlag", metric='cosine',
linewidths=.75, figsize=(13, 13));
/usr/local/lib/python3.9/dist-packages/seaborn/matrix.py:654: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
# Build an Annoy approximate-nearest-neighbour index (100 trees) over the
# Word2Vec vectors and compare its results against exact brute-force search.
from gensim.similarities.annoy import AnnoyIndexer

annoy_index = AnnoyIndexer(model_sent, 100)
vector = model_sent.wv["zikv"]

# Approximate search, routed through the Annoy index we just built.
approximate_neighbors = model_sent.wv.most_similar([vector], topn=11, indexer=annoy_index)
print("Approximate Neighbors")
print(*approximate_neighbors, sep="\n")

# Exact search over all vectors, for comparison with the Annoy results.
normal_neighbors = model_sent.wv.most_similar([vector], topn=11)
print("\nExact Neighbors")
print(*normal_neighbors, sep="\n")
Approximate Neighbors ('zikv', 1.0) ('flavivirus', 0.558258056640625) ('viral', 0.5442558825016022) ('virus', 0.5291227102279663) ('wnv', 0.5078100562095642) ('utero', 0.5052283704280853) ('jev', 0.4979873299598694) ('denv', 0.49339520931243896) ('antibodies', 0.48890113830566406) ('zkv', 0.4836871027946472) ('subsequent', 0.4817664623260498) Exact Neighbors ('zikv', 0.9999998211860657) ('flavivirus', 0.6097279191017151) ('viral', 0.5845945477485657) ('virus', 0.5565490126609802) ('wnv', 0.515498161315918) ('utero', 0.5104020833969116) ('jev', 0.495966374874115) ('denv', 0.48670315742492676) ('antibodies', 0.47755590081214905) ('zkv', 0.46684205532073975) ('subsequent', 0.4628679156303406)