from sqlalchemy import create_engine
from gensim.models import Word2Vec, KeyedVectors, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from gensim.similarities.annoy import AnnoyIndexer
from gensim.utils import save_as_line_sentence
from gensim.models.word2vec import LineSentence
from gensim import utils
class StreamCorpus(object):
    def __init__(self, table, limit=100):
        """
        Iterable over a corpus of abstracts stored in a PostgreSQL table.

        Each pass over the object yields one preprocessed document
        (list of lowercase tokens) per non-empty abstract. A fresh
        connection is opened for every pass so the corpus can be
        iterated multiple times — gensim models scan a corpus once to
        build the vocabulary and again for each training epoch, which
        the original single-connection version could not support
        (it closed the connection after the first pass).

        :param table: name of the table holding an ``abstract`` column
        :param limit: maximum number of rows to fetch per pass
        """
        self.engine = create_engine('postgresql://postgres:eueueu@localhost/pubmed',
                                    echo=False)
        self.limit = limit
        self.table = table
        # Kept for backward compatibility with code that used sc.conn /
        # the text_gen property; __iter__ no longer depends on it.
        self.conn = self.engine.connect()

    @property
    def text_gen(self):
        """Generator over the non-empty abstract strings (single pass).

        NOTE(review): table/limit are interpolated into the SQL string;
        acceptable only because both come from trusted local code —
        never pass user input here.
        """
        rows = self.conn.execute(
            f"SELECT abstract FROM {self.table} limit {self.limit}").fetchall()
        # Guard against NULL abstracts: row[0] may be None, and
        # None.strip() would raise AttributeError.
        return (str(row[0]) for row in rows
                if row[0] is not None and str(row[0]).strip() != '')

    def __iter__(self):
        # Open (and always close) a dedicated connection per pass so the
        # corpus is re-iterable.
        with self.engine.connect() as conn:
            rows = conn.execute(
                f"SELECT abstract FROM {self.table} limit {self.limit}").fetchall()
            for row in rows:
                if row[0] is not None and str(row[0]).strip() != '':
                    yield utils.simple_preprocess(str(row[0]))
def dump_to_corpus_file(stream_corpus, path='articles.txt'):
    """
    Serialize an iterable of token lists to a plain-text corpus file,
    one space-separated sentence per line (gensim LineSentence format).

    :param stream_corpus: iterable yielding lists of tokens (e.g. StreamCorpus)
    :param path: destination file; defaults to 'articles.txt' to keep
        the original hard-coded behaviour backward-compatible
    """
    save_as_line_sentence(stream_corpus, path)
# Stream up to 10k abstracts from the 'articles' table and dump them to a
# plain-text corpus file so the models below can re-read it cheaply from disk.
sc = StreamCorpus(table='articles',limit=10000)
dump_to_corpus_file(sc)
# Train three embedding models on the same corpus file (8 worker threads each),
# persisting each model to disk after training.
model_sent = Word2Vec(sentences=LineSentence('articles.txt'), workers=8)
model_sent.save('articles.w2v')
model_sent2 = FastText(sentences=LineSentence('articles.txt'), workers=8)
model_sent2.save('articles.ft')
# Doc2Vec needs tagged documents; TaggedLineDocument tags each line with its
# line number, so document ids below correspond to line positions in the file.
model_sent3 = Doc2Vec(documents=TaggedLineDocument('articles.txt'), workers=8)
model_sent3.save('articles.d2v')
# Evaluate the Word2Vec vectors on the Google word-analogy test set bundled
# with gensim; evaluate_word_analogies returns (overall_accuracy, sections),
# so [0] is the overall accuracy as a fraction in [0, 1].
from gensim.test.utils import datapath
model_sent_accuracy = model_sent.wv.evaluate_word_analogies(datapath('questions-words.txt'))[0]
print("Word analogy accuracy with `sentences`: {:.1f}%".format(100.0 * model_sent_accuracy))
Word analogy accuracy with `sentences`: 3.5%
model_sent.wv.most_similar('zika')
[('acquired', 0.48520341515541077), ('hepatitis', 0.4786548614501953), ('ongoing', 0.46461182832717896), ('covid', 0.46365872025489807), ('possible', 0.4460738003253937), ('suspected', 0.4410059154033661), ('usutu', 0.42246460914611816), ('rubella', 0.4187973737716675), ('chikv', 0.4147232472896576), ('brazil', 0.4095695912837982)]
Now we are going to explore the clustering of documents using various distance metrics.
The Word Mover's Distance is based on the minimum cumulative distance that the word vectors of document A need to travel to exactly match the points in vector space occupied by document B. The word vectors are generated by a Word2Vec embedding.
import pandas as pd
import seaborn as sns
import numpy as np

# Reload the dumped corpus as a list of token lists, one document per line.
with open('articles.txt') as corpus_file:
    corpus = [row.split() for row in corpus_file]

# First document, re-assembled into readable text for display.
doc1 = corpus[0]
' '.join(doc1)
'zika virus is novel teratogenic agent associated with cerebral anomalies because of the challenges associated with assessment of antenatal diagnosis and prognosis in fetuses screening for other congenital infections mostly relies on ultrasound we aimed to assess whether similar approach might be adequate for zika virus congenital syndrome provided that early markers of infection and adequate timing for screening are established'
# Word Mover's Distance from doc1 to every document in the corpus.
dists = []
for doc_id, doc in enumerate(corpus):
    dists.append((doc_id, model_sent.wv.wmdistance(doc1, doc)))
# Ten closest documents, ascending distance (doc1 itself is first at 0.0).
sorted(dists, key=lambda pair: pair[1])[:10]
[(0, 0.0), (7580, 0.6528682052528333), (7551, 0.6565560323984608), (2398, 0.6584101245021644), (5878, 0.6623878270496776), (1521, 0.66726562150337), (5291, 0.6695458647308221), (4784, 0.6737582588318226), (5922, 0.6760980985188448), (7201, 0.6764060969844079)]
# Inspect the closest non-identical document (id 7580) as plain text.
' '.join(corpus[7580])
'cdc has updated its interim guidance for health care providers caring for infants born to mothers with possible zika virus infection during pregnancy laboratory testing is recommended for infants born to mothers with laboratory evidence of zika virus infection during pregnancy and infants who have abnormal clinical or neuroimaging findings suggestive of congenital zika syndrome and maternal epidemiologic link suggesting possible transmission regardless of maternal zika virus test results congenital zika syndrome is recently recognized pattern of congenital anomalies associated with zika virus infection during pregnancy that includes microcephaly intracranial calcifications or other brain anomalies or eye anomalies among others recommended infant laboratory evaluation includes both molecular real time reverse transcription polymerase chain reaction rrt pcr and serologic immunoglobulin igm testing initial samples should be collected directly from the infant in the first days of life if possible testing of cord blood is not recommended positive infant serum or urine rrt pcr test result confirms congenital zika virus infection positive zika virus igm testing with negative rrt pcr result indicates probable congenital zika virus infection in addition to infant zika virus testing initial evaluation of all infants born to mothers with laboratory evidence of zika virus infection during pregnancy should include comprehensive physical examination including neurologic examination postnatal head ultrasound and standard newborn hearing screen infants with laboratory evidence of congenital zika virus infection should have comprehensive ophthalmologic exam and hearing assessment by auditory brainstem response abr testing before month of age recommendations for follow up of infants with laboratory evidence of congenital zika virus infection depend on whether abnormalities consistent with congenital zika syndrome are present infants with abnormalities consistent with congenital 
zika syndrome should have coordinated evaluation by multiple specialists within the first month of life additional evaluations will be needed within the first year of life including assessments of vision hearing feeding growth and and endocrine function families and caregivers will also need ongoing psychosocial support and assistance with coordination of care infants with laboratory evidence of congenital zika virus infection without apparent abnormalities should have ongoing developmental monitoring and screening by the primary care provider repeat hearing testing is recommended this guidance will be updated when additional information becomes available'
# Infer a Doc2Vec embedding for each of the first 100 documents.
# The original looped over range(len(corpus[:100])), materializing a
# throwaway slice just to count it; iterating the slice directly is the
# idiomatic (and equivalent) form.
# NOTE(review): infer_vector is stochastic unless the model seed is
# fixed, so repeated runs give slightly different vectors.
vectors = [model_sent3.infer_vector(doc) for doc in corpus[:100]]
# One column per document ('doc0'..'doc99'); rows are embedding dimensions.
data = {'doc%s' % i: vec for i, vec in enumerate(vectors)}
dfdoc = pd.DataFrame(data)
dfdoc
doc0 | doc1 | doc2 | doc3 | doc4 | doc5 | doc6 | doc7 | doc8 | doc9 | ... | doc90 | doc91 | doc92 | doc93 | doc94 | doc95 | doc96 | doc97 | doc98 | doc99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.021169 | -0.406225 | 0.306174 | 0.017754 | -0.152882 | 0.177285 | -0.223070 | -0.230813 | -0.267274 | -0.191737 | ... | -0.271154 | -0.146777 | 0.436955 | -0.171762 | -0.161091 | -0.629327 | -0.603055 | -0.153522 | -0.133518 | -0.078563 |
1 | 0.075238 | -0.205888 | 0.034458 | -0.178951 | 0.093324 | 0.059353 | 0.302494 | -0.027226 | 0.164545 | 0.059933 | ... | -0.183173 | 0.144572 | -0.010087 | -0.003517 | 0.008553 | -0.212541 | -0.234060 | -0.118086 | -0.146674 | 0.086892 |
2 | 0.164354 | 0.097246 | 0.291730 | 0.268030 | 0.462329 | 0.248288 | 0.360440 | 0.296292 | 0.007848 | 0.143199 | ... | -0.220392 | 0.139836 | -0.001665 | 0.307158 | 0.095971 | 0.427869 | 0.276152 | 0.178075 | 0.172050 | 0.076469 |
3 | -0.113075 | 0.048212 | -0.050844 | -0.138734 | -0.162175 | -0.407979 | -0.068506 | -0.100371 | -0.238094 | -0.048398 | ... | -0.211948 | -0.246509 | -0.104687 | -0.249376 | -0.247753 | -0.075362 | -0.065870 | -0.178658 | 0.228032 | -0.074528 |
4 | -0.233938 | 0.207447 | 0.165686 | -0.201778 | 0.059278 | 0.169261 | -0.084627 | -0.328158 | 0.303713 | -0.046885 | ... | 0.098107 | -0.013739 | 0.060953 | -0.064543 | 0.026283 | 0.049826 | 0.165965 | 0.008358 | -0.429361 | 0.081811 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | 0.019896 | 0.182583 | 0.356824 | 0.037255 | 0.231595 | -0.092559 | -0.073817 | 0.351921 | -0.136604 | 0.428351 | ... | 0.379594 | 0.162574 | -0.162180 | 0.208834 | 0.088523 | 0.674506 | 0.608945 | 0.263370 | 0.318956 | 0.057133 |
96 | 0.050456 | -0.067742 | -0.149736 | -0.015624 | -0.162374 | -0.219187 | -0.414413 | -0.123712 | 0.028145 | 0.135236 | ... | 0.323174 | 0.025379 | -0.508923 | 0.029388 | -0.255933 | 0.334791 | -0.775076 | 0.353506 | 0.249756 | 0.007594 |
97 | -0.114198 | -0.284532 | 0.150257 | -0.279301 | -0.052956 | -0.582650 | 0.191783 | -0.565030 | 0.262155 | -0.208502 | ... | 0.216867 | -0.128202 | 0.247126 | -0.234699 | -0.104273 | -0.205464 | 0.143319 | -0.278691 | -0.544962 | -0.031964 |
98 | 0.222263 | 0.444537 | 0.248003 | -0.050038 | 0.166298 | 0.223055 | -0.080637 | -0.010301 | 0.415710 | 0.453056 | ... | 1.042968 | 0.190344 | -0.383757 | -0.170256 | 0.152362 | -0.050922 | 0.226971 | 0.435638 | -0.199251 | 0.165298 |
99 | 0.171983 | -0.378073 | -0.415923 | 0.071291 | -0.273732 | -0.092025 | -0.131373 | -0.449564 | -0.513998 | -0.348724 | ... | 0.072837 | -0.233522 | 0.030361 | -0.168347 | -0.114523 | -0.160217 | -0.365731 | -0.355383 | -0.597088 | -0.114362 |
100 rows × 100 columns
# Pairwise cosine similarity between document columns (.T makes documents the
# rows), yielding a 100 x 100 symmetric similarity matrix with 1.0 diagonal.
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dfdoc.values.T)
array([[1.0000001 , 0.348492 , 0.33918974, ..., 0.521447 , 0.29241976, 0.62724406], [0.348492 , 0.99999994, 0.48146072, ..., 0.6756368 , 0.24174899, 0.5417718 ], [0.33918974, 0.48146072, 1.0000001 , ..., 0.520628 , 0.33048475, 0.6469104 ], ..., [0.521447 , 0.6756368 , 0.520628 , ..., 1.0000001 , 0.23646055, 0.6988205 ], [0.29241976, 0.24174899, 0.33048475, ..., 0.23646055, 1.0000001 , 0.3761577 ], [0.62724406, 0.5417718 , 0.6469104 , ..., 0.6988205 , 0.3761577 , 0.99999994]], dtype=float32)
# Hierarchically clustered heatmap of the document-similarity matrix;
# rows/columns are reordered so similar documents appear in contiguous blocks.
sns.clustermap(cosine_similarity(dfdoc.values.T), center=0, cmap="vlag", metric='cosine',
linewidths=.75, figsize=(13, 13));
/usr/local/lib/python3.9/dist-packages/seaborn/matrix.py:654: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
# Build an Annoy approximate-nearest-neighbour index (100 trees) over the
# Word2Vec vectors and compare its results against exact brute-force search.
from gensim.similarities.annoy import AnnoyIndexer

annoy_index = AnnoyIndexer(model_sent, 100)
vector = model_sent.wv["zikv"]

# Approximate search, routed through the Annoy index we just built.
approximate_neighbors = model_sent.wv.most_similar([vector], topn=11, indexer=annoy_index)
print("Approximate Neighbors")
print(*approximate_neighbors, sep="\n")

# Exact search over all vectors, for comparison with the Annoy results.
normal_neighbors = model_sent.wv.most_similar([vector], topn=11)
print("\nExact Neighbors")
print(*normal_neighbors, sep="\n")
Approximate Neighbors ('zikv', 1.0) ('flavivirus', 0.558258056640625) ('viral', 0.5442558825016022) ('virus', 0.5291227102279663) ('wnv', 0.5078100562095642) ('utero', 0.5052283704280853) ('jev', 0.4979873299598694) ('denv', 0.49339520931243896) ('antibodies', 0.48890113830566406) ('zkv', 0.4836871027946472) ('subsequent', 0.4817664623260498) Exact Neighbors ('zikv', 0.9999998211860657) ('flavivirus', 0.6097279191017151) ('viral', 0.5845945477485657) ('virus', 0.5565490126609802) ('wnv', 0.515498161315918) ('utero', 0.5104020833969116) ('jev', 0.495966374874115) ('denv', 0.48670315742492676) ('antibodies', 0.47755590081214905) ('zkv', 0.46684205532073975) ('subsequent', 0.4628679156303406)