Neste Notebook vamos aplicar um classificador não supervisionado a uma coleção de artigos extraída do índice do MediaCloud, usando Dirichlet Process Gaussian Mixture models, do Scikit-Learn. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html#example-mixture-plot-gmm-py
Vamos também explorar técnicas de Deep learning usando word2vec do pacote gensim: http://radimrehurek.com/gensim/models/word2vec.html
Uma explicação mais detalhada da metodologia do word2vec pode ser encontrada aqui: https://code.google.com/p/word2vec/
import json
import os
import gensim
import pymysql
import nltk # para tokenização de sentenças
from pymongo import MongoClient
from string import punctuation, digits
import bs4
from sklearn import mixture
import datetime
import pandas as pd
%pylab inline
Populating the interactive namespace from numpy and matplotlib
Primeiro vamos definir uma função para busca no índice:
def sphinx_query(index_name, query="", facet=None):
    """
    Search a Sphinx index using a simple match via SphinxQL.

    :param index_name: Name of the index to search on. Must be one of
        'mediacloud_articles', 'mediacloud_feeds' or 'mediacloud_tweets'.
    :param query: String with the query expression handed to MATCH().
    :param facet: Attribute names to facet by. Must be a list (or None for
        a plain, non-faceted search).
    :return: JSON string. An array of row objects on success, or an object
        with a single "error" key when ``index_name`` is not recognised.
    """
    # Explicit check rather than `assert`: assertions are stripped when Python
    # runs with -O, which would let arbitrary text reach the SQL string below.
    if index_name not in ('mediacloud_articles', 'mediacloud_feeds', 'mediacloud_tweets'):
        return json.dumps({"error": "Bad index name: {}".format(index_name)})

    # Setup Sphinxsearch SphinxQL connection
    sphinx_conn = pymysql.connect(host='200.20.164.152', port=9306)
    try:
        cursor = sphinx_conn.cursor(pymysql.cursors.DictCursor)
        try:
            if facet is None:
                cursor.execute("SELECT * from " + index_name + " WHERE MATCH(%(query)s) "
                               "LIMIT %(limit)s OPTION max_matches=%(limit)s",
                               {'query': query, 'limit': 100000})
            else:
                # NOTE: facet names are interpolated verbatim into the
                # statement; only pass trusted attribute names here.
                cursor.execute("SELECT * from "+index_name+" WHERE MATCH(%s) " + " ".join(["FACET {}".format(f) for f in facet]),
                               (query,))
            results = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        sphinx_conn.close()
    return json.dumps(results)
consulta = 'Marina silva'
res = json.loads(sphinx_query("mediacloud_articles", '"{}" '.format(consulta)))

# Keep only the results published after data_ini and no later than data_fim.
# NOTE(review): an earlier comment here also mentioned keeping only entries
# that contain summaries, but no such filter is applied by this code.
data_ini = datetime.datetime(2014, 8, 10, 0, 0, 0)
data_fim = datetime.datetime(2014, 8, 28, 0, 0, 0)
retidos = []
for d in res:
    publicado = datetime.datetime.fromtimestamp(d['published'])
    if data_ini < publicado <= data_fim:
        retidos.append(d)
res = retidos
print("{} resultados retidos".format(len(res)))
2467 resultados retidos
res[:3]
[{'id': 944080, 'summary': '<div class="field content-taxonomy field-categoria-noticia">\n <h3 class="label inline">Categoria: </h3>\n <div class="item inline odd">Eleições</div>\n <h5 class="description"></h5>\n </div>\n <!-- /content-field -->\n <!-- /content-field -->\n <!-- /content-field -->\n<p><img alt="" src="http://og.infg.com.br/in/12757943-f0f-c71/FT1086A/420/GetContentCA9VS5MB.jpg" style="width: 500px; height: 300px;" /></p>\n<p><strong>Jornal GGN </strong>- Sai na próxima semana a decisão sobre quem o PSB lançará como candidato a presidente no lugar de Eduardo Campos, morto em acidente aéreo no último dia 13. Marina Silva é a favorita pelo recall que tem junto ao eleitorado desde a eleição de 2010.</p>\n<p>Mas Marina, desde que ingressou no PSB, deixou claro que esse arranjo é provisório. Dura até que a Rede Sustentabilidade saia do papel. Diante do impasse sobre o futuro da chapa em 2014, fica a pergunta: será que o PSB vai conseguir impôr todas as suas condições à Marina e se consagrar como o partido representativo da terceira via no país, ou a figura Marina Silva vai se sobressair sobre o PSB? </p>\n<p></p><p><a href="http://jornalggn.com.br/noticia/marina-silva-e-psb-quem-engole-quem" target="_blank">leia mais</a></p>', 'title': 'Marina Silva e PSB: quem engole quem?', 'published': 1408291270, 'db': '', 'link': 'http://jornalggn.com.br/noticia/marina-silva-e-psb-quem-engole-quem', '_id': '53f0a8d0dcccdd842bd1936c', 'collection': '', 'links': '{"links":[{"href":"http:\\/\\/jornalggn.com.br\\/noticia\\/marina-silva-e-psb-quem-engole-quem","type":"text\\/html","rel":"alternate"}]}', 'language': 'pt', '._id': '53f0a8d0dcccdd842bd1936c'}, {'id': 985401, 'summary': '<p>Benedito Tadeu César* A campanha no rádio e na TV já começou há uma semana e Aécio Neves ainda não disse a que veio. Além de se apresentar ao eleitorado, não fez quase mais nada. Dá a impressão de estar paralisado diante do avanço de Marina Silva. 
Enquanto Aécio Neves apresenta-se com uma imagem de […]</p>\n<p>The post <a href="http://www.sul21.com.br/jornal/e-aecio-vai-descer-do-muro-e-disputar-com-marina-silva/" rel="nofollow">E Aécio, vai descer do muro e disputar com Marina Silva?</a> appeared first on <a href="http://www.sul21.com.br/jornal" rel="nofollow">Sul 21</a>.</p>', 'title': 'E Aécio, vai descer do muro e disputar com Marina Silva?', 'published': 1409084608, 'db': '', 'link': 'http://www.sul21.com.br/jornal/e-aecio-vai-descer-do-muro-e-disputar-com-marina-silva/', '_id': '53fcc487dcccdd5969d2fd9c', 'collection': '', 'links': '{"links":[{"href":"http:\\/\\/www.sul21.com.br\\/jornal\\/e-aecio-vai-descer-do-muro-e-disputar-com-marina-silva\\/","type":"text\\/html","rel":"alternate"}]}', 'language': 'pt', '._id': '53fcc487dcccdd5969d2fd9c'}, {'id': 931059, 'summary': 'O consultor político Vitor Oliveira e o cientista político Fábio Ostermann traçaram o deram entrevista ao InfoMoney e destacaram cenário para Marina Silva', 'title': '"PSB caiu no colo de Marina Silva"; mas será que ela tem a cara do partido?', 'published': 1408052040, 'db': '', 'link': 'http://www.infomoney.com.br//mercados/eleicoes/noticia/3515587/psb-caiu-colo-marina-silva-mas-sera-que-ela-tem', '_id': '53ed03b9dcccdd25bf76b41a', 'collection': '', 'links': '{"links":[{"href":"http:\\/\\/www.infomoney.com.br\\/\\/mercados\\/eleicoes\\/noticia\\/3515587\\/psb-caiu-colo-marina-silva-mas-sera-que-ela-tem","type":"text\\/html","rel":"alternate"}]}', 'language': 'pt', '._id': '53ed03b9dcccdd25bf76b41a'}]
Exportando a série Temporal
# Hourly article counts: one observation of 1.0 per result, indexed by its
# publication time, then summed into hourly bins.
# pd.Series replaces the deprecated pd.TimeSeries, and .resample('h').sum()
# replaces the deprecated resample(..., how="sum") form.
ts = pd.Series(data=np.ones(len(res)), index=[datetime.datetime.fromtimestamp(d['published']) for d in res])
rts = ts.resample('h').sum()
rts.plot();
# Uncomment to export the hourly series to CSV:
#rts.to_csv('{}+filha;filho;renata;miguel_porhora_noticias.csv'.format(consulta))
/usr/lib/python3/dist-packages/IPython/kernel/__main__.py:1: FutureWarning: TimeSeries is deprecated. Please use Series if __name__ == '__main__':
<matplotlib.axes._subplots.AxesSubplot at 0x7f0d16bdc9e8>
Vamos agora construir um corpus só com os sumários
# Strip the HTML from each summary. The parser is named explicitly so the
# extracted text does not depend on which parsers happen to be installed
# ("lxml" is the one BeautifulSoup was already selecting on this system).
docs = [bs4.BeautifulSoup(d['summary'], "lxml").get_text() for d in res]
/usr/lib/python3/dist-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently. To get rid of this warning, change this: BeautifulSoup([your markup]) to this: BeautifulSoup([your markup], "lxml") markup_type=markup_type))
Antes de poder aplicar o word2vec, precisamos criar um banco de frases, que são a matéria-prima do word2vec. Vamos armazenar estas frases em um banco MongoDB para facilitar a iteração sobre as frases.
Cada Documento dividido em frases se converte na seguinte lista:
# The corpus is in Portuguese, so use the Portuguese Punkt model instead of
# sent_tokenize's default English one.
nltk.tokenize.sent_tokenize(docs[1], language='portuguese')
[u' E AGORA MARINA SILVA ?', u'VICE NA CHAPA DE GERALDO ALCKMIN DIZ QUE \xc9 MARINA QUEM PRECISA DO PSB"ELA N\xc3O PARTICIPA DE NADA NEM SABE O PRE\xc7O DO ALUGUEL DE UM COMIT\xca ELEITORAL" - M\xe1rcio Fran\xe7a / PSB-SPO PSB de S\xe3o Paulo n\xe3o parece muito interessado na quest\xe3o nacional do Partido, que tem de substituir Eduardo Campos, falecido em tr\xe1gico acidente de avi\xe3o ocorrido na \xfaltima Quarta-Feira.', u"Marina Silva, ao que tudo indica, ser\xe1 a nova cabe\xe7a de chapa do partido socialista, mas, as resist\xeancias ao seu nome, e principalmente as condi\xe7\xf5es que j\xe1 est\xe3o sendo colocadas na mesa de negocia\xe7\xe3o para \xa0que ela assuma a candidatura, est\xe3o crescendo na medida em que o tempo passa, e os interesses eleitorais se sobrep\xf5e ao impacto do acidente e perplexidade perante a perda do ex-governador e at\xe9 ent\xe3o candidato \xe0 presid\xeancia, Eduardo Campos.Coube ao membro do PSB de S\xe3o Paulo, que \xe9 o candidato a vice na chapa de Geraldo Alckmin (PSDB), colocar de forma bastante DURA e at\xe9 agressiva, como Marina Silva DEVE se COMPORTAR de agora em diante.Segundo mat\xe9ria de O Globo, M\xe1rcio Fran\xe7a, que \xe9 o TESOUREIRO da Campanha presidencial do PSB, disse o seguinte:\u2018Marina faz pol\xedtica de um jeito que n\xe3o \xe9 o nosso\u2019,'Agora \xe9 o PSB que ter\xe1 cr\xe9dito por aceit\xe1-la como candidata''\xc9 o mundo real que tem de ser colocado para que ela fa\xe7a sua escolha.", u'Seria injusto coloc\xe1-la como candidata sem que saiba.', u'Marina n\xe3o era respons\xe1vel por nada na campanha.', u"Ela n\xe3o sabe nem quanto custa o aluguel do comit\xea'\xa0'Ela tem de nos acolher e temos de acolh\xea-la.", u'Antes ela criou cr\xe9dito, pois era a mais famosa.', u'Neste instante, criou o d\xe9bito.', u'N\xf3s \xe9 que iremos acolh\xea-la para ser candidata a presidente.', u"Agora ela se torna nossa candidata para dirigir o pa\xeds''Todos concordam que ela tem de ser a pessoa, em 
fun\xe7\xe3o da express\xe3o de votos e do gesto generoso dela, que aceitou sair como vice.", u'Mas para ser a candidata da coliga\xe7\xe3o ela tem de ter o discurso da coliga\xe7\xe3o, n\xe3o da Rede.', u"Como candidata, Marina fazia o que bem entendia'.ENT\xc3O...As diferen\xe7as entre MARINA SILVA/REDE e o PSB, que eram contornadas e minimizadas por Eduardo Campos, interessado nos poss\xedveis votos que a ex-senadora e uma das lideran\xe7as no campo do Meio-Ambiente, poderia lhe transferir, ficam agora mais expostas.", u'Com pouca ou nenhuma sutileza, os l\xedderes do PSB est\xe3o dizendo para Marina que, a eles, n\xe3o interessa ganhar a elei\xe7\xe3o e n\xe3o assumir de fato o poder.O FUTURO de Marina Silva eleita, estaria assim selado.', u'Ela ser\xe1 taxada de "traidora dos compromissos assumidos", ou ent\xe3o ser\xe1 rotulada como "TUTELADA".A POL\xcdTICA continua desafiando a capacidade dos homens preverem o futuro.Postado por BONDeblog S. O.\xe0s 08:49Rea\xe7\xf5es:\xa0Nenhum coment\xe1rio: Links para esta postagem Enviar por e-mailBlogThis!Compartilhar no TwitterCompartilhar no FacebookCompartilhar no OrkutCompartilhar com o PinterestMarcadores: AN\xc1LISE POL\xcdTICA, Eduardo Campos, elei\xe7\xe3o 2014, marina silva, PSB, REDE SUSTENTABILIDADE\xa0Do 007BONDeblog.\xa0 ']
# Build a MongoDB collection holding the sentences of each document.
client = MongoClient()
db = client.word2vec
# Start from an empty collection so re-running this cell does not duplicate documents.
db.drop_collection('frases')
frases = db.frases
for n, doc in enumerate(docs):
    # insert_one replaces the deprecated Collection.insert; the Portuguese
    # Punkt model matches the language of the corpus.
    frases.insert_one({'doc': n, 'frases': nltk.tokenize.sent_tokenize(doc, language='portuguese')})
Agora vamos escrever um gerador que retorne uma frase de cada vez, como uma lista de tokens. Mas, antes, vamos reduzir as palavras para minúsculas.
# Tokens discarded when building training sentences: NLTK's Portuguese
# stopwords, every ASCII punctuation character, and a hand-picked list of
# extra words and symbols.
sw = list(nltk.corpus.stopwords.words('portuguese'))
sw.extend(punctuation)
sw.extend([
    'r', u'não', u'é', u'à', 'quarta', 'feira',
    u'até', u'já', ')', '(', '"', '\'', '...',
    'nesta', 'leia', 'quinta', 'foto', u'terça',
    'dub', 'diz', 'dia', u'está', 'sexta', u'\u2022', '',
])
def get_sentences():
    """
    Yield every stored sentence as a list of normalised tokens.

    Reads all documents from the ``frases`` collection and, for each sentence,
    splits on whitespace, strips leading/trailing punctuation and digits,
    lowercases, and drops stopwords (``sw``) and empty tokens. Sentences left
    with no tokens are skipped.

    :return: generator of lists of strings, one list per sentence
    """
    # Build the lookup structures once instead of once per token.
    stopwords = set(sw)
    strip_chars = punctuation + digits
    for doc in frases.find({}):
        for f in doc['frases']:
            tokens = []
            for w in f.split():
                # Strip punctuation and digits together: stripping them one
                # after the other leaves leftovers such as ',' (from "1,1").
                token = w.strip(strip_chars).lower()
                # Filter AFTER normalising, so that "Não" or "não," are
                # recognised as stopwords and empty strings never get through.
                if token and token not in stopwords:
                    tokens.append(token)
            if tokens:
                yield tokens
# Word2Vec walks the corpus more than once (a vocabulary scan, then the
# training passes). A generator is exhausted after the first pass and would
# leave the vectors at their random initial values, so materialise the
# sentences into a list, which can be iterated repeatedly.
sentences = list(get_sentences())
model = gensim.models.Word2Vec(sentences, min_count=15, size=5000, workers=8)
-c:8: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
model[u'campos']
array([ 5.23893279e-04, 1.79088038e-05, 1.05028019e-04, 2.29172118e-04, 2.95586302e-04, 7.58165203e-04, -5.76344319e-04, 2.38053588e-04, 5.28062810e-05, 1.92863081e-04, 5.85806207e-04, 7.97907705e-04, 9.24295513e-04, 8.21865979e-04, -9.21872561e-04, -9.73194081e-04, 2.61064735e-04, -7.43841345e-04, 3.51673807e-04, -3.75185453e-04, 8.89859628e-04, 6.00494270e-04, 6.72011229e-05, -1.55870031e-04, -1.72735075e-04, 4.95197164e-05, -6.47866807e-04, 9.84237646e-04, -6.03786204e-04, -8.39359200e-05, 2.75766390e-04, -1.89963670e-04, -5.39175526e-04, 5.62078203e-04, 1.19242177e-04, 2.42233567e-04, 7.56517285e-04, -6.19249622e-05, 2.87531933e-04, -9.46478627e-04, -1.06012034e-04, -5.92135708e-04, -3.77774268e-04, 2.74946186e-04, 3.78991885e-04, 1.86964171e-04, -7.96205422e-04, -3.42984742e-04, 4.47301900e-05, -5.60187618e-05, -6.73043716e-04, 8.87513044e-04, 5.59889922e-05, -4.94957087e-04, -4.20221884e-04, -4.25604056e-04, -2.97181454e-04, -9.36319528e-04, 2.97970750e-04, -7.75025750e-04, -5.46723837e-04, 4.66827158e-04, -9.53426468e-04, -7.10491207e-04, 9.51222144e-04, -9.72778653e-04, -4.83788550e-04, -1.42255609e-04, -9.21887113e-04, 3.70483322e-04, -6.99955039e-04, 5.11854596e-04, 3.92803049e-04, 7.62677169e-04, -9.29975242e-04, -2.17419336e-04, -5.55085717e-04, -8.71628814e-04, 1.87643469e-04, -1.73587803e-04, -9.73289774e-04, 1.63087359e-04, -2.43098024e-04, -2.15552544e-04, 3.71303759e-04, -6.09576644e-04, -7.31920183e-04, -6.11542084e-04, -7.44074408e-04, 2.18918158e-05, 4.49464191e-04, -3.86785629e-04, -5.92068711e-04, -3.84862622e-04, 8.57815554e-04, -3.78661673e-04, 4.16958967e-04, -9.89507171e-05, -6.98351068e-04, 9.51259572e-04, 9.57804907e-04, 5.80066815e-04, -5.89028059e-04, 9.40719503e-04, -4.29141044e-04, -2.08327911e-04, 7.55751040e-04, 2.69514567e-04, -5.46889823e-05, -9.61516111e-04, 7.81890878e-04, -4.69731385e-05, 3.19411833e-04, -9.29796253e-04, 1.92499283e-05, -6.21327723e-04, -6.04629931e-05, -4.94555978e-04, 4.71509906e-04, 9.83222621e-04, 
-2.58649059e-04, -9.96271265e-05, 1.59290881e-04, -3.03994195e-04, -6.25954242e-04, -3.51568335e-04, 5.74506936e-04, 8.57352919e-04, 5.89576259e-04, 4.20273573e-04, 9.21050785e-04, -1.77620877e-05, 9.89018357e-04, 8.74065794e-04, -6.86237618e-05, -2.15866865e-04, -9.74479597e-04, 7.10288587e-04, -5.35184634e-04, 5.59966611e-05, 5.84039954e-04, -2.49185279e-04, -2.92311946e-04, -2.59064924e-04, 4.24475322e-04, 8.64766935e-06, -8.80346401e-04, -4.65822290e-04, 1.46741237e-04, 3.56685021e-04, -9.26185690e-04, 8.34594131e-04, -1.06989930e-04, 5.44855080e-04, 3.64181236e-04, -3.47429042e-04, -3.72369832e-04, -2.97406397e-04, 2.06150420e-04, 5.87075949e-04, -6.64789637e-04, 4.28912084e-04, 8.58307991e-04, -1.75266177e-04, 8.79928120e-04, -1.89004364e-04, 8.81552769e-05, 6.36734185e-04, -9.68277920e-04, 3.11788055e-04, 7.57886912e-04, 9.34953569e-04, -2.47946911e-04, 1.53302171e-04, 5.97478996e-04, -2.53253616e-04, 2.29312325e-04, -7.50559208e-04, -4.87921934e-04, -4.14028909e-04, -5.06650948e-04, 2.76281789e-04, 6.19143364e-04, 7.88605132e-04, 4.47717117e-04, -2.03682648e-05, 6.34617638e-04, -7.54247303e-04, -7.52443273e-04, -8.73103854e-04, 6.21452127e-05, -5.08156954e-05, 8.97634774e-04, 2.89845339e-05, -2.17704364e-04, -7.50960491e-04, -3.41716484e-04, -7.68916681e-04, -6.71162736e-04, -3.69461253e-04, -7.34252390e-04, 6.76503187e-05, -5.32741135e-04, 9.02711647e-04, -7.41690514e-04, 6.65260712e-04, -2.43696741e-05, 2.43325630e-04, 8.59927793e-04, -2.57895910e-04, 4.33875772e-04, 9.13934127e-05, 3.34309239e-04, -5.05356642e-04, -5.71637764e-04, 8.73375568e-04, -6.22097927e-04, 6.62013423e-04, -5.84099791e-04, 5.39807836e-04, -7.19183154e-05, 3.56500794e-04, -7.26707804e-04, 2.51671969e-04, -4.75059729e-04, 8.18891858e-04, 3.77306096e-05, -2.10286249e-04, -1.71941210e-04, -6.33325311e-04, 5.15134052e-05, 9.20835766e-04, 5.67668234e-04, 9.32389812e-04, -5.80465596e-04, -8.78936728e-04, 4.44262143e-04, 9.24268854e-04, -9.29471804e-04, -8.20283021e-04, 4.29955369e-04, 
-9.25454151e-05, -3.76301381e-04, -1.21094738e-04, -1.83412179e-04, 9.85971186e-04, -1.77519105e-05, 3.61571438e-04, 3.76928831e-04, -1.78786038e-04, -2.99936946e-04, 7.58341455e-04, -3.24279790e-05, -5.54069178e-04, 5.28737786e-04, 7.65940640e-04, 6.73771370e-04, -2.17277229e-05, -4.08445223e-04, 9.27292043e-04, -5.20134403e-04, -5.85513306e-04, 9.91150737e-04, -1.41596436e-04, -2.82345078e-04, -4.37668175e-04, -4.01786150e-04, -9.28851659e-04, 6.12151169e-04, 3.84094601e-04, -7.67542049e-04, -4.04195773e-04, -3.72379145e-04, 5.57815583e-05, 1.37700437e-04, -7.71536434e-04, 9.28500551e-04, -1.35422393e-04, -5.94689453e-04, -8.93246965e-04, 4.39988158e-04, 9.60051548e-04, 5.05677774e-04, -2.02398496e-06, -5.64724207e-04, -1.56770577e-04, -8.34278471e-04, -4.16954572e-05, 4.45764803e-04, 4.32301138e-04, -1.40444768e-04, 2.33138242e-04, -1.95687408e-05, -3.13134078e-04, -7.02400401e-04, -9.13420052e-04, 4.98964218e-04, -1.78242539e-04, 5.99061023e-04, -1.87499798e-04, 9.51521331e-04, 5.02253941e-04, -4.60130395e-04, -6.74830109e-04, 6.50300295e-04, 8.95398378e-04, 7.52736523e-04, -3.91425972e-04, -2.90023454e-04, 6.16393227e-05, -4.96709137e-04, -9.10982606e-04, -6.82570215e-04, 5.81153261e-04, 4.12369496e-04, 1.97709815e-04, -4.03406564e-04, -1.53987843e-04, 2.78030522e-04, -7.57803849e-04, 7.15100730e-04, 3.98327364e-04, -6.57645229e-04, 4.42969293e-04, -6.36187091e-04, 3.97284195e-04, -3.45088425e-04, -8.83301138e-04, -2.70975634e-05, 7.64860131e-04, 9.16610763e-04, -2.77226733e-04, -5.52534591e-04, -6.84835482e-04, -1.52705179e-04, -9.98962205e-04, -8.81970336e-04, 9.07477457e-04, 6.10043295e-04, -5.49547607e-04, 8.18100001e-04, -7.89020560e-04, 8.99604929e-04, 1.23563630e-04, 8.10005586e-04, 8.55899649e-04, -6.92078320e-04, 1.28470303e-04, -5.44800947e-04, -5.56548475e-04, 1.34947986e-04, -8.84350666e-05, 1.23377511e-04, 1.39125681e-04, -1.62568744e-04, -6.62047474e-04, 1.61205855e-04, 7.51926505e-04, 7.42614211e-04, -6.69916742e-04, 6.60056074e-04, 
6.82296522e-04, -3.86777101e-05, 4.98551643e-04, -8.56725208e-04, 9.79297445e-04, -9.39043239e-04, 6.05766720e-04, 9.36742174e-04, -4.48285828e-05, -2.58430191e-06, 5.39918721e-04, -6.87298074e-04, -2.62357120e-04, -7.20143493e-04, 5.34245453e-04, 3.47504014e-04, -2.39483561e-04, -1.36203147e-04, 5.24296542e-04, 6.23623782e-04, 6.83981867e-04, 3.60102073e-04, 7.88629521e-04, 3.73954172e-05, 2.83613976e-04, 9.56011121e-04, -4.20490484e-04, -8.93550168e-04, 1.39869153e-04, -7.27466424e-04, -7.85416923e-05, -9.52683389e-04, 4.28150583e-04, -6.06924819e-04, -6.13922894e-05, -4.66008532e-05, -4.36828850e-04, 9.91382753e-04, -8.36372026e-04, 2.50996032e-04, 6.27341506e-04, 4.80519637e-04, -5.15225111e-04, -2.59515018e-05, -7.26724684e-05, 6.44338259e-04, -8.07514414e-04, -4.85514465e-04, 8.33143888e-04, 2.44099763e-04, 9.16489866e-04, 3.61290586e-04, -8.98460741e-04, -7.64110591e-05, 8.33873579e-04, 2.70979392e-04, 2.74727732e-04, 8.81038723e-04, -2.99580657e-04, -5.83104615e-04, 8.34427658e-04, 4.80117276e-04, 4.02216741e-04, -2.33919855e-06, -3.39063816e-04, 8.23286304e-04, -8.95432895e-04, -6.38764584e-04, -7.72423009e-05, 8.48955475e-04, 9.04241984e-04, 8.41062225e-04, -8.43735761e-04, -7.57612404e-04, 4.77302208e-04, 5.93541132e-04, 6.76513766e-04, 1.78072776e-04, -9.80433542e-04, -4.56849288e-04, 7.61639327e-04, -6.93925365e-04, -8.19590699e-04, 2.85725575e-04, 5.88593655e-04, -5.20271307e-04, -8.20187212e-04, -1.48870779e-04, -8.99339269e-04, 4.22099460e-04, 5.21900365e-04, 2.90259981e-04, 9.28250956e-04, -8.33772006e-04, 5.06795419e-04, -7.68737693e-04, -6.88438362e-04, -3.12800839e-04, 7.91816914e-04, 8.99660110e-04, 1.62126657e-04, 3.47757712e-04, -6.20392209e-04, -8.70258838e-04, -2.48638098e-04, -8.81525164e-04, -5.96576720e-04, 5.59174747e-04, 4.15783783e-04, 8.44018999e-04, -5.61107125e-04, -7.13689253e-04, -1.83506912e-04, 4.80242947e-04, 3.25008819e-04, -1.31783003e-04, -1.61290984e-04, -2.74879967e-05, -4.42265067e-04, 3.29250965e-04, 4.85369383e-04, 
-4.99524700e-04, -4.78953516e-05, 2.25481654e-05, -1.31745182e-04, 4.52668290e-04, -8.37180822e-04, 8.05083429e-04, 5.02907787e-04, -2.42717491e-04, -3.92102462e-04, 2.84880487e-04, -2.18000336e-04, 5.99259103e-04, 1.74565284e-04, 8.87349248e-04, -8.62279092e-04, -8.13564868e-04, -8.49096919e-04], dtype=float32)
# Analogy-style query: the 10 words ranked closest to the combination
# 'eduardo' + 'marina' - 'aécio'.
model.most_similar(positive=[u'eduardo', u'marina' ], negative=[u'aécio'], topn=10)
[(u'pr\xf3xima', 0.0489029735326767), (u'judici\xe1rio', 0.04631877690553665), (u'nara', 0.04554399847984314), (u'psol', 0.04423150420188904), (u'voltar', 0.04336097091436386), (u'coletiva', 0.04289194196462631), (u'provocou', 0.0422094501554966), (u'ter\xe7a', 0.041922423988580704), (u'eleitorado', 0.04177185148000717), (u'tend\xeancia', 0.04028308391571045)]
# Ask the model which word in the list fits least with the others.
model.doesnt_match(['eduardo', 'marina', 'psb', 'avião', 'futebol'])
'eduardo'
# Similarity score between the vectors learned for 'marina' and 'psb'.
model.similarity('marina','psb')
0.016895555188071957
# Scatter plot of column 1 against column 2 of the model's weight matrix
# (syn0), one point per row; `scatter` comes from the %pylab namespace.
scatter(model.syn0[:,1],model.syn0[:,2]);
len(model.vocab)
3779
model.table
model.vocab
{u'': <gensim.models.word2vec.Vocab at 0x7fa07c337310>, u')foto': <gensim.models.word2vec.Vocab at 0x7fa07c3ee0d0>, u',': <gensim.models.word2vec.Vocab at 0x7fa07c339590>, u'-': <gensim.models.word2vec.Vocab at 0x7fa07c3d56d0>, u'.': <gensim.models.word2vec.Vocab at 0x7fa07c82b050>, u'/': <gensim.models.word2vec.Vocab at 0x7fa07c320310>, u'/02/2013)foto': <gensim.models.word2vec.Vocab at 0x7fa07c361e90>, u'/07)foto': <gensim.models.word2vec.Vocab at 0x7fa07c3f0950>, u'/08).foto': <gensim.models.word2vec.Vocab at 0x7fa07b749b50>, u'/08)foto': <gensim.models.word2vec.Vocab at 0x7fa07c361790>, u'/08/': <gensim.models.word2vec.Vocab at 0x7fa07d762750>, u'/1/2014)foto': <gensim.models.word2vec.Vocab at 0x7fa07c3d5110>, u'/10/2010)foto': <gensim.models.word2vec.Vocab at 0x7fa07c361650>, u'/15/': <gensim.models.word2vec.Vocab at 0x7fa07c35ed10>, u'/16/': <gensim.models.word2vec.Vocab at 0x7fa07c829150>, u'/21/': <gensim.models.word2vec.Vocab at 0x7fa07c81fa90>, u'/6)foto': <gensim.models.word2vec.Vocab at 0x7fa07c385150>, u'/7)foto': <gensim.models.word2vec.Vocab at 0x7fa07c353690>, u'/7/2007)foto': <gensim.models.word2vec.Vocab at 0x7fa07c3d54d0>, u'/8)foto': <gensim.models.word2vec.Vocab at 0x7fa07c361910>, u'/8/2005)foto': <gensim.models.word2vec.Vocab at 0x7fa07c3d5390>, u'/8/2014)foto': <gensim.models.word2vec.Vocab at 0x7fa07c3e4650>, u':': <gensim.models.word2vec.Vocab at 0x7fa07d763d10>, u':00marcadores': <gensim.models.word2vec.Vocab at 0x7fa07c321a50>, u':30marcadores': <gensim.models.word2vec.Vocab at 0x7fa07c349690>, u'a': <gensim.models.word2vec.Vocab at 0x7fa07c3216d0>, u'abaixo': <gensim.models.word2vec.Vocab at 0x7fa07b8cc0d0>, u'abalada': <gensim.models.word2vec.Vocab at 0x7fa07c365110>, u'abandonou': <gensim.models.word2vec.Vocab at 0x7fa07d7aaed0>, u'abastecimento': <gensim.models.word2vec.Vocab at 0x7fa07b419b50>, u'abc': <gensim.models.word2vec.Vocab at 0x7fa07c3de410>, u'abdelmassih': <gensim.models.word2vec.Vocab at 0x7fa07b1f2050>, u'aberta': 
<gensim.models.word2vec.Vocab at 0x7fa07c529090>, u'abertas': <gensim.models.word2vec.Vocab at 0x7fa07b303990>, u'aberto': <gensim.models.word2vec.Vocab at 0x7fa07c828310>, u'abertura': <gensim.models.word2vec.Vocab at 0x7fa07b6da210>, u'abordagem': <gensim.models.word2vec.Vocab at 0x7fa07c365710>, u'aborto': <gensim.models.word2vec.Vocab at 0x7fa07c3856d0>, u'abre': <gensim.models.word2vec.Vocab at 0x7fa07c339e90>, u'abreu': <gensim.models.word2vec.Vocab at 0x7fa07b862d90>, u'abril': <gensim.models.word2vec.Vocab at 0x7fa07c815150>, u'abrir': <gensim.models.word2vec.Vocab at 0x7fa07cc899d0>, u'abriu': <gensim.models.word2vec.Vocab at 0x7fa07c5320d0>, u'absoluta': <gensim.models.word2vec.Vocab at 0x7fa07c3e5690>, u'absolutamente': <gensim.models.word2vec.Vocab at 0x7fa07c0c5f90>, u'acaba': <gensim.models.word2vec.Vocab at 0x7fa07b7e7150>, u'acabar': <gensim.models.word2vec.Vocab at 0x7fa07cb1d250>, u'acabou': <gensim.models.word2vec.Vocab at 0x7fa07d77b790>, u'academia': <gensim.models.word2vec.Vocab at 0x7fa07b73d750>, u'acad\xeamico': <gensim.models.word2vec.Vocab at 0x7fa07b8670d0>, u'aceitar': <gensim.models.word2vec.Vocab at 0x7fa07cb24ed0>, u'aceitou': <gensim.models.word2vec.Vocab at 0x7fa07d328e90>, u'acenam': <gensim.models.word2vec.Vocab at 0x7fa07b555f90>, u'acesse': <gensim.models.word2vec.Vocab at 0x7fa07b88b910>, u'acesso': <gensim.models.word2vec.Vocab at 0x7fa07c38f4d0>, u'acha': <gensim.models.word2vec.Vocab at 0x7fa07b8bc090>, u'achar': <gensim.models.word2vec.Vocab at 0x7fa07b874950>, u'acho': <gensim.models.word2vec.Vocab at 0x7fa07c532810>, u'acidente': <gensim.models.word2vec.Vocab at 0x7fa07c337950>, u'acidentes': <gensim.models.word2vec.Vocab at 0x7fa07b842290>, u'acima': <gensim.models.word2vec.Vocab at 0x7fa07cb1b750>, u'acm': <gensim.models.word2vec.Vocab at 0x7fa07c3ee390>, u'acompanha': <gensim.models.word2vec.Vocab at 0x7fa07c3d5710>, u'acompanhada': <gensim.models.word2vec.Vocab at 0x7fa07c82ced0>, u'acompanham': 
<gensim.models.word2vec.Vocab at 0x7fa07c333dd0>, u'acompanhar': <gensim.models.word2vec.Vocab at 0x7fa07c353e90>, u'acompanhe': <gensim.models.word2vec.Vocab at 0x7fa07b7446d0>, u'acompanhou': <gensim.models.word2vec.Vocab at 0x7fa07c337390>, u'acontece': <gensim.models.word2vec.Vocab at 0x7fa07d32e5d0>, u'acontecendo': <gensim.models.word2vec.Vocab at 0x7fa07b813f10>, u'acontecer': <gensim.models.word2vec.Vocab at 0x7fa07b739d10>, u'aconteceu': <gensim.models.word2vec.Vocab at 0x7fa07c3876d0>, u'acordo': <gensim.models.word2vec.Vocab at 0x7fa07c341450>, u'acordos': <gensim.models.word2vec.Vocab at 0x7fa07d343b90>, u'acredita': <gensim.models.word2vec.Vocab at 0x7fa07cb2fa90>, u'acreditam': <gensim.models.word2vec.Vocab at 0x7fa07cb20910>, u'acreditar': <gensim.models.word2vec.Vocab at 0x7fa07cc61d10>, u'acredito': <gensim.models.word2vec.Vocab at 0x7fa07cc71610>, u'acrescentou': <gensim.models.word2vec.Vocab at 0x7fa07c38fc50>, u'acumulado': <gensim.models.word2vec.Vocab at 0x7fa07b71d990>, u'acusado': <gensim.models.word2vec.Vocab at 0x7fa07b84d350>, u'acusados': <gensim.models.word2vec.Vocab at 0x7fa07b4ea710>, u'acusa\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b733ed0>, u'acusa\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07b470110>, u'acusou': <gensim.models.word2vec.Vocab at 0x7fa07d782a90>, u'administrativa': <gensim.models.word2vec.Vocab at 0x7fa07b5c2590>, u'administrativo': <gensim.models.word2vec.Vocab at 0x7fa07b2f1850>, u'administra\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b6e0f50>, u'admiradores': <gensim.models.word2vec.Vocab at 0x7fa07b61a290>, u'admite': <gensim.models.word2vec.Vocab at 0x7fa07b7bf490>, u'adolescente': <gensim.models.word2vec.Vocab at 0x7fa07b2ac490>, u'adolescentes': <gensim.models.word2vec.Vocab at 0x7fa07b523590>, u'advers\xe1ria': <gensim.models.word2vec.Vocab at 0x7fa07cb24450>, u'advers\xe1rio': <gensim.models.word2vec.Vocab at 0x7fa07cb12d50>, u'advers\xe1rios': <gensim.models.word2vec.Vocab at 
0x7fa07c33b750>, u'advogado': <gensim.models.word2vec.Vocab at 0x7fa07c81fc50>, u'advogados': <gensim.models.word2vec.Vocab at 0x7fa07b7f8f90>, u'aeronave': <gensim.models.word2vec.Vocab at 0x7fa07d782810>, u'aeron\xe1utica': <gensim.models.word2vec.Vocab at 0x7fa07cc613d0>, u'aeroporto': <gensim.models.word2vec.Vocab at 0x7fa07c82cc90>, u'aeroportos': <gensim.models.word2vec.Vocab at 0x7fa07c53a6d0>, u'af': <gensim.models.word2vec.Vocab at 0x7fa07b7a7f10>, u'afetados': <gensim.models.word2vec.Vocab at 0x7fa07b516a90>, u'afinal': <gensim.models.word2vec.Vocab at 0x7fa07c51e7d0>, u'afirma': <gensim.models.word2vec.Vocab at 0x7fa07c387410>, u'afirmando': <gensim.models.word2vec.Vocab at 0x7fa07cc74b90>, u'afirmar': <gensim.models.word2vec.Vocab at 0x7fa07d75c490>, u'afirmou': <gensim.models.word2vec.Vocab at 0x7fa07d343f50>, u'afroreagge': <gensim.models.word2vec.Vocab at 0x7fa07c3f0e10>, u'agenda': <gensim.models.word2vec.Vocab at 0x7fa07d31dd10>, u'agente': <gensim.models.word2vec.Vocab at 0x7fa07b4da5d0>, u'agentes': <gensim.models.word2vec.Vocab at 0x7fa07b42fb90>, u'ago': <gensim.models.word2vec.Vocab at 0x7fa07c3217d0>, u'agora': <gensim.models.word2vec.Vocab at 0x7fa07c341390>, u'agosto': <gensim.models.word2vec.Vocab at 0x7fa07c82b450>, u'agricultura': <gensim.models.word2vec.Vocab at 0x7fa07c54ddd0>, u'agroneg\xf3cio': <gensim.models.word2vec.Vocab at 0x7fa07d77d910>, u'agropecu\xe1ria': <gensim.models.word2vec.Vocab at 0x7fa07cb42a50>, u'aguarda': <gensim.models.word2vec.Vocab at 0x7fa07b24ed90>, u'aguardam': <gensim.models.word2vec.Vocab at 0x7fa07b54efd0>, u'ag\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07d3174d0>, u'ag\xeancias': <gensim.models.word2vec.Vocab at 0x7fa07c51e190>, u'aikawa/instituto': <gensim.models.word2vec.Vocab at 0x7fa07c3f04d0>, u'ainda': <gensim.models.word2vec.Vocab at 0x7fa07c341fd0>, u'aires': <gensim.models.word2vec.Vocab at 0x7fa07b661150>, u'ajuda': <gensim.models.word2vec.Vocab at 0x7fa07b819390>, u'ajudar': 
<gensim.models.word2vec.Vocab at 0x7fa07c5293d0>, u'ajudou': <gensim.models.word2vec.Vocab at 0x7fa07b86cc90>, u'ajuste': <gensim.models.word2vec.Vocab at 0x7fa07b749ed0>, u'ajustes': <gensim.models.word2vec.Vocab at 0x7fa07cb1b210>, u'al': <gensim.models.word2vec.Vocab at 0x7fa07c3e4c90>, u'alagoas': <gensim.models.word2vec.Vocab at 0x7fa07c532c50>, u'alan': <gensim.models.word2vec.Vocab at 0x7fa07c3617d0>, u'alberto': <gensim.models.word2vec.Vocab at 0x7fa07c33b650>, u'albuquerque': <gensim.models.word2vec.Vocab at 0x7fa07d343750>, u'alcance': <gensim.models.word2vec.Vocab at 0x7fa07d75c0d0>, u'alcan\xe7ar': <gensim.models.word2vec.Vocab at 0x7fa07c2a5950>, u'alcan\xe7ou': <gensim.models.word2vec.Vocab at 0x7fa07b648750>, u'alckmin': <gensim.models.word2vec.Vocab at 0x7fa07c334d10>, u'aldeota': <gensim.models.word2vec.Vocab at 0x7fa07b103710>, u'aldo': <gensim.models.word2vec.Vocab at 0x7fa07cb42690>, u'alegre': <gensim.models.word2vec.Vocab at 0x7fa07c3ee550>, u'alegria': <gensim.models.word2vec.Vocab at 0x7fa07c82c810>, u'alemanha': <gensim.models.word2vec.Vocab at 0x7fa07b739590>, u'alem\xe3': <gensim.models.word2vec.Vocab at 0x7fa07b2a7750>, u'alem\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b666e10>, u'alencar': <gensim.models.word2vec.Vocab at 0x7fa07b7224d0>, u'alerta': <gensim.models.word2vec.Vocab at 0x7fa07cb12850>, u'alexandre': <gensim.models.word2vec.Vocab at 0x7fa07c387b10>, u'alfredo': <gensim.models.word2vec.Vocab at 0x7fa07b79bd50>, u'algo': <gensim.models.word2vec.Vocab at 0x7fa07c341d50>, u'algum': <gensim.models.word2vec.Vocab at 0x7fa07d343450>, u'alguma': <gensim.models.word2vec.Vocab at 0x7fa07c341890>, u'algumas': <gensim.models.word2vec.Vocab at 0x7fa07c3349d0>, u'alguns': <gensim.models.word2vec.Vocab at 0x7fa07c334b50>, u'algu\xe9m': <gensim.models.word2vec.Vocab at 0x7fa07cb2f310>, u'ali': <gensim.models.word2vec.Vocab at 0x7fa07b8cc210>, u'aliado': <gensim.models.word2vec.Vocab at 0x7fa07c8159d0>, u'aliados': 
<gensim.models.word2vec.Vocab at 0x7fa07cb203d0>, u'alian\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07c321050>, u'alian\xe7as': <gensim.models.word2vec.Vocab at 0x7fa07c334bd0>, u'alice': <gensim.models.word2vec.Vocab at 0x7fa07c339310>, u'alimenta\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b6e5790>, u'alimentos': <gensim.models.word2vec.Vocab at 0x7fa07b4c4ed0>, u'ali\xe1s': <gensim.models.word2vec.Vocab at 0x7fa07d31d110>, u'all': <gensim.models.word2vec.Vocab at 0x7fa07b3d8890>, u'almeida': <gensim.models.word2vec.Vocab at 0x7fa07b451d10>, u'aloizio': <gensim.models.word2vec.Vocab at 0x7fa07c3d5750>, u'alta': <gensim.models.word2vec.Vocab at 0x7fa07b862ad0>, u'altera\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c532b90>, u'alternativa': <gensim.models.word2vec.Vocab at 0x7fa07c32b110>, u'alternativas': <gensim.models.word2vec.Vocab at 0x7fa07c84b0d0>, u'alto': <gensim.models.word2vec.Vocab at 0x7fa07c3e4450>, u'altos': <gensim.models.word2vec.Vocab at 0x7fa07b790b90>, u'altura': <gensim.models.word2vec.Vocab at 0x7fa07b7c47d0>, u'alunos': <gensim.models.word2vec.Vocab at 0x7fa07b6a43d0>, u'alves': <gensim.models.word2vec.Vocab at 0x7fa07c519550>, u'alvo': <gensim.models.word2vec.Vocab at 0x7fa07c33b590>, u'al\xe9m': <gensim.models.word2vec.Vocab at 0x7fa07c339850>, u'amanh\xe3': <gensim.models.word2vec.Vocab at 0x7fa07cb24090>, u'amaral': <gensim.models.word2vec.Vocab at 0x7fa07cb1d4d0>, u'amaro': <gensim.models.word2vec.Vocab at 0x7fa07cc5ddd0>, u'amazonasfoto': <gensim.models.word2vec.Vocab at 0x7fa07c3f0810>, u'ambiental': <gensim.models.word2vec.Vocab at 0x7fa07c527c90>, u'ambientalista': <gensim.models.word2vec.Vocab at 0x7fa07c81fb10>, u'ambiente': <gensim.models.word2vec.Vocab at 0x7fa07d7aa390>, u'ambos': <gensim.models.word2vec.Vocab at 0x7fa07c8294d0>, u'amea\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07c34f350>, u'americana': <gensim.models.word2vec.Vocab at 0x7fa07b5a0910>, u'americano': <gensim.models.word2vec.Vocab at 
0x7fa07b8746d0>, u'americanos': <gensim.models.word2vec.Vocab at 0x7fa07b5e4a50>, u'amiga': <gensim.models.word2vec.Vocab at 0x7fa07b7e7290>, u'amigo': <gensim.models.word2vec.Vocab at 0x7fa07c532390>, u'amigos': <gensim.models.word2vec.Vocab at 0x7fa07c365ad0>, u'amizade': <gensim.models.word2vec.Vocab at 0x7fa07b66b9d0>, u'amor': <gensim.models.word2vec.Vocab at 0x7fa07b5d8b50>, u'amorim': <gensim.models.word2vec.Vocab at 0x7fa07b83d490>, u'ampla': <gensim.models.word2vec.Vocab at 0x7fa07b79bcd0>, u'ampliar': <gensim.models.word2vec.Vocab at 0x7fa07b6d6c90>, u'amplia\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b6da4d0>, u'am\xe9lia': <gensim.models.word2vec.Vocab at 0x7fa07c3f0ad0>, u'am\xe9rica': <gensim.models.word2vec.Vocab at 0x7fa07c339410>, u'ana': <gensim.models.word2vec.Vocab at 0x7fa07c84b390>, u'anac': <gensim.models.word2vec.Vocab at 0x7fa07b7ac610>, u'analisar': <gensim.models.word2vec.Vocab at 0x7fa07c81f590>, u'analista': <gensim.models.word2vec.Vocab at 0x7fa07b59ba50>, u'analistas': <gensim.models.word2vec.Vocab at 0x7fa07c333590>, u'anatel': <gensim.models.word2vec.Vocab at 0x7fa07b227110>, u'and': <gensim.models.word2vec.Vocab at 0x7fa07c35e710>, u'anda': <gensim.models.word2vec.Vocab at 0x7fa07b749350>, u'andar': <gensim.models.word2vec.Vocab at 0x7fa07b8052d0>, u'andrade': <gensim.models.word2vec.Vocab at 0x7fa07b7a7f50>, u'andr\xe9': <gensim.models.word2vec.Vocab at 0x7fa07c34f9d0>, u'animais': <gensim.models.word2vec.Vocab at 0x7fa07b255ed0>, u'animal': <gensim.models.word2vec.Vocab at 0x7fa07b708850>, u'anima\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b531690>, u'anivers\xe1rio': <gensim.models.word2vec.Vocab at 0x7fa07c3f0150>, u'ano': <gensim.models.word2vec.Vocab at 0x7fa07d75c790>, u'anofoto': <gensim.models.word2vec.Vocab at 0x7fa07c3e4b10>, u'anos': <gensim.models.word2vec.Vocab at 0x7fa07c33d6d0>, u'ante': <gensim.models.word2vec.Vocab at 0x7fa07cb247d0>, u'anterior': <gensim.models.word2vec.Vocab at 0x7fa07cb24710>, 
u'anteriores': <gensim.models.word2vec.Vocab at 0x7fa07b5bb250>, u'anteriormente': <gensim.models.word2vec.Vocab at 0x7fa07d762e50>, u'antes': <gensim.models.word2vec.Vocab at 0x7fa07c31cd90>, u'anthony': <gensim.models.word2vec.Vocab at 0x7fa07b8749d0>, u'antigo': <gensim.models.word2vec.Vocab at 0x7fa07b88be90>, u'antonio': <gensim.models.word2vec.Vocab at 0x7fa07cb20a90>, u'ant\xf4nio': <gensim.models.word2vec.Vocab at 0x7fa07c81fcd0>, u'anual': <gensim.models.word2vec.Vocab at 0x7fa07b755f50>, u'anuncia': <gensim.models.word2vec.Vocab at 0x7fa07b72dd10>, u'anunciada': <gensim.models.word2vec.Vocab at 0x7fa07cb1d550>, u'anunciado': <gensim.models.word2vec.Vocab at 0x7fa07b8626d0>, u'anunciar': <gensim.models.word2vec.Vocab at 0x7fa07cb20b90>, u'anunciou': <gensim.models.word2vec.Vocab at 0x7fa07c366390>, u'an\xe1lise': <gensim.models.word2vec.Vocab at 0x7fa07d32cc10>, u'an\xfancio': <gensim.models.word2vec.Vocab at 0x7fa07cb1d8d0>, u'ao': <gensim.models.word2vec.Vocab at 0x7fa07d32eb50>, u'aos': <gensim.models.word2vec.Vocab at 0x7fa07b6d6f50>, u'aparece': <gensim.models.word2vec.Vocab at 0x7fa07d77d2d0>, u'aparecem': <gensim.models.word2vec.Vocab at 0x7fa07c3e5d90>, u'apareceu': <gensim.models.word2vec.Vocab at 0x7fa07c3f4850>, u'aparelhos': <gensim.models.word2vec.Vocab at 0x7fa07b1e7750>, u'apartamento': <gensim.models.word2vec.Vocab at 0x7fa07b47e290>, u'apenas': <gensim.models.word2vec.Vocab at 0x7fa07c33d710>, u'aperto': <gensim.models.word2vec.Vocab at 0x7fa07c385e50>, u'apesar': <gensim.models.word2vec.Vocab at 0x7fa07cb12c50>, u'aplicativo': <gensim.models.word2vec.Vocab at 0x7fa07c5749d0>, u'apoia': <gensim.models.word2vec.Vocab at 0x7fa07c532110>, u'apoiar': <gensim.models.word2vec.Vocab at 0x7fa07c81ff90>, u'apoio': <gensim.models.word2vec.Vocab at 0x7fa07c34f890>, u'apoios': <gensim.models.word2vec.Vocab at 0x7fa07c828ed0>, u'apoiou': <gensim.models.word2vec.Vocab at 0x7fa07b8856d0>, u'aponta': <gensim.models.word2vec.Vocab at 0x7fa07b86c550>, 
u'apontado': <gensim.models.word2vec.Vocab at 0x7fa07c519fd0>, u'apontam': <gensim.models.word2vec.Vocab at 0x7fa07c82cb10>, u'apontar': <gensim.models.word2vec.Vocab at 0x7fa07b7f8dd0>, u'apontou': <gensim.models.word2vec.Vocab at 0x7fa07c527810>, u'aposentadoria': <gensim.models.word2vec.Vocab at 0x7fa07b6e5690>, u'aposta': <gensim.models.word2vec.Vocab at 0x7fa07c333750>, u'appeared': <gensim.models.word2vec.Vocab at 0x7fa07c339d90>, u'apreens\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b59bdd0>, u'apresenta': <gensim.models.word2vec.Vocab at 0x7fa07cb2f750>, u'apresentada': <gensim.models.word2vec.Vocab at 0x7fa07c815110>, u'apresentado': <gensim.models.word2vec.Vocab at 0x7fa07b744750>, u'apresentadores': <gensim.models.word2vec.Vocab at 0x7fa07c337e10>, u'apresentando': <gensim.models.word2vec.Vocab at 0x7fa07c53a490>, u'apresentar': <gensim.models.word2vec.Vocab at 0x7fa07c339890>, u'apresenta\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b5dee90>, u'apresenta\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07b41d1d0>, u'apresentou': <gensim.models.word2vec.Vocab at 0x7fa07c31ca10>, u'aprovada': <gensim.models.word2vec.Vocab at 0x7fa07b564790>, u'aprova\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c515190>, u'aproveitou': <gensim.models.word2vec.Vocab at 0x7fa07d782cd0>, u'aprovou': <gensim.models.word2vec.Vocab at 0x7fa07c321210>, u'aproximadamente': <gensim.models.word2vec.Vocab at 0x7fa07b84d790>, u'aproxima\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c3e48d0>, u'apurar': <gensim.models.word2vec.Vocab at 0x7fa07b700790>, u'apura\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c515310>, u'ap\xf3s': <gensim.models.word2vec.Vocab at 0x7fa07c31c410>, u'aquavi\xe1rios': <gensim.models.word2vec.Vocab at 0x7fa07c06b7d0>, u'aqui': <gensim.models.word2vec.Vocab at 0x7fa07d31d550>, u'aquisi\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b787e10>, u'ar': <gensim.models.word2vec.Vocab at 0x7fa07d7824d0>, u'aracaju': 
<gensim.models.word2vec.Vocab at 0x7fa07c387e10>, u'arag\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b1482d0>, u'arapiraca': <gensim.models.word2vec.Vocab at 0x7fa07c3e4c50>, u'ara\xfajo': <gensim.models.word2vec.Vocab at 0x7fa07b4bb8d0>, u'arcebispo': <gensim.models.word2vec.Vocab at 0x7fa07c361690>, u'arcoplex': <gensim.models.word2vec.Vocab at 0x7fa07b1036d0>, u'argentina': <gensim.models.word2vec.Vocab at 0x7fa07b65cf90>, u'argentino': <gensim.models.word2vec.Vocab at 0x7fa07b831250>, u'argumento': <gensim.models.word2vec.Vocab at 0x7fa07b879450>, u'argumentou': <gensim.models.word2vec.Vocab at 0x7fa07c519490>, u'arma': <gensim.models.word2vec.Vocab at 0x7fa07b6b0290>, u'armado': <gensim.models.word2vec.Vocab at 0x7fa07b34e350>, u'armando': <gensim.models.word2vec.Vocab at 0x7fa07c2a5650>, u'armas': <gensim.models.word2vec.Vocab at 0x7fa07d317d10>, u'arraes': <gensim.models.word2vec.Vocab at 0x7fa07c3d5310>, u'arrecada\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07cc89310>, u'arremeteu': <gensim.models.word2vec.Vocab at 0x7fa07cc614d0>, u'arruda': <gensim.models.word2vec.Vocab at 0x7fa07b70d990>, u'arte': <gensim.models.word2vec.Vocab at 0x7fa07b432210>, u'artes': <gensim.models.word2vec.Vocab at 0x7fa07b678990>, u'articula\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b82b650>, u'artigo': <gensim.models.word2vec.Vocab at 0x7fa07b7f80d0>, u'artigos': <gensim.models.word2vec.Vocab at 0x7fa07b87f690>, u'artista': <gensim.models.word2vec.Vocab at 0x7fa07c263550>, u'artistas': <gensim.models.word2vec.Vocab at 0x7fa07b378f50>, u'as': <gensim.models.word2vec.Vocab at 0x7fa07d784110>, u'aspectos': <gensim.models.word2vec.Vocab at 0x7fa07c52d850>, u'assembleia': <gensim.models.word2vec.Vocab at 0x7fa07b84df50>, u'assessor': <gensim.models.word2vec.Vocab at 0x7fa07cb2f150>, u'assessores': <gensim.models.word2vec.Vocab at 0x7fa07cb2fe50>, u'assessoria': <gensim.models.word2vec.Vocab at 0x7fa07b813190>, u'assim': <gensim.models.word2vec.Vocab at 
0x7fa07c33dd90>, u'assinada': <gensim.models.word2vec.Vocab at 0x7fa07c38fbd0>, u'assinado': <gensim.models.word2vec.Vocab at 0x7fa07b84dbd0>, u'assis': <gensim.models.word2vec.Vocab at 0x7fa07b837810>, u'assistente': <gensim.models.word2vec.Vocab at 0x7fa07b368ad0>, u'assist\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b3e8510>, u'associados': <gensim.models.word2vec.Vocab at 0x7fa07b848250>, u'associa\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b744f50>, u'assume': <gensim.models.word2vec.Vocab at 0x7fa07cb174d0>, u'assumir': <gensim.models.word2vec.Vocab at 0x7fa07c32b3d0>, u'assumiu': <gensim.models.word2vec.Vocab at 0x7fa07c3413d0>, u'assunto': <gensim.models.word2vec.Vocab at 0x7fa07c320350>, u'assuntos': <gensim.models.word2vec.Vocab at 0x7fa07b782810>, u'assun\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b3ce950>, u'atacama': <gensim.models.word2vec.Vocab at 0x7fa07afd9d90>, u'atacar': <gensim.models.word2vec.Vocab at 0x7fa07b72d190>, u'ataque': <gensim.models.word2vec.Vocab at 0x7fa07c38f390>, u'ataques': <gensim.models.word2vec.Vocab at 0x7fa07b879c10>, u'atende': <gensim.models.word2vec.Vocab at 0x7fa07b787f90>, u'atender': <gensim.models.word2vec.Vocab at 0x7fa07c32bfd0>, u'atendimento': <gensim.models.word2vec.Vocab at 0x7fa07b42aad0>, u'aten\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b867c50>, u'atingir': <gensim.models.word2vec.Vocab at 0x7fa07b62c910>, u'atingiu': <gensim.models.word2vec.Vocab at 0x7fa07b57fed0>, u'atitude': <gensim.models.word2vec.Vocab at 0x7fa07d77b890>, u'atividade': <gensim.models.word2vec.Vocab at 0x7fa07b790110>, u'atividades': <gensim.models.word2vec.Vocab at 0x7fa07b790d50>, u'ativos': <gensim.models.word2vec.Vocab at 0x7fa07b7555d0>, u'ato': <gensim.models.word2vec.Vocab at 0x7fa07cb17550>, u'ator': <gensim.models.word2vec.Vocab at 0x7fa07b678b10>, u'atos': <gensim.models.word2vec.Vocab at 0x7fa07b4da4d0>, u'atrair': <gensim.models.word2vec.Vocab at 0x7fa07c3f4b90>, u'atraso': 
<gensim.models.word2vec.Vocab at 0x7fa07b5d8310>, u'atrav\xe9s': <gensim.models.word2vec.Vocab at 0x7fa07b7625d0>, u'atra\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07bf95d90>, u'atriz': <gensim.models.word2vec.Vocab at 0x7fa07b27b650>, u'atr\xe1s': <gensim.models.word2vec.Vocab at 0x7fa07cb245d0>, u'atuais': <gensim.models.word2vec.Vocab at 0x7fa07c815610>, u'atual': <gensim.models.word2vec.Vocab at 0x7fa07d7827d0>, u'atualmente': <gensim.models.word2vec.Vocab at 0x7fa07c519290>, u'atuar': <gensim.models.word2vec.Vocab at 0x7fa07d318e90>, u'atua\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b744cd0>, u'at\xe9': <gensim.models.word2vec.Vocab at 0x7fa07d318ed0>, u'audi\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07c84b950>, u'augusto': <gensim.models.word2vec.Vocab at 0x7fa07cc61fd0>, u'aula': <gensim.models.word2vec.Vocab at 0x7fa07b15d190>, u'aulas': <gensim.models.word2vec.Vocab at 0x7fa07b27e890>, u'aumenta': <gensim.models.word2vec.Vocab at 0x7fa07b503f10>, u'aumentar': <gensim.models.word2vec.Vocab at 0x7fa07b84d550>, u'aumento': <gensim.models.word2vec.Vocab at 0x7fa07d31ded0>, u'aumentou': <gensim.models.word2vec.Vocab at 0x7fa07b6fb910>, u'aus\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b5a7790>, u'autonomia': <gensim.models.word2vec.Vocab at 0x7fa07b854850>, u'autor': <gensim.models.word2vec.Vocab at 0x7fa07b7e1c50>, u'autores': <gensim.models.word2vec.Vocab at 0x7fa07c261110>, u'autoria': <gensim.models.word2vec.Vocab at 0x7fa07b4752d0>, u'autoridade': <gensim.models.word2vec.Vocab at 0x7fa07d762410>, u'autoridades': <gensim.models.word2vec.Vocab at 0x7fa07b755190>, u'autoriza\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b8134d0>, u'auxiliar': <gensim.models.word2vec.Vocab at 0x7fa07b7d3ed0>, u'aux\xedlio': <gensim.models.word2vec.Vocab at 0x7fa07b564a90>, u'av': <gensim.models.word2vec.Vocab at 0x7fa07b83d710>, u'avalia': <gensim.models.word2vec.Vocab at 0x7fa07c385350>, u'avaliar': <gensim.models.word2vec.Vocab at 
0x7fa07b602950>, u'avalia\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d32e650>, u'avaliou': <gensim.models.word2vec.Vocab at 0x7fa07b499910>, u'avan\xe7o': <gensim.models.word2vec.Vocab at 0x7fa07c339ad0>, u'avan\xe7ou': <gensim.models.word2vec.Vocab at 0x7fa07b437890>, u'avenida': <gensim.models.word2vec.Vocab at 0x7fa07b6aad10>, u'aventura': <gensim.models.word2vec.Vocab at 0x7fa07d763c10>, u'avia\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c31c150>, u'avi\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c320590>, u'avi\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07b523c90>, u'av\xf4': <gensim.models.word2vec.Vocab at 0x7fa07c3d5350>, u'a\xe7o': <gensim.models.word2vec.Vocab at 0x7fa07b2e4dd0>, u'a\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b85df50>, u'a\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07d317590>, u'a\xe7\xfacar': <gensim.models.word2vec.Vocab at 0x7fa07b65c410>, u'a\xe9cio': <gensim.models.word2vec.Vocab at 0x7fa07c33db10>, u'a\xe9rea': <gensim.models.word2vec.Vocab at 0x7fa07c387850>, u'a\xe9reo': <gensim.models.word2vec.Vocab at 0x7fa07cb12050>, u'a\xe9reos': <gensim.models.word2vec.Vocab at 0x7fa07b625390>, u'a\xed': <gensim.models.word2vec.Vocab at 0x7fa07d343950>, u'b': <gensim.models.word2vec.Vocab at 0x7fa07c84b510>, u'bacia': <gensim.models.word2vec.Vocab at 0x7fa07b854f10>, u'bahia': <gensim.models.word2vec.Vocab at 0x7fa07c353550>, u'baiana': <gensim.models.word2vec.Vocab at 0x7fa07c3f0450>, u'baiano': <gensim.models.word2vec.Vocab at 0x7fa07b493110>, u'bairro': <gensim.models.word2vec.Vocab at 0x7fa07cb17590>, u'bairros': <gensim.models.word2vec.Vocab at 0x7fa07b5284d0>, u'baixa': <gensim.models.word2vec.Vocab at 0x7fa07b733110>, u'baixo': <gensim.models.word2vec.Vocab at 0x7fa07cb17b90>, u'baixos': <gensim.models.word2vec.Vocab at 0x7fa07b3bd450>, u'balan\xe7o': <gensim.models.word2vec.Vocab at 0x7fa07c3331d0>, u'bancada': <gensim.models.word2vec.Vocab at 0x7fa07d31df10>, u'banco': <gensim.models.word2vec.Vocab 
at 0x7fa07c321d50>, u'bancos': <gensim.models.word2vec.Vocab at 0x7fa07d77b050>, u'band': <gensim.models.word2vec.Vocab at 0x7fa07b56b8d0>, u'banda': <gensim.models.word2vec.Vocab at 0x7fa07b749f50>, u'bandeira': <gensim.models.word2vec.Vocab at 0x7fa07d77d6d0>, u'bandeirantes': <gensim.models.word2vec.Vocab at 0x7fa07c3e5350>, u'bandfoto': <gensim.models.word2vec.Vocab at 0x7fa07b56bd90>, u'banqueiros': <gensim.models.word2vec.Vocab at 0x7fa07b858450>, u'bar': <gensim.models.word2vec.Vocab at 0x7fa07b5bb490>, u'barbosa': <gensim.models.word2vec.Vocab at 0x7fa07b854490>, u'barra': <gensim.models.word2vec.Vocab at 0x7fa07b6f5750>, u'barros': <gensim.models.word2vec.Vocab at 0x7fa07b73df50>, u'base': <gensim.models.word2vec.Vocab at 0x7fa07d75c950>, u'baseada': <gensim.models.word2vec.Vocab at 0x7fa07b744b90>, u'baseado': <gensim.models.word2vec.Vocab at 0x7fa07b874410>, u'basta': <gensim.models.word2vec.Vocab at 0x7fa07b684a90>, u'bastante': <gensim.models.word2vec.Vocab at 0x7fa07d328290>, u'bastidores': <gensim.models.word2vec.Vocab at 0x7fa07cc8b150>, u'batalha': <gensim.models.word2vec.Vocab at 0x7fa07b62ce50>, u'bate': <gensim.models.word2vec.Vocab at 0x7fa07b86c4d0>, u'bater': <gensim.models.word2vec.Vocab at 0x7fa07c54d0d0>, u'bateu': <gensim.models.word2vec.Vocab at 0x7fa07b33f450>, u'batista': <gensim.models.word2vec.Vocab at 0x7fa07c35e210>, u'bbc': <gensim.models.word2vec.Vocab at 0x7fa07b874990>, u'bc': <gensim.models.word2vec.Vocab at 0x7fa07b74ffd0>, u'beb\xea': <gensim.models.word2vec.Vocab at 0x7fa07b508c50>, u'beira': <gensim.models.word2vec.Vocab at 0x7fa07b41dd10>, u'bela': <gensim.models.word2vec.Vocab at 0x7fa07b7908d0>, u'beleza': <gensim.models.word2vec.Vocab at 0x7fa07b6fb6d0>, u'belo': <gensim.models.word2vec.Vocab at 0x7fa07c349190>, u'bem': <gensim.models.word2vec.Vocab at 0x7fa07d328fd0>, u'benef\xedcio': <gensim.models.word2vec.Vocab at 0x7fa07b728ad0>, u'benef\xedcios': <gensim.models.word2vec.Vocab at 0x7fa07b3fca10>, u'benfica': 
<gensim.models.word2vec.Vocab at 0x7fa07b0f5d10>, u'bens': <gensim.models.word2vec.Vocab at 0x7fa07b7ac4d0>, u'berlim': <gensim.models.word2vec.Vocab at 0x7fa07b2a73d0>, u'bernardo': <gensim.models.word2vec.Vocab at 0x7fa07c3f06d0>, u'beto': <gensim.models.word2vec.Vocab at 0x7fa07d343710>, u'bezerra': <gensim.models.word2vec.Vocab at 0x7fa07c527610>, u'bezerra/futura': <gensim.models.word2vec.Vocab at 0x7fa07b56b9d0>, u'bienal': <gensim.models.word2vec.Vocab at 0x7fa07c3e5d50>, u'bilh\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d317410>, u'bilh\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c3395d0>, u'bloco': <gensim.models.word2vec.Vocab at 0x7fa07b6a4e10>, u'blog': <gensim.models.word2vec.Vocab at 0x7fa07c337050>, u'bndes': <gensim.models.word2vec.Vocab at 0x7fa07b71d3d0>, u'boa': <gensim.models.word2vec.Vocab at 0x7fa07c341c10>, u'boas': <gensim.models.word2vec.Vocab at 0x7fa07b70eb90>, u'boatos': <gensim.models.word2vec.Vocab at 0x7fa07b728b50>, u'boca': <gensim.models.word2vec.Vocab at 0x7fa07b776d90>, u'bola': <gensim.models.word2vec.Vocab at 0x7fa07b72d650>, u'bolsa': <gensim.models.word2vec.Vocab at 0x7fa07b842ad0>, u'bolsas': <gensim.models.word2vec.Vocab at 0x7fa07b854f90>, u'bom': <gensim.models.word2vec.Vocab at 0x7fa07c341690>, u'bombeiros': <gensim.models.word2vec.Vocab at 0x7fa07b8318d0>, u'bondeblog': <gensim.models.word2vec.Vocab at 0x7fa07d32c910>, u'bonner': <gensim.models.word2vec.Vocab at 0x7fa07b854710>, u'bons': <gensim.models.word2vec.Vocab at 0x7fa07c3de8d0>, u'bordo': <gensim.models.word2vec.Vocab at 0x7fa07c3652d0>, u'bovespa': <gensim.models.word2vec.Vocab at 0x7fa07b4373d0>, u'br-': <gensim.models.word2vec.Vocab at 0x7fa07b3e5190>, u'branca': <gensim.models.word2vec.Vocab at 0x7fa07b493cd0>, u'branco': <gensim.models.word2vec.Vocab at 0x7fa07cb24950>, u'brancos': <gensim.models.word2vec.Vocab at 0x7fa07c82b110>, u'brasil': <gensim.models.word2vec.Vocab at 0x7fa07c334150>, u'brasila\xe9cio': <gensim.models.word2vec.Vocab at 
0x7fa07c3eead0>, u'brasilcandidato': <gensim.models.word2vec.Vocab at 0x7fa07c3f0ed0>, u'brasilcartaz': <gensim.models.word2vec.Vocab at 0x7fa07b55c7d0>, u'brasileira': <gensim.models.word2vec.Vocab at 0x7fa07c337910>, u'brasileiras': <gensim.models.word2vec.Vocab at 0x7fa07c529c50>, u'brasileiro': <gensim.models.word2vec.Vocab at 0x7fa07d7627d0>, u'brasileiros': <gensim.models.word2vec.Vocab at 0x7fa07cb12b50>, u'brasilhomem': <gensim.models.word2vec.Vocab at 0x7fa07b55c750>, u'bras\xedlia': <gensim.models.word2vec.Vocab at 0x7fa07d32e610>, u'brazil': <gensim.models.word2vec.Vocab at 0x7fa07c519650>, u'bra\xe7o': <gensim.models.word2vec.Vocab at 0x7fa07b71d490>, u'briga': <gensim.models.word2vec.Vocab at 0x7fa07b819fd0>, u'brito': <gensim.models.word2vec.Vocab at 0x7fa07b7e70d0>, u'brito/coliga\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c3f0e90>, u'brit\xe2nico': <gensim.models.word2vec.Vocab at 0x7fa07b8745d0>, u'brown': <gensim.models.word2vec.Vocab at 0x7fa07b7ca550>, u'bruno': <gensim.models.word2vec.Vocab at 0x7fa07b7ac7d0>, u'bruto': <gensim.models.word2vec.Vocab at 0x7fa07b3098d0>, u'buenos': <gensim.models.word2vec.Vocab at 0x7fa07b661110>, u'busca': <gensim.models.word2vec.Vocab at 0x7fa07d760f50>, u'buscando': <gensim.models.word2vec.Vocab at 0x7fa07c333650>, u'buscar': <gensim.models.word2vec.Vocab at 0x7fa07b825850>, u'by': <gensim.models.word2vec.Vocab at 0x7fa07c35e390>, u'b\xe1sica': <gensim.models.word2vec.Vocab at 0x7fa07b755cd0>, u'b\xe1sico': <gensim.models.word2vec.Vocab at 0x7fa07b620fd0>, u'c': <gensim.models.word2vec.Vocab at 0x7fa07b770f10>, u'cabe': <gensim.models.word2vec.Vocab at 0x7fa07b8c7b10>, u'cabe\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07c34f5d0>, u'cabe\xe7alho': <gensim.models.word2vec.Vocab at 0x7fa07b88b890>, u'cabo': <gensim.models.word2vec.Vocab at 0x7fa07b831650>, u'cada': <gensim.models.word2vec.Vocab at 0x7fa07c333710>, u'cadastro': <gensim.models.word2vec.Vocab at 0x7fa07bf1fc10>, u'cadeia': 
<gensim.models.word2vec.Vocab at 0x7fa07b40a1d0>, u'caf\xe9': <gensim.models.word2vec.Vocab at 0x7fa07c353a50>, u'cai': <gensim.models.word2vec.Vocab at 0x7fa07b8bc4d0>, u'cair': <gensim.models.word2vec.Vocab at 0x7fa07cb24490>, u'caiu': <gensim.models.word2vec.Vocab at 0x7fa07d782890>, u'caixa': <gensim.models.word2vec.Vocab at 0x7fa07c320790>, u'caixa-preta': <gensim.models.word2vec.Vocab at 0x7fa07b700f10>, u'caixas': <gensim.models.word2vec.Vocab at 0x7fa07d782550>, u'caix\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b8cc190>, u'cambial': <gensim.models.word2vec.Vocab at 0x7fa07b232c90>, u'camilo': <gensim.models.word2vec.Vocab at 0x7fa07b8bc610>, u'caminha': <gensim.models.word2vec.Vocab at 0x7fa07c3de350>, u'caminhada': <gensim.models.word2vec.Vocab at 0x7fa07c334750>, u'caminho': <gensim.models.word2vec.Vocab at 0x7fa07c341350>, u'caminhos': <gensim.models.word2vec.Vocab at 0x7fa07cb1db90>, u'caminh\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b831890>, u'caminh\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07b633b90>, u'campal': <gensim.models.word2vec.Vocab at 0x7fa07b61a610>, u'campanha': <gensim.models.word2vec.Vocab at 0x7fa07c33b8d0>, u'campanhas': <gensim.models.word2vec.Vocab at 0x7fa07c32b310>, u'campeonato': <gensim.models.word2vec.Vocab at 0x7fa07b2c0590>, u'campo': <gensim.models.word2vec.Vocab at 0x7fa07c333610>, u'campos': <gensim.models.word2vec.Vocab at 0x7fa07c341a90>, u'camposeduardo': <gensim.models.word2vec.Vocab at 0x7fa07c385050>, u'camposfoto': <gensim.models.word2vec.Vocab at 0x7fa07b57fa10>, u'canad\xe1': <gensim.models.word2vec.Vocab at 0x7fa07b75b150>, u'canal': <gensim.models.word2vec.Vocab at 0x7fa07c3e4890>, u'candidata': <gensim.models.word2vec.Vocab at 0x7fa07c321e90>, u'candidato': <gensim.models.word2vec.Vocab at 0x7fa07c337650>, u'candidatos': <gensim.models.word2vec.Vocab at 0x7fa07cb20590>, u'candidatura': <gensim.models.word2vec.Vocab at 0x7fa07c341410>, u'candidaturas': <gensim.models.word2vec.Vocab at 
0x7fa07c829490>, u'cantor': <gensim.models.word2vec.Vocab at 0x7fa07b3108d0>, u'cantora': <gensim.models.word2vec.Vocab at 0x7fa07b1c8a90>, u'can\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c552490>, u'capa': <gensim.models.word2vec.Vocab at 0x7fa07c82c950>, u'capacidade': <gensim.models.word2vec.Vocab at 0x7fa07d32c810>, u'capaz': <gensim.models.word2vec.Vocab at 0x7fa07c33b6d0>, u'capital': <gensim.models.word2vec.Vocab at 0x7fa07c34f150>, u'capit\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b1d4b10>, u'cara': <gensim.models.word2vec.Vocab at 0x7fa07b762550>, u'caracter\xedsticas': <gensim.models.word2vec.Vocab at 0x7fa07d760050>, u'caravana': <gensim.models.word2vec.Vocab at 0x7fa07b09f410>, u'cardoso': <gensim.models.word2vec.Vocab at 0x7fa07c334f50>, u'carga': <gensim.models.word2vec.Vocab at 0x7fa07b6f5b90>, u'cargo': <gensim.models.word2vec.Vocab at 0x7fa07d7aa1d0>, u'cargos': <gensim.models.word2vec.Vocab at 0x7fa07b7b1f10>, u'carlos': <gensim.models.word2vec.Vocab at 0x7fa07c31c510>, u'carlos\xe0s': <gensim.models.word2vec.Vocab at 0x7fa07c321a10>, u'caro': <gensim.models.word2vec.Vocab at 0x7fa07b88b550>, u'carolina': <gensim.models.word2vec.Vocab at 0x7fa07c3e4dd0>, u'carreata': <gensim.models.word2vec.Vocab at 0x7fa07c3eee50>, u'carrega': <gensim.models.word2vec.Vocab at 0x7fa07c34f650>, u'carregam': <gensim.models.word2vec.Vocab at 0x7fa07b555e90>, u'carreira': <gensim.models.word2vec.Vocab at 0x7fa07b848090>, u'carro': <gensim.models.word2vec.Vocab at 0x7fa07b837210>, u'carros': <gensim.models.word2vec.Vocab at 0x7fa07b55cfd0>, u'carta': <gensim.models.word2vec.Vocab at 0x7fa07c34f110>, u'cartaz': <gensim.models.word2vec.Vocab at 0x7fa07b555650>, u'cartazes': <gensim.models.word2vec.Vocab at 0x7fa07b55c910>, u'carteira': <gensim.models.word2vec.Vocab at 0x7fa07b5bbe10>, u'cartel': <gensim.models.word2vec.Vocab at 0x7fa07b5c2790>, u'cart\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b708b50>, u'cart\xf5es': <gensim.models.word2vec.Vocab at 
0x7fa07c5747d0>, u'carvalho': <gensim.models.word2vec.Vocab at 0x7fa07b6b0bd0>, u'car\xe1ter': <gensim.models.word2vec.Vocab at 0x7fa07c341950>, u'casa': <gensim.models.word2vec.Vocab at 0x7fa07cb175d0>, u'casal': <gensim.models.word2vec.Vocab at 0x7fa07cc61f10>, u'casamento': <gensim.models.word2vec.Vocab at 0x7fa07c815c10>, u'casas': <gensim.models.word2vec.Vocab at 0x7fa07b8373d0>, u'caso': <gensim.models.word2vec.Vocab at 0x7fa07c341d10>, u'casos': <gensim.models.word2vec.Vocab at 0x7fa07c334c90>, u'castro': <gensim.models.word2vec.Vocab at 0x7fa07b879cd0>, u'catarina': <gensim.models.word2vec.Vocab at 0x7fa07d343e90>, u'categoria': <gensim.models.word2vec.Vocab at 0x7fa07c321b50>, u'caucaia': <gensim.models.word2vec.Vocab at 0x7fa07c931550>, u'causa': <gensim.models.word2vec.Vocab at 0x7fa07c33b410>, u'causados': <gensim.models.word2vec.Vocab at 0x7fa07b3ceb50>, u'causas': <gensim.models.word2vec.Vocab at 0x7fa07c527910>, u'causou': <gensim.models.word2vec.Vocab at 0x7fa07b7ed5d0>, u'ca\xedram': <gensim.models.word2vec.Vocab at 0x7fa07c3dec90>, u'ce': <gensim.models.word2vec.Vocab at 0x7fa07b749ad0>, u'cearense': <gensim.models.word2vec.Vocab at 0x7fa07b41da50>, u'cearenses': <gensim.models.word2vec.Vocab at 0x7fa07b419bd0>, u'cear\xe1': <gensim.models.word2vec.Vocab at 0x7fa07c8281d0>, u'cec\xedlia': <gensim.models.word2vec.Vocab at 0x7fa07b555210>, u'cedo': <gensim.models.word2vec.Vocab at 0x7fa07d762f50>, u'celebrada': <gensim.models.word2vec.Vocab at 0x7fa07b61a310>, u'celular': <gensim.models.word2vec.Vocab at 0x7fa07b336050>, u'celvio\xe0s': <gensim.models.word2vec.Vocab at 0x7fa07d763cd0>, u'cemit\xe9rio': <gensim.models.word2vec.Vocab at 0x7fa07cc5dd90>, u'cena': <gensim.models.word2vec.Vocab at 0x7fa07c334850>, u'cenas': <gensim.models.word2vec.Vocab at 0x7fa07c3d5b90>, u'cenipa': <gensim.models.word2vec.Vocab at 0x7fa07b45a690>, u'centenas': <gensim.models.word2vec.Vocab at 0x7fa07b8c7e50>, u'center': <gensim.models.word2vec.Vocab at 0x7fa07b5a0f50>, 
u'centerplex': <gensim.models.word2vec.Vocab at 0x7fa07b103790>, u'cento': <gensim.models.word2vec.Vocab at 0x7fa07b67ddd0>, u'central': <gensim.models.word2vec.Vocab at 0x7fa07c320890>, u'centro': <gensim.models.word2vec.Vocab at 0x7fa07d32ecd0>, u'centros': <gensim.models.word2vec.Vocab at 0x7fa07b451110>, u'cen\xe1rio': <gensim.models.word2vec.Vocab at 0x7fa07cb1bd90>, u'cen\xe1rios': <gensim.models.word2vec.Vocab at 0x7fa07cb20710>, u'cerca': <gensim.models.word2vec.Vocab at 0x7fa07d32e0d0>, u'cerim\xf4nia': <gensim.models.word2vec.Vocab at 0x7fa07b61a3d0>, u'certa': <gensim.models.word2vec.Vocab at 0x7fa07d31d650>, u'certame': <gensim.models.word2vec.Vocab at 0x7fa07bf22190>, u'certamente': <gensim.models.word2vec.Vocab at 0x7fa07c815510>, u'certeza': <gensim.models.word2vec.Vocab at 0x7fa07c82b9d0>, u'certo': <gensim.models.word2vec.Vocab at 0x7fa07c32bcd0>, u'cessar-fogo': <gensim.models.word2vec.Vocab at 0x7fa07b236f90>, u'cessna': <gensim.models.word2vec.Vocab at 0x7fa07c3205d0>, u'chama': <gensim.models.word2vec.Vocab at 0x7fa07cc791d0>, u'chamada': <gensim.models.word2vec.Vocab at 0x7fa07b7c4890>, u'chamado': <gensim.models.word2vec.Vocab at 0x7fa07d32e590>, u'chamar': <gensim.models.word2vec.Vocab at 0x7fa07c54d490>, u'chamou': <gensim.models.word2vec.Vocab at 0x7fa07b4f9550>, u'chance': <gensim.models.word2vec.Vocab at 0x7fa07c34fcd0>, u'chances': <gensim.models.word2vec.Vocab at 0x7fa07c320f50>, u'chapa': <gensim.models.word2vec.Vocab at 0x7fa07c34f610>, u'chapafoto': <gensim.models.word2vec.Vocab at 0x7fa07c3e4b90>, u'chefe': <gensim.models.word2vec.Vocab at 0x7fa07cc71050>, u'chega': <gensim.models.word2vec.Vocab at 0x7fa07c3ded90>, u'chegada': <gensim.models.word2vec.Vocab at 0x7fa07b6c6fd0>, u'chegam': <gensim.models.word2vec.Vocab at 0x7fa07c353710>, u'chegando': <gensim.models.word2vec.Vocab at 0x7fa07b570cd0>, u'chegar': <gensim.models.word2vec.Vocab at 0x7fa07c33bd50>, u'chegaram': <gensim.models.word2vec.Vocab at 0x7fa07c3879d0>, u'chegou': 
<gensim.models.word2vec.Vocab at 0x7fa07c527850>, u'cheio': <gensim.models.word2vec.Vocab at 0x7fa07cb129d0>, u'chico': <gensim.models.word2vec.Vocab at 0x7fa07d77d310>, u'chile': <gensim.models.word2vec.Vocab at 0x7fa07b70d050>, u'china': <gensim.models.word2vec.Vocab at 0x7fa07b75b190>, u'choque': <gensim.models.word2vec.Vocab at 0x7fa07d75cc50>, u'chuva': <gensim.models.word2vec.Vocab at 0x7fa07b398050>, u'chuvas': <gensim.models.word2vec.Vocab at 0x7fa07b21cdd0>, u'ch\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c53a610>, u'cia': <gensim.models.word2vec.Vocab at 0x7fa07b6531d0>, u'ciclo': <gensim.models.word2vec.Vocab at 0x7fa07c31ce10>, u'cid': <gensim.models.word2vec.Vocab at 0x7fa07b8bc490>, u'cidadania': <gensim.models.word2vec.Vocab at 0x7fa07b739b50>, u'cidade': <gensim.models.word2vec.Vocab at 0x7fa07c3f08d0>, u'cidades': <gensim.models.word2vec.Vocab at 0x7fa07b787bd0>, u'cidad\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b7b1f90>, u'cidad\xe3os': <gensim.models.word2vec.Vocab at 0x7fa07c844f50>, u'cientista': <gensim.models.word2vec.Vocab at 0x7fa07cb1bbd0>, u'cientistas': <gensim.models.word2vec.Vocab at 0x7fa07cb207d0>, u'cient\xedfica': <gensim.models.word2vec.Vocab at 0x7fa07b7d3c90>, u'cima': <gensim.models.word2vec.Vocab at 0x7fa07c52d650>, u'cinco': <gensim.models.word2vec.Vocab at 0x7fa07c337250>, u'cine': <gensim.models.word2vec.Vocab at 0x7fa07b103810>, u'cinegrafista': <gensim.models.word2vec.Vocab at 0x7fa07c387bd0>, u'cinema': <gensim.models.word2vec.Vocab at 0x7fa07b21c390>, u'citados': <gensim.models.word2vec.Vocab at 0x7fa07b8c7110>, u'citation': <gensim.models.word2vec.Vocab at 0x7fa07c320610>, u'citou': <gensim.models.word2vec.Vocab at 0x7fa07c82c610>, u'civil': <gensim.models.word2vec.Vocab at 0x7fa07c31c190>, u'civis': <gensim.models.word2vec.Vocab at 0x7fa07cb1b090>, u'ci\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b653c50>, u'ci\xeancias': <gensim.models.word2vec.Vocab at 0x7fa07c3d5550>, u'clara': 
<gensim.models.word2vec.Vocab at 0x7fa07c385290>, u'claramente': <gensim.models.word2vec.Vocab at 0x7fa07b88bd50>, u'claro': <gensim.models.word2vec.Vocab at 0x7fa07d317950>, u'classe': <gensim.models.word2vec.Vocab at 0x7fa07b831750>, u'cliente': <gensim.models.word2vec.Vocab at 0x7fa07b70e3d0>, u'clientes': <gensim.models.word2vec.Vocab at 0x7fa07b7824d0>, u'clima': <gensim.models.word2vec.Vocab at 0x7fa07c31cad0>, u'clique': <gensim.models.word2vec.Vocab at 0x7fa07b87f110>, u'clube': <gensim.models.word2vec.Vocab at 0x7fa07b7bf810>, u'cl\xe1udio': <gensim.models.word2vec.Vocab at 0x7fa07c321c10>, u'cl\xe9lio': <gensim.models.word2vec.Vocab at 0x7fa07b555ed0>, u'cni': <gensim.models.word2vec.Vocab at 0x7fa07c353650>, u'cobertura': <gensim.models.word2vec.Vocab at 0x7fa07c337790>, u'cobran\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07b678d50>, u'coca\xedna': <gensim.models.word2vec.Vocab at 0x7fa07b16f690>, u'coelho': <gensim.models.word2vec.Vocab at 0x7fa07b8c71d0>, u'coelhofoto': <gensim.models.word2vec.Vocab at 0x7fa07c3e49d0>, u'coisa': <gensim.models.word2vec.Vocab at 0x7fa07c52dbd0>, u'coisas': <gensim.models.word2vec.Vocab at 0x7fa07cc71b90>, u'colega': <gensim.models.word2vec.Vocab at 0x7fa07cc74610>, u'colegas': <gensim.models.word2vec.Vocab at 0x7fa07cb1ded0>, u'coletiva': <gensim.models.word2vec.Vocab at 0x7fa07cb1bad0>, u'coletivo': <gensim.models.word2vec.Vocab at 0x7fa07c353bd0>, u'cole\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b09fc50>, u'coligados': <gensim.models.word2vec.Vocab at 0x7fa07c3663d0>, u'coliga\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d328f10>, u'colis\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b440f10>, u'collor': <gensim.models.word2vec.Vocab at 0x7fa07cb1b650>, u'colo': <gensim.models.word2vec.Vocab at 0x7fa07c3f0710>, u'coloca': <gensim.models.word2vec.Vocab at 0x7fa07c35efd0>, u'colocada': <gensim.models.word2vec.Vocab at 0x7fa07c828450>, u'colocado': <gensim.models.word2vec.Vocab at 0x7fa07d328710>, 
u'colocados': <gensim.models.word2vec.Vocab at 0x7fa07c3dee10>, u'colocar': <gensim.models.word2vec.Vocab at 0x7fa07c32bd10>, u'colocou': <gensim.models.word2vec.Vocab at 0x7fa07c3e5210>, u'coluna': <gensim.models.word2vec.Vocab at 0x7fa07cb17850>, u'colunista': <gensim.models.word2vec.Vocab at 0x7fa07c366750>, u'com': <gensim.models.word2vec.Vocab at 0x7fa07c33dad0>, u'comandante': <gensim.models.word2vec.Vocab at 0x7fa07b6253d0>, u'comando': <gensim.models.word2vec.Vocab at 0x7fa07d343f10>, u'comandou': <gensim.models.word2vec.Vocab at 0x7fa07c3612d0>, u'combate': <gensim.models.word2vec.Vocab at 0x7fa07d77b4d0>, u'combater': <gensim.models.word2vec.Vocab at 0x7fa07d77da90>, u'comboio': <gensim.models.word2vec.Vocab at 0x7fa07b5239d0>, u'comemora': <gensim.models.word2vec.Vocab at 0x7fa07c3e4c10>, u'comemorou': <gensim.models.word2vec.Vocab at 0x7fa07c3eee90>, u'comentar': <gensim.models.word2vec.Vocab at 0x7fa07b5c7090>, u'comentou': <gensim.models.word2vec.Vocab at 0x7fa07cc717d0>, u'coment\xe1rio': <gensim.models.word2vec.Vocab at 0x7fa07c337f50>, u'coment\xe1rios': <gensim.models.word2vec.Vocab at 0x7fa07d763d50>, u'comerciais': <gensim.models.word2vec.Vocab at 0x7fa07b6ea290>, u'comercial': <gensim.models.word2vec.Vocab at 0x7fa07b6da250>, u'come\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07c33bbd0>, u'come\xe7am': <gensim.models.word2vec.Vocab at 0x7fa07b64ed90>, u'come\xe7ando': <gensim.models.word2vec.Vocab at 0x7fa07c33d750>, u'come\xe7ar': <gensim.models.word2vec.Vocab at 0x7fa07c387310>, u'come\xe7aram': <gensim.models.word2vec.Vocab at 0x7fa07c515cd0>, u'come\xe7o': <gensim.models.word2vec.Vocab at 0x7fa07b86cf10>, u'come\xe7ou': <gensim.models.word2vec.Vocab at 0x7fa07c321f90>, u'comiss\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c515110>, u'comitiva': <gensim.models.word2vec.Vocab at 0x7fa07b61a590>, u'comit\xea': <gensim.models.word2vec.Vocab at 0x7fa07c349990>, u'como': <gensim.models.word2vec.Vocab at 0x7fa07c333110>, u'como\xe7\xe3o': 
<gensim.models.word2vec.Vocab at 0x7fa07c33d350>, u'companheira': <gensim.models.word2vec.Vocab at 0x7fa07c38fe10>, u'companheiro': <gensim.models.word2vec.Vocab at 0x7fa07c82c510>, u'companhia': <gensim.models.word2vec.Vocab at 0x7fa07d317fd0>, u'companhias': <gensim.models.word2vec.Vocab at 0x7fa07d317a10>, u'compara\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b7553d0>, u'competi\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b2643d0>, u'compet\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b5bba50>, u'completa': <gensim.models.word2vec.Vocab at 0x7fa07b620910>, u'completamente': <gensim.models.word2vec.Vocab at 0x7fa07cc71790>, u'completo': <gensim.models.word2vec.Vocab at 0x7fa07c387550>, u'complexo': <gensim.models.word2vec.Vocab at 0x7fa07b5756d0>, u'comportamento': <gensim.models.word2vec.Vocab at 0x7fa07c334650>, u'composi\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b86cf50>, u'composto': <gensim.models.word2vec.Vocab at 0x7fa07b50da90>, u'compra': <gensim.models.word2vec.Vocab at 0x7fa07c341810>, u'comprar': <gensim.models.word2vec.Vocab at 0x7fa07c0c5b50>, u'compras': <gensim.models.word2vec.Vocab at 0x7fa07b437550>, u'compromisso': <gensim.models.word2vec.Vocab at 0x7fa07c334050>, u'compromissos': <gensim.models.word2vec.Vocab at 0x7fa07c321250>, u'comum': <gensim.models.word2vec.Vocab at 0x7fa07c3f4550>, u'comunicado': <gensim.models.word2vec.Vocab at 0x7fa07b813a10>, u'comunica\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d760e90>, u'comunidade': <gensim.models.word2vec.Vocab at 0x7fa07c3eebd0>, u'com\xe9dia': <gensim.models.word2vec.Vocab at 0x7fa07c86a490>, u'com\xe9rcio': <gensim.models.word2vec.Vocab at 0x7fa07b6ef850>, u'concedeu': <gensim.models.word2vec.Vocab at 0x7fa07b87f8d0>, u'conceito': <gensim.models.word2vec.Vocab at 0x7fa07c385b10>, u'concentra\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b80b750>, u'concess\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c56c750>, u'concluiu': 
<gensim.models.word2vec.Vocab at 0x7fa07b790cd0>, u'conclus\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b7ac810>, u'concorre': <gensim.models.word2vec.Vocab at 0x7fa07cb17810>, u'concorrente': <gensim.models.word2vec.Vocab at 0x7fa07c3e5290>, u'concorrentes': <gensim.models.word2vec.Vocab at 0x7fa07d77dc50>, u'concorrer': <gensim.models.word2vec.Vocab at 0x7fa07c31ca90>, u'concorr\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b6971d0>, u'concurso': <gensim.models.word2vec.Vocab at 0x7fa07b3c5f50>, u'condenado': <gensim.models.word2vec.Vocab at 0x7fa07b2d4590>, u'condi\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d77d410>, u'condi\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c334a10>, u'condutor': <gensim.models.word2vec.Vocab at 0x7fa07b795110>, u'condu\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c385410>, u'confedera\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b749c50>, u'conferir': <gensim.models.word2vec.Vocab at 0x7fa07b87f150>, u'confian\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07cb42950>, u'confira': <gensim.models.word2vec.Vocab at 0x7fa07b7ac190>, u'confirmada': <gensim.models.word2vec.Vocab at 0x7fa07c82b310>, u'confirmar': <gensim.models.word2vec.Vocab at 0x7fa07c82bd50>, u'confirmaram': <gensim.models.word2vec.Vocab at 0x7fa07b528750>, u'confirma\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c519d90>, u'confirmou': <gensim.models.word2vec.Vocab at 0x7fa07c84be50>, u'conflito': <gensim.models.word2vec.Vocab at 0x7fa07b692a50>, u'conflitos': <gensim.models.word2vec.Vocab at 0x7fa07cc89e10>, u'conforme': <gensim.models.word2vec.Vocab at 0x7fa07c84b910>, u'conforta': <gensim.models.word2vec.Vocab at 0x7fa07c361890>, u'confronto': <gensim.models.word2vec.Vocab at 0x7fa07c2a5c90>, u'confrontos': <gensim.models.word2vec.Vocab at 0x7fa07b4efd10>, u'congresso': <gensim.models.word2vec.Vocab at 0x7fa07c361b50>, u'conhece': <gensim.models.word2vec.Vocab at 0x7fa07b8c73d0>, u'conhecer': <gensim.models.word2vec.Vocab at 
0x7fa07b661190>, u'conhecida': <gensim.models.word2vec.Vocab at 0x7fa07c339390>, u'conhecido': <gensim.models.word2vec.Vocab at 0x7fa07c33d090>, u'conhecidos': <gensim.models.word2vec.Vocab at 0x7fa07b7a2990>, u'conhecimento': <gensim.models.word2vec.Vocab at 0x7fa07c52db10>, u'conhecimentos': <gensim.models.word2vec.Vocab at 0x7fa07b40a050>, u'conjunto': <gensim.models.word2vec.Vocab at 0x7fa07c353250>, u'conquistar': <gensim.models.word2vec.Vocab at 0x7fa07b4a7890>, u'consci\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07c52d910>, u'consecutivo': <gensim.models.word2vec.Vocab at 0x7fa07c527e90>, u'consegue': <gensim.models.word2vec.Vocab at 0x7fa07c51e550>, u'conseguido': <gensim.models.word2vec.Vocab at 0x7fa07cb12790>, u'conseguir': <gensim.models.word2vec.Vocab at 0x7fa07cb12290>, u'conseguiram': <gensim.models.word2vec.Vocab at 0x7fa07b32b750>, u'conseguiu': <gensim.models.word2vec.Vocab at 0x7fa08c6a7d50>, u'conselho': <gensim.models.word2vec.Vocab at 0x7fa07b85de90>, u'conselhos': <gensim.models.word2vec.Vocab at 0x7fa07b842790>, u'consenso': <gensim.models.word2vec.Vocab at 0x7fa07b79b890>, u'consequ\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b81f510>, u'consequ\xeancias': <gensim.models.word2vec.Vocab at 0x7fa07cc71990>, u'conservadores': <gensim.models.word2vec.Vocab at 0x7fa07d75c550>, u'considera': <gensim.models.word2vec.Vocab at 0x7fa07b848850>, u'considerada': <gensim.models.word2vec.Vocab at 0x7fa07c527110>, u'considerado': <gensim.models.word2vec.Vocab at 0x7fa07b5ce210>, u'consideram': <gensim.models.word2vec.Vocab at 0x7fa07c4014d0>, u'considerando': <gensim.models.word2vec.Vocab at 0x7fa07b862550>, u'considerar': <gensim.models.word2vec.Vocab at 0x7fa07b768410>, u'consigo': <gensim.models.word2vec.Vocab at 0x7fa07b762b90>, u'constitui\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b74f5d0>, u'construir': <gensim.models.word2vec.Vocab at 0x7fa07c828510>, u'constru\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c844690>, 
u'constru\xeddo': <gensim.models.word2vec.Vocab at 0x7fa07c334c50>, u'consulta': <gensim.models.word2vec.Vocab at 0x7fa07b86c310>, u'consultoria': <gensim.models.word2vec.Vocab at 0x7fa07b879d10>, u'consumidor': <gensim.models.word2vec.Vocab at 0x7fa07b755ed0>, u'consumidores': <gensim.models.word2vec.Vocab at 0x7fa07c2c2810>, u'consumo': <gensim.models.word2vec.Vocab at 0x7fa07b6ef110>, u'conta': <gensim.models.word2vec.Vocab at 0x7fa07cb12dd0>, u'contar': <gensim.models.word2vec.Vocab at 0x7fa07c32be50>, u'contar\xe1': <gensim.models.word2vec.Vocab at 0x7fa07b7b79d0>, u'contas': <gensim.models.word2vec.Vocab at 0x7fa07c829310>, u'contato': <gensim.models.word2vec.Vocab at 0x7fa07cc61590>, u'conter': <gensim.models.word2vec.Vocab at 0x7fa07c32b190>, u'contexto': <gensim.models.word2vec.Vocab at 0x7fa07c321ad0>, u'conte\xfado': <gensim.models.word2vec.Vocab at 0x7fa07b874b90>, u'continente': <gensim.models.word2vec.Vocab at 0x7fa07b2644d0>, u'continua': <gensim.models.word2vec.Vocab at 0x7fa07d32c790>, u'continuam': <gensim.models.word2vec.Vocab at 0x7fa07b813310>, u'continuar': <gensim.models.word2vec.Vocab at 0x7fa07d763b10>, u'continuar\xe1': <gensim.models.word2vec.Vocab at 0x7fa07c0c5d50>, u'continue': <gensim.models.word2vec.Vocab at 0x7fa07b862fd0>, u'continuidade': <gensim.models.word2vec.Vocab at 0x7fa07b8c2e10>, u'continuou': <gensim.models.word2vec.Vocab at 0x7fa07cc5d990>, u'contou': <gensim.models.word2vec.Vocab at 0x7fa07b81f410>, u'contra': <gensim.models.word2vec.Vocab at 0x7fa07c33b910>, u'contradi\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c34f7d0>, u'contrapontopig': <gensim.models.word2vec.Vocab at 0x7fa07d763d90>, u'contrata\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b42fad0>, u'contrato': <gensim.models.word2vec.Vocab at 0x7fa07b7f8c90>, u'contratos': <gensim.models.word2vec.Vocab at 0x7fa07b7e7450>, u'contribui\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b653090>, u'contributing': <gensim.models.word2vec.Vocab at 
0x7fa07c5195d0>, u'controlada': <gensim.models.word2vec.Vocab at 0x7fa07b59b490>, u'controlar': <gensim.models.word2vec.Vocab at 0x7fa07b269850>, u'controle': <gensim.models.word2vec.Vocab at 0x7fa07cc61550>, u'contr\xe1rio': <gensim.models.word2vec.Vocab at 0x7fa07d7600d0>, u'contudo': <gensim.models.word2vec.Vocab at 0x7fa07cb209d0>, u'conven\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c84be90>, u'conversa': <gensim.models.word2vec.Vocab at 0x7fa07c361f10>, u'conversar': <gensim.models.word2vec.Vocab at 0x7fa07d7608d0>, u'conversas': <gensim.models.word2vec.Vocab at 0x7fa07c515c90>, u'conversou': <gensim.models.word2vec.Vocab at 0x7fa07c366590>, u'convic\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c529190>, u'convite': <gensim.models.word2vec.Vocab at 0x7fa07c38fdd0>, u'conviv\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07c82c410>, u'coordenador': <gensim.models.word2vec.Vocab at 0x7fa07d31d050>, u'coordenadora': <gensim.models.word2vec.Vocab at 0x7fa07b813990>, u'coordenadores': <gensim.models.word2vec.Vocab at 0x7fa07c32b2d0>, u'coordena\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07c532650>, u'copa': <gensim.models.word2vec.Vocab at 0x7fa07c3d5450>, u'cor': <gensim.models.word2vec.Vocab at 0x7fa07b5b55d0>, u'coragem': <gensim.models.word2vec.Vocab at 0x7fa07cc5dbd0>, u'cora\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07cb12e90>, u'corinthians': <gensim.models.word2vec.Vocab at 0x7fa07b3a5c10>, u'coronel': <gensim.models.word2vec.Vocab at 0x7fa07b633990>, u'corpo': <gensim.models.word2vec.Vocab at 0x7fa07c387c90>, u'corpos': <gensim.models.word2vec.Vocab at 0x7fa07b55ca90>, u'corre': <gensim.models.word2vec.Vocab at 0x7fa07b768350>, u'correio': <gensim.models.word2vec.Vocab at 0x7fa07b7b7510>, u'correios': <gensim.models.word2vec.Vocab at 0x7fa07b57f310>, u'correligion\xe1rios': <gensim.models.word2vec.Vocab at 0x7fa07d7aa250>, u'corrente': <gensim.models.word2vec.Vocab at 0x7fa07b4eac50>, u'corrida': <gensim.models.word2vec.Vocab at 
0x7fa07c339290>, u'corrup\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b8cca90>, u'corte': <gensim.models.word2vec.Vocab at 0x7fa07b776c10>, u'cortejo': <gensim.models.word2vec.Vocab at 0x7fa07b8317d0>, u'costa': <gensim.models.word2vec.Vocab at 0x7fa07c31c210>, u'costuma': <gensim.models.word2vec.Vocab at 0x7fa07b842ed0>, u'cotado': <gensim.models.word2vec.Vocab at 0x7fa07c8290d0>, u'cota\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07bf952d0>, u'couvert': <gensim.models.word2vec.Vocab at 0x7fa07b094d90>, u'cozinha': <gensim.models.word2vec.Vocab at 0x7fa07b713b90>, u'crato': <gensim.models.word2vec.Vocab at 0x7fa07c353d10>, u'credibilidade': <gensim.models.word2vec.Vocab at 0x7fa07b7ca390>, u'cresce': <gensim.models.word2vec.Vocab at 0x7fa07b7e1c10>, u'crescendo': <gensim.models.word2vec.Vocab at 0x7fa07c349f90>, u'crescer': <gensim.models.word2vec.Vocab at 0x7fa07cb24510>, u'cresceu': <gensim.models.word2vec.Vocab at 0x7fa07b6e0990>, u'crescimento': <gensim.models.word2vec.Vocab at 0x7fa07c53a410>, u'criada': <gensim.models.word2vec.Vocab at 0x7fa07b7b1690>, u'criado': <gensim.models.word2vec.Vocab at 0x7fa07c515a10>, u'crian\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07b6972d0>, u'crian\xe7as': <gensim.models.word2vec.Vocab at 0x7fa07b488090>, u'criar': <gensim.models.word2vec.Vocab at 0x7fa07c334290>, u'cria\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07b8253d0>, u'crime': <gensim.models.word2vec.Vocab at 0x7fa07b842850>, u'crimes': <gensim.models.word2vec.Vocab at 0x7fa07b63fc50>, u'criou': <gensim.models.word2vec.Vocab at 0x7fa07d3289d0>, u'crise': <gensim.models.word2vec.Vocab at 0x7fa07d32e850>, u'cristina': <gensim.models.word2vec.Vocab at 0x7fa07b1d4090>, u'criticar': <gensim.models.word2vec.Vocab at 0x7fa07c3f4f10>, u'criticou': <gensim.models.word2vec.Vocab at 0x7fa07cb1d610>, u'cruz': <gensim.models.word2vec.Vocab at 0x7fa07bfcd710>, u'cruz/abr': <gensim.models.word2vec.Vocab at 0x7fa07c361ed0>, u'cruz/abrlula': 
<gensim.models.word2vec.Vocab at 0x7fa07c3d57d0>, u'cruz/abrmarina': <gensim.models.word2vec.Vocab at 0x7fa07c361ad0>, u'cr\xe9dito': <gensim.models.word2vec.Vocab at 0x7fa07d3285d0>, u'cr\xedtica': <gensim.models.word2vec.Vocab at 0x7fa07b885250>, u'cr\xedticas': <gensim.models.word2vec.Vocab at 0x7fa07c81f850>, u'cr\xf4nica': <gensim.models.word2vec.Vocab at 0x7fa07b7f8110>, u'cuidar': <gensim.models.word2vec.Vocab at 0x7fa07d31d0d0>, u'cuja': <gensim.models.word2vec.Vocab at 0x7fa07c33d250>, u'cujo': <gensim.models.word2vec.Vocab at 0x7fa07c3410d0>, u'culpa': <gensim.models.word2vec.Vocab at 0x7fa07b87f7d0>, u'cultura': <gensim.models.word2vec.Vocab at 0x7fa07c0c5bd0>, u'culturais': <gensim.models.word2vec.Vocab at 0x7fa07b44ab90>, u'cultural': <gensim.models.word2vec.Vocab at 0x7fa07d760f10>, u'cumpre': <gensim.models.word2vec.Vocab at 0x7fa07b7f36d0>, u'cumprimenta': <gensim.models.word2vec.Vocab at 0x7fa07c3eec90>, u'cumprimento': <gensim.models.word2vec.Vocab at 0x7fa07b44a350>, u'cumprir': <gensim.models.word2vec.Vocab at 0x7fa07c337b90>, u'cumpriu': <gensim.models.word2vec.Vocab at 0x7fa07b5e4210>, u'cunha': <gensim.models.word2vec.Vocab at 0x7fa07cc712d0>, u'curitiba': <gensim.models.word2vec.Vocab at 0x7fa07c3de510>, u'curso': <gensim.models.word2vec.Vocab at 0x7fa07b8c7810>, u'cursos': <gensim.models.word2vec.Vocab at 0x7fa07b3edb50>, u'curto': <gensim.models.word2vec.Vocab at 0x7fa07b67dfd0>, u'custa': <gensim.models.word2vec.Vocab at 0x7fa07d328910>, u'custo': <gensim.models.word2vec.Vocab at 0x7fa07cb20490>, u'custos': <gensim.models.word2vec.Vocab at 0x7fa07b700a90>, u'c\xe1': <gensim.models.word2vec.Vocab at 0x7fa07b85d5d0>, u'c\xe1lculo': <gensim.models.word2vec.Vocab at 0x7fa07b7137d0>, u'c\xe2mara': <gensim.models.word2vec.Vocab at 0x7fa07c33bf90>, u'c\xe2mbio': <gensim.models.word2vec.Vocab at 0x7fa07b7ca250>, u'c\xe2meras': <gensim.models.word2vec.Vocab at 0x7fa07b5453d0>, u'c\xe2ncer': <gensim.models.word2vec.Vocab at 0x7fa07c5651d0>, 
u'c\xe9u': <gensim.models.word2vec.Vocab at 0x7fa07b555850>, u'c\xedcero': <gensim.models.word2vec.Vocab at 0x7fa07c3f0fd0>, u'c\xf3digo': <gensim.models.word2vec.Vocab at 0x7fa07b885890>, u'c\xfapula': <gensim.models.word2vec.Vocab at 0x7fa07d762d10>, u'd': <gensim.models.word2vec.Vocab at 0x7fa07b6a4650>, u'da': <gensim.models.word2vec.Vocab at 0x7fa07cb17250>, u'dada': <gensim.models.word2vec.Vocab at 0x7fa07d317690>, u'dado': <gensim.models.word2vec.Vocab at 0x7fa07c3f4610>, u'dados': <gensim.models.word2vec.Vocab at 0x7fa07c82b350>, u'dando': <gensim.models.word2vec.Vocab at 0x7fa07b8cc850>, u'daniel': <gensim.models.word2vec.Vocab at 0x7fa07b61a690>, u'dantas': <gensim.models.word2vec.Vocab at 0x7fa07b66b590>, u'dan\xe7a': <gensim.models.word2vec.Vocab at 0x7fa07c3ee290>, u'daquele': <gensim.models.word2vec.Vocab at 0x7fa07b8c7f50>, u'daqueles': <gensim.models.word2vec.Vocab at 0x7fa07cc5db90>, u'daqui': <gensim.models.word2vec.Vocab at 0x7fa07cc71910>, u'dar': <gensim.models.word2vec.Vocab at 0x7fa07c334910>, u'dar\xe1': <gensim.models.word2vec.Vocab at 0x7fa07b7b7850>, u'das': <gensim.models.word2vec.Vocab at 0x7fa07c81f5d0>, u'data': <gensim.models.word2vec.Vocab at 0x7fa07c82b210>, u'datafolha': <gensim.models.word2vec.Vocab at 0x7fa07cb12650>, u'davi': <gensim.models.word2vec.Vocab at 0x7fa07b3c3850>, u'david': <gensim.models.word2vec.Vocab at 0x7fa07c261550>, u'da\xed': <gensim.models.word2vec.Vocab at 0x7fa07b825050>, u'de': <gensim.models.word2vec.Vocab at 0x7fa07c349710>, u'debate': <gensim.models.word2vec.Vocab at 0x7fa07d7821d0>, u'debater': <gensim.models.word2vec.Vocab at 0x7fa07c2a6fd0>, u'debates': <gensim.models.word2vec.Vocab at 0x7fa07d784b50>, u'decidir': <gensim.models.word2vec.Vocab at 0x7fa07c3f46d0>, u'decidiu': <gensim.models.word2vec.Vocab at 0x7fa07d762810>, u'decis\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07cb1bf90>, u'decis\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07c365050>, u'declarado': <gensim.models.word2vec.Vocab at 
0x7fa07b805b90>, u'declara\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07d32ea50>, u'declara\xe7\xf5es': <gensim.models.word2vec.Vocab at 0x7fa07cb20a50>, u'declarou': <gensim.models.word2vec.Vocab at 0x7fa07d32ec50>, u'decolou': <gensim.models.word2vec.Vocab at 0x7fa07cb42e90>, u'defende': <gensim.models.word2vec.Vocab at 0x7fa07cb1b1d0>, u'defender': <gensim.models.word2vec.Vocab at 0x7fa07c52d210>, u'defendeu': <gensim.models.word2vec.Vocab at 0x7fa07cb1d890>, u'defesa': <gensim.models.word2vec.Vocab at 0x7fa07d77da10>, u'defici\xeancia': <gensim.models.word2vec.Vocab at 0x7fa07b63a8d0>, u'define': <gensim.models.word2vec.Vocab at 0x7fa07c829fd0>, u'definido': <gensim.models.word2vec.Vocab at 0x7fa07b795f50>, u'definir': <gensim.models.word2vec.Vocab at 0x7fa07c527410>, u'definiu': <gensim.models.word2vec.Vocab at 0x7fa07c38f8d0>, u'defini\xe7\xe3o': <gensim.models.word2vec.Vocab at 0x7fa07cc89790>, u'deixa': <gensim.models.word2vec.Vocab at 0x7fa07c8282d0>, u'deixam': <gensim.models.word2vec.Vocab at 0x7fa07c82b2d0>, u'deixando': <gensim.models.word2vec.Vocab at 0x7fa07d782f50>, u'deixar': <gensim.models.word2vec.Vocab at 0x7fa07c5290d0>, u'deixaram': <gensim.models.word2vec.Vocab at 0x7fa07b8cc310>, u'deixe': <gensim.models.word2vec.Vocab at 0x7fa07cb24690>, u'deixou': <gensim.models.word2vec.Vocab at 0x7fa07c33b050>, ...}
# Fit a Dirichlet Process Gaussian mixture with up to 10 diagonal-covariance
# components on model.syn0, capped at 5 iterations.
# NOTE(review): model.syn0 is presumably the word2vec embedding matrix, one
# row per vocabulary word -- confirm against the cell that trains `model`.
dpgmm = mixture.DPGMM(n_components=10,n_iter=5, covariance_type='diag')
# Left as the last expression so the notebook displays the fitted estimator.
dpgmm.fit(model.syn0)
DPGMM(alpha=1.0, covariance_type='diag', init_params='wmc', min_covar=None, n_components=10, n_iter=5, params='wmc', random_state=<mtrand.RandomState object at 0x7fa0bae71af8>, thresh=0.01, verbose=False)
# Display the convergence flag of the DPGMM fit; the recorded run shows False.
dpgmm.converged_
False
# Component index predicted for every row of model.syn0, drawn in row order.
# The trailing semicolon suppresses the notebook's echo of the return value.
plot(dpgmm.predict(model.syn0));
# Parenthesised so the line is valid both as a Python 2 print statement and
# as the print function (the notebook later imports print_function); the
# printed text is the same in both cases.
print(dpgmm.means_.shape)
# Transposed so that each mixture component's mean vector is drawn as one line.
plot(dpgmm.means_.T);
(10, 5000)
# Fit a plain Gaussian mixture with 3 diagonal-covariance components on the
# same matrix that was given to the DPGMM.
gmm = mixture.GMM(n_components=3, covariance_type='diag')
# Left as the last expression so the notebook displays the fitted estimator.
gmm.fit(model.syn0)
GMM(covariance_type='diag', init_params='wmc', min_covar=0.001, n_components=3, n_init=1, n_iter=100, params='wmc', random_state=None, thresh=0.01)
# Parenthesised so the line is valid both as a Python 2 print statement and
# as the print function (the notebook later imports print_function); the
# printed text is the same in both cases.
print(gmm.means_.shape)
# Transposed so that each mixture component's mean vector is drawn as one line.
plot(gmm.means_.T);
(3, 5000)
# Component index predicted by the GMM for every row of model.syn0, drawn in
# row order; the trailing semicolon suppresses the echoed return value.
plot(gmm.predict(model.syn0));
# Display the convergence flag of the GMM fit; the recorded run shows True.
gmm.converged_
True
# IPython help query (not plain Python syntax): shows the documentation of
# mixture.GMM.
mixture.GMM?
from __future__ import print_function
from pprint import pprint
from time import time
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
# Turn the documents into a sparse TF-IDF matrix: tokens are hashed into a
# fixed number of columns (un-normalised, non-binary counts), then the counts
# are re-weighted by TfidfTransformer. The whole step is timed.
t0 = time()
hasher = HashingVectorizer(
    n_features=10000,
    stop_words=sw,
    non_negative=True,
    norm=None,
    binary=False)
vectorizer = make_pipeline(hasher, TfidfTransformer())
X = vectorizer.fit_transform(docs)
elapsed = time() - t0
print("done in %fs" % elapsed)
# X.shape is a 2-tuple, so it fills both %d placeholders directly.
print("n_samples: %d, n_features: %d" % X.shape)
print()
done in 29.694485s n_samples: 6615, n_features: 10000
# Number of clusters requested from KMeans (also read by the plotting cell).
n_clusters = 3
# NOTE(review): this rebinds `dpgmm` to a new, unfitted model. Its fit on X is
# disabled below, so the DPGMM fitted earlier in the notebook is no longer
# reachable under this name.
dpgmm = mixture.DPGMM(covariance_type='diag')
km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1,
            verbose=True, n_jobs=8)
#dpgmm.fit(X)
# Placeholder labels, one per row of X. The original used X.shape[0]-1, which
# produced one label fewer than there are samples and would trigger a length
# mismatch wherever the labels are paired with per-sample results.
labels = range(X.shape[0])
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
print("Top terms per cluster:")
# For each centroid, the feature indices sorted from largest to smallest
# weight. Nothing in this cell prints them after the header above.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1, n_jobs=8, precompute_distances=True, random_state=None, tol=0.0001, verbose=True) done in 0.395s Top terms per cluster: Initialization complete Iteration 0, inertia 5680.000 Iteration 1, inertia 5610.934 Converged at iteration 1
# Pull the fitted KMeans results into plain names for the plotting code.
k_means_labels = km.labels_
k_means_cluster_centers = km.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

# KMeans
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
# One marker per cluster, paired with a colour; stops at whichever of
# n_clusters / len(colors) is smaller.
for k, col in enumerate(colors[:n_clusters]):
    # Boolean mask of the samples assigned to cluster k; only the disabled
    # scatter call below would use it.
    my_members = (k_means_labels == k)
    cluster_center = k_means_cluster_centers[k]
    #plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=col, marker='.')
    # Draw the centroid using only its first two feature coordinates.
    plot(
        cluster_center[0], cluster_center[1], 'o',
        markerfacecolor=col, markeredgecolor='k', markersize=6)
title('KMeans')
#set_xticks(())
#set_yticks(())
# NOTE(review): the text anchor (-3.5, 1.8) is hard-coded -- confirm it falls
# inside the plotted range for this data.
pylab.text(-3.5, 1.8, 'inertia: %f' % (km.inertia_));
<matplotlib.figure.Figure at 0x7fa07c9248d0>
# IPython help query (not plain Python syntax): shows the documentation of
# mixture.DPGMM.
mixture.DPGMM?
# Display the (n_samples, n_features) shape of the TF-IDF matrix.
X.shape
(6615, 10000)
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)