import os
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map
from IPython.core.display import display, HTML
# O Whoosh é uma biblioteca em Python para indexação de textos.
# Imported here because the file-level import only brings in create_in/open_dir.
from whoosh.index import exists_in

# Analyzer chain: StemmingAnalyzer output is passed through a CharsetFilter
# built from accent_map, so accented and unaccented spellings index alike.
my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

# Both fields are stored so they can be read back from search hits.
# NOTE(review): `tid` holds a document identifier but is declared as TEXT
# (tokenized); Whoosh's ID field type may be a better fit -- left unchanged
# so an already-built index stays compatible.
schema = Schema(
    content=TEXT(stored=True, analyzer=my_analyzer),
    tid=TEXT(stored=True),
)

index_dir = 'indexdir'

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(index_dir, exist_ok=True)

# Test for an actual index rather than just the directory: a directory that
# exists but holds no index would make open_dir() fail.
if exists_in(index_dir):
    ix = open_dir(index_dir)
else:
    ix = create_in(index_dir, schema)
import nltk
from nltk.corpus import machado
# Read the raw text of every file in the NLTK `machado` corpus once; the
# same strings are reused below instead of calling machado.raw() again.
fileids = machado.fileids()
textos = [machado.raw(fid) for fid in fileids]

# Populate the index only when it holds no documents. The index persists on
# disk between runs, so unconditionally calling add_document() again would
# store every text a second time and produce duplicate search hits.
if ix.doc_count() == 0:
    # As a context manager the writer commits on normal exit and cancels
    # (releasing the index write lock) if an exception is raised.
    with ix.writer() as writer:
        for tid, texto in zip(fileids, textos):
            writer.add_document(content=texto, tid=tid)
from whoosh.qparser import QueryParser
# Free-text query, parsed against the "content" field of the index schema.
qw = 'cafe com leite'
qp = QueryParser("content", ix.schema)
query = qp.parse(qw)

# The searcher is held open for the whole loop because highlights() is
# called on each hit while iterating the results.
with ix.searcher() as searcher:
    results = searcher.search(query)
    for hit in results:
        tid = hit['tid']
        # Show only the last path component of the id; rsplit(...)[-1] also
        # works for ids without a "/" (the previous split("/")[1] raised
        # IndexError in that case).
        titulo = tid.rsplit("/", 1)[-1]
        # The closing tag was previously emitted as "</b" (missing ">"),
        # which produced malformed HTML.
        display(HTML(f'<b>{titulo}: </b>' + hit.highlights("content")))