%load_ext autoreload
%autoreload 2
from IPython.display import Image
from graphviz import Digraph
import wikipedia_ner.parse as wikipedia_ner
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-e2e2a3ef45df> in <module>()
      3 from IPython.display import Image
      4 from graphviz import Digraph
----> 5 import wikipedia_ner.parse as wikipedia_ner

/Users/jonathanraiman/Desktop/Coding/python_packages/wikipedia_ner/wikipedia_ner/parse/__init__.py in <module>()
      3 """
      4 
----> 5 from .parse import parse_dump
      6 from .dump_result import DumpResult
      7 from .pages import ParsedPage, ParsedPageChild, ParsedPageParent

/Users/jonathanraiman/Desktop/Coding/python_packages/wikipedia_ner/wikipedia_ner/parse/parse.py in <module>()
----> 1 from epub_conversion import convert_wiki_to_lines
      2 from epub_conversion.wiki_decoder import almost_smart_open
      3 from .utils import line_converter
      4 from .dump_result import DumpResult
      5 

/usr/local/lib/python3.4/site-packages/epub_conversion/__init__.py in <module>()
      6 """
      7 
----> 8 from .converter import Converter
      9 from .wiki_decoder import convert_wiki_to_lines, convert_wiki_to_corpus
     10 

/usr/local/lib/python3.4/site-packages/epub_conversion/converter.py in <module>()
----> 1 from .utils import get_files_from_path, convert_epub_to_lines, convert_lines_to_text, open_book
      2 import gzip
      3 
      4 class Converter(object):
      5     """

/usr/local/lib/python3.4/site-packages/epub_conversion/utils.py in <module>()
      1 import os
----> 2 from xml_cleaner import to_raw_text
      3 from epub import open_epub, BadEpubFile
      4 from zipfile import BadZipFile
      5 

/usr/local/lib/python3.4/site-packages/xml_cleaner/__init__.py in <module>()
     12 pyximport.install()
     13 
---> 14 from .wiki_markup_processing import to_raw_text, remove_brackets
     15 from .word_tokenizer import split_sentences, split_and_group_sentences, split_punct

ImportError: No module named 'xml_cleaner.wiki_markup_processing'
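The missing module here is a Cython extension of xml_cleaner that pyximport never managed to build. One plausible remedy, purely a guess about this environment rather than a verified fix, is to make sure Cython is available and reinstall the package:
# Assumption: the extension failed to compile because Cython was unavailable;
# these two commands are a guess at a fix, not a confirmed remedy.
!pip install cython
!pip install --force-reinstall xml_cleaner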
def update_graph(graph, dict_node, code=None, name=None):
    """Recursively add the nested dict `dict_node` to `graph` as a tree."""
    if code is None:
        code = []
    if name is None and len(dict_node) > 1:
        name = 'ROOT'
    elif name is None and len(dict_node) == 1:
        # a single top-level key is promoted to the root instead of 'ROOT'
        only_key = next(iter(dict_node))
        return update_graph(graph, dict_node[only_key], code, only_key)
    # the path of child indices from the root uniquely identifies each node
    graph.node(str(code), "%s %r" % (name, code))
    # connect to the parent: the same path minus the final step
    if len(code) > 0:
        graph.edge(str(code[:-1]), str(code))
    for i, subkey in enumerate(dict_node.keys()):
        update_graph(graph, dict_node[subkey], code + [i], subkey)
    return graph
An example of the kind of tree structure found in Wikipedia's category hierarchy or in a taxonomy:
graph = Digraph(comment='The Round Table', format='png')
update_graph(graph, {
    "World": {
        "News": {
            "World news": {},
            "Tech news": {}
        },
        "Science": {
            "solar": {},
            "CS": {}
        },
        "Religion": {
            "Temples": {},
            "Study": {}
        },
        "Politics": {
            "USA Politics": {},
            "French Politics": {}
        }
    }
})
Image(graph.render('test-output/round-table', view=True))
Here we extract Wikipedia articles that are not special pages, redirections, or disambiguation pages. From those pages we then pull out passages of text containing links to other articles, which serve as training examples for named entity recognition over a hierarchical softmax tree (a minimal sketch of that factorization follows the parsing step below).
parseresult = wikipedia_ner.parse_dump("/Users/jonathanraiman/Desktop/Coding/enwiki.bz2",
                                       max_articles=200)
200 articles seen so far. Processing 35.408 articles / s : position 9851453
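"Hierarchical softmax tree" above means the probability of predicting a particular page factors into a sequence of binary branch decisions along the page's path from the root. The sketch below is illustrative only: `leaf_probability`, the node indices, and the dimensions are invented here and are not part of wikipedia_ner's API.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def leaf_probability(hidden, path_nodes, path_directions, node_vectors):
    """Probability of one leaf (one article) under a hierarchical softmax.

    hidden          : feature vector for the mention's context.
    path_nodes      : internal-node indices from the root down to the leaf.
    path_directions : +1 / -1 for the branch taken at each internal node.
    node_vectors    : one learned weight vector per internal node.
    """
    prob = 1.0
    for node, direction in zip(path_nodes, path_directions):
        # every internal node acts as a binary classifier over its two branches
        prob *= sigmoid(direction * node_vectors[node].dot(hidden))
    return prob

rng = np.random.RandomState(0)
node_vectors = rng.randn(7, 5)  # 7 internal nodes, 5-dimensional features
hidden = rng.randn(5)
# made-up path: three branch decisions from the root down to some leaf article
print(leaf_probability(hidden, [0, 2, 5], [+1, -1, +1], node_vectors))
With a balanced tree this turns one softmax over every article into a logarithmic number of binary decisions per prediction.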
most_common_category = wikipedia_ner.ParsedPage.categories_counter.most_common(1)[0][0]
print("In '%s' the children are %r" % (most_common_category, ", ".join([parseresult.index2target[child] for child in list(wikipedia_ner.ParsedPage.categories[most_common_category].children)])))
In 'Category : Member states of the United Nations' the children are 'Afghanistan, Algeria, Andorra, Antigua and Barbuda, Azerbaijan, Angola, Albania'
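To close the loop, the same `update_graph` helper from the top of the notebook can draw this category structure. The snippet below is a sketch under two assumptions not verified against wikipedia_ner's API: that every entry of `ParsedPage.categories` exposes the `children` set of target indices used above, and that there is no deeper nesting to recurse into.
# Sketch: turn each category into a one-level subtree whose leaves are the
# titles of its child articles, then reuse update_graph to render the forest.
category_tree = {
    cat_name: {parseresult.index2target[child]: {} for child in cat.children}
    for cat_name, cat in wikipedia_ner.ParsedPage.categories.items()
}
graph = Digraph(comment='Wikipedia categories', format='png')
Image(update_graph(graph, category_tree).render('test-output/categories', view=True))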