For this tutorial we will use data from the project "Quantitative Historical Linguistics". The website of the project provides a ZIP package of GrAF/XML files for the printed sources that were digitized within the project:
http://www.quanthistling.info/data/downloads/xml/data.zip
The ZIP package contains several files encoded as described in the ISO standard 24612 "Linguistic annotation framework (LAF)". The QuantHistLing data contains dictionary and wordlist sources. Those were first tokenized into entries; for each entry you will find annotations for at least the head word(s) ("head" annotation) and translation(s) ("translation" annotation) in the case of dictionaries. We will only use the dictionaries of the "Witotoan" component in this tutorial. The ZIP package also contains a CSV file "sources.csv" with some information for each source, for example the languages as ISO codes, type of source, etc. Be aware that the ZIP package contains a filtered version of the sources: only entries that contain a Spanish word that is part of the Spanish Swadesh list are included in the download package.
For a simple example of how to parse one of the sources, please see here:
http://graf-python.readthedocs.org/en/latest/Querying%20GrAF%20graphs.html
import os
import sys
import csv
import codecs
import collections
import glob
import graf
import scipy.sparse
import networkx
In the first step we download and extract the data. You may change to a local "tmp" directory before the download or just download the data to the current working directory. For this you need to install the Python library requests
. You may also download and extract the data manually; the data is only downloaded for you if the file sources.csv
is not found.
# Change into the working directory and download/extract the data package
# if it is not already present.
# NOTE(review): hard-coded absolute path from the original author's
# machine -- adjust (or remove the chdir) for your own environment.
os.chdir("/Users/pbouda/Projects/git-github/notebooks/polysemy")
if not os.path.exists("sources.csv"):
    import requests
    import zipfile
    r = requests.get(
        "http://www.quanthistling.info/data/downloads/xml/data.zip")
    # Fail early on HTTP errors instead of silently writing an error
    # page to disk as "data.zip".
    r.raise_for_status()
    with open("data.zip", "wb") as f:
        f.write(r.content)
    # Close the archive after extraction instead of leaking the handle.
    with zipfile.ZipFile("data.zip") as z:
        z.extractall()
Now we open the file "sources.csv" and read out all the sources that are part of the component "Witotoan" and that are dictionaries. We will store a list of those sources in dict_sources
:
# Read "sources.csv" (tab-separated) and collect the ids of all sources
# whose type column says "dictionary". The first column holds the source
# id ("QLCID" in the header row), the second column the source type.
# Fix: the original passed an anonymous open() to csv.reader and never
# closed the file handle; use a context manager instead.
dict_sources = list()
with open("sources.csv", "rU") as sources_file:
    sources = csv.reader(sources_file, delimiter="\t")
    for source in sources:
        # Skip the header row; keep dictionary sources only.
        if source[0] != "QLCID" and source[1] == "dictionary":
            dict_sources.append(source[0])
Next we define a helper function that transforms a GrAF graph into a networkx graph. For this we traverse the graph by querying for all entries. For each entry we look for connected nodes that have a "head" or "translation" annotation. All of those nodes that are Spanish are stored in the list spa
. All non-Spanish annotations are stored in others
. In the end the collected annotations are added to the new networkx graph, and each Spanish node is connected to all the other nodes for each entry:
import unicodedata

# Python 2/3 compatibility: xrange/unichr only exist in Python 2.
try:
    _xrange, _unichr = xrange, unichr
except NameError:
    _xrange, _unichr = range, chr

# Translation table mapping every Unicode punctuation (P*) and symbol
# (S*) code point to None, so that unicode.translate() deletes them.
tbl = dict.fromkeys(i for i in _xrange(sys.maxunicode)
                    if unicodedata.category(_unichr(i)).startswith(('P', 'S')))

def remove_punctuation(text):
    """Return *text* with all Unicode punctuation and symbol chars removed."""
    return text.translate(tbl)
# Download and extract the NLTK Spanish stopword list if not yet present.
if not os.path.exists(os.path.join("stopwords", "spanish")):
    import requests
    import zipfile
    r = requests.get(
        "https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip?raw=true")
    # Fail early on HTTP errors instead of silently writing an error
    # page to disk as "stopwords.zip".
    r.raise_for_status()
    with open("stopwords.zip", "wb") as f:
        f.write(r.content)
    # Close the archive after extraction instead of leaking the handle.
    with zipfile.ZipFile("stopwords.zip") as z:
        z.extractall()
# Load the Spanish stopword list, one word per line, decoded as UTF-8.
with codecs.open(os.path.join("stopwords", "spanish"), "r", "utf-8") as f:
    stopwords = [line.strip() for line in f]
# GrAF parser and the accumulators filled by the traversal loop below.
parser = graf.GraphParser()
all_dicts_frame = None
parsed_first = False
# spa_to_indi: Spanish word -> set of "translation|source-id" strings.
spa_to_indi = collections.defaultdict(set)
indi = set()  # all non-Spanish translation strings seen so far
spa = set()   # all Spanish head/translation words seen so far
spa_to_spa = []  # lists of Spanish words that co-occur in one annotation
# Traverse every dictionary source and collect, per entry, the Spanish
# head/translation words and the non-Spanish annotation strings. Each
# Spanish word of an entry is then linked to all non-Spanish
# translations of that same entry.
for d in dict_sources:
    for f in glob.glob(os.path.join(d, "dict-*-dictinterpretation.xml")):
        #print("Parsing {0}...".format(f))
        graf_graph = parser.parse(f)
        for (node_id, node) in graf_graph.nodes.items():
            # Only "entry" nodes are processed; their out-edges point to
            # the individual annotations (heads and translations).
            if node_id.endswith("..entry"):
                entry_spa = set()
                spa_to_spa_tmp = list()
                others = set()
                for e in node.out_edges:
                    if e.annotations.get_first().label == "head" or e.annotations.get_first().label == "translation":
                        # get lang
                        for n in e.to_node.links[0][0].nodes:
                            if n.annotations.get_first().label == "iso-639-3":
                                if n.annotations.get_first().features.get_value("substring") == "spa":
                                    # Spanish annotation: strip punctuation,
                                    # split into words and drop stopwords.
                                    substr = remove_punctuation(e.to_node.annotations.get_first().features.get_value("substring"))
                                    collo = set()
                                    for w in substr.split(" "):
                                        if w not in stopwords:
                                            entry_spa.add(w)
                                            collo.add(w)
                                    # Multi-word annotations are remembered as
                                    # collocations (subtracted from the
                                    # similarity matrix later on).
                                    if len(collo) > 1:
                                        spa_to_spa_tmp.append(list(collo))
                                    break
                                else:
                                    # Non-Spanish annotation: tag the string
                                    # with the source id so identical strings
                                    # from different sources stay distinct.
                                    trans = u"{0}|{1}".format(e.to_node.annotations.get_first().features.get_value("substring"), d)
                                    others.add(trans)
                                    break
                # Only keep entries that have both a Spanish and a
                # non-Spanish side; link every Spanish word of the entry
                # to every non-Spanish translation of the entry.
                if len(entry_spa) > 0 and len(others) > 0:
                    #spa_to_spa.append(list(entry_spa))
                    spa_to_spa.extend(spa_to_spa_tmp)
                    for head in entry_spa:
                        for translation in others:
                            spa_to_indi[head].add(translation)
                            spa.add(head)
                            indi.add(translation)
# Free the memory held by the parsed GrAF graphs before building matrices.
import gc
gc.collect()
30862
# Freeze the word sets into lists so every word gets a stable index,
# then build word -> index lookup tables for the matrix rows/columns.
spa = list(spa)
indi = list(indi)
indi_indices = dict((w, i) for i, w in enumerate(indi))
spa_indices = dict((w, i) for i, w in enumerate(spa))
# Sparse co-occurrence matrix: one row per non-Spanish translation, one
# column per Spanish word. LIL format allows cheap incremental filling.
all_dicts_cooc = scipy.sparse.lil_matrix((len(indi), len(spa)))
len(spa)
45515
# Set a 1 for every (translation, Spanish word) pair that occurred
# together in some dictionary entry.
for col, head_word in enumerate(spa):
    for translation in spa_to_indi[head_word]:
        all_dicts_cooc[indi_indices[translation], col] = 1
# Count, for every ordered pair of Spanish words, how often they appear
# together inside the same multi-word annotation (collocation).
all_dicts_spa_collo = scipy.sparse.lil_matrix((len(spa), len(spa)))
for collocation in spa_to_spa:
    for idx, first in enumerate(collocation[:-1]):
        for second in collocation[idx + 1:]:
            all_dicts_spa_collo[spa_indices[first], spa_indices[second]] += 1
spa_to_spa[19764]
[u'quedar', u'carbonizado']
Matrix products are not efficient on LIL matrices, so we convert them to CSC format before multiplying:
# LIL is good for incremental construction but slow for arithmetic;
# convert both matrices to CSC for the products below.
all_dicts_cooc = scipy.sparse.csc_matrix(all_dicts_cooc)
all_dicts_spa_collo = scipy.sparse.csc_matrix(all_dicts_spa_collo)
# (indi x spa)^T * (indi x spa) -> (spa x spa): entry [i, j] counts how
# many non-Spanish translations Spanish words i and j share.
spa_similarity = all_dicts_cooc.T * all_dicts_cooc
# Subtract counts that stem from two words appearing in the same Spanish
# phrase (collocations) rather than sharing a translation.
spa_similarity_without_collo = spa_similarity - all_dicts_spa_collo
# Build a networkx graph from the similarity matrix: nodes are matrix
# indices, edge weights are shared-translation counts.
# NOTE(review): spa_similarity_without_collo is computed above but never
# used -- confirm whether the graph should be built from it instead.
g = networkx.Graph(spa_similarity)
#solitary = [ n for n, d in g.degree_iter() if d==2 ]
#g.remove_nodes_from(solitary)
# Relabel the integer node ids with the actual Spanish words.
labels = dict(zip(range(len(spa)), spa))
#labels = { k: v for k,v in enumerate(spa) if k in g }
g2 = networkx.relabel_nodes(g, labels)
# Extract the neighbourhood of one Spanish word, keeping only the
# neighbours connected by at least `cutoff` shared translations.
word = u"casa"
cutoff = 50
neighbours = g2[word]
comer_graph = networkx.Graph()
comer_graph.add_node(word)
for neighbour in neighbours:
    weight = neighbours[neighbour]['weight']
    if weight >= cutoff:
        comer_graph.add_node(neighbour)
        comer_graph.add_edge(word, neighbour, weight=weight)
len(comer_graph)
18
from networkx.readwrite import json_graph
import json
# Serialize the neighbourhood graph in node-link format for the d3.js
# visualization below.
comer_json = json_graph.node_link_data(comer_graph)
#json.dump(bodyparts_json, codecs.open("bodyparts_graph.json", "w", "utf-8"))
from IPython.display import HTML, Javascript
from IPython.core.display import display
# HTML scaffold for the visualization: edge CSS, the d3.js library and
# the target <div> into which the force layout renders.
html = """
<style>
.link {
stroke: #999;
stroke-opacity: .6;
}
.link:hover {
stroke: #000;
stroke-opacity: 1.0;
}
</style>
<script src="http://d3js.org/d3.v3.min.js"></script>
<div id="nav"></div>
"""
# d3.js (v3) force-layout script. The graph JSON and the cutoff value
# (used to scale the edge stroke width) are spliced into the script via
# string concatenation below.
javascript = """
var color = d3.scale.category20();
var width = 500,
height = 400;
var svg = d3.select("#nav").append("svg")
.attr("width", width)
.attr("height", height);
var force = d3.layout.force()
.gravity(.05)
.distance(100)
.charge(-250)
.size([width, height]);
var json = """ + json.dumps(comer_json) + """;
//d3.json("bodyparts_graph.json", function(error, json) {
force
.nodes(json.nodes)
.links(json.links)
.start();
var link = svg.selectAll("line.link")
.data(json.links)
.enter().append("line")
.attr("class", "link")
.style("stroke-width", function(d) { return d.weight/""" + str(cutoff) + """; });
var node = svg.selectAll("circle.node")
.data(json.nodes)
.enter().append("g")
.attr("class", "node")
.call(force.drag);
node.append("circle")
.attr("r", 5);
//.style("fill", function(d) { return color(d.group); })
node.append("text")
.attr("dx", 12)
.attr("dy", ".35em")
.text(function(d) { return d.id });
force.on("tick", function() {
link.attr("x1", function(d) { return d.source.x; })
.attr("y1", function(d) { return d.source.y; })
.attr("x2", function(d) { return d.target.x; })
.attr("y2", function(d) { return d.target.y; });
node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
});
//});
"""
# Render the HTML scaffold first, then execute the d3 force-layout script.
display(HTML(html))
display(Javascript(javascript))