For this tutorial we will use data from the project "Quantitative Historical Linguistics". The website of the project provides a ZIP package of GrAF/XML files for the printed sources that were digitized within the project:
http://www.quanthistling.info/data/downloads/xml/data.zip
The ZIP package contains several files encoded as described in the ISO standard 24612 "Linguistic annotation framework (LAF)". The QuantHistLing data contains dictionary and wordlist sources. Those were first tokenized into entries; for each entry you will find annotations for at least the head word(s) ("head" annotation) and translation(s) ("translation" annotation) in the case of dictionaries. We will only use the dictionaries of the "Witotoan" component in this tutorial. The ZIP package also contains a CSV file "sources.csv" with some information for each source, for example the languages as ISO codes, type of source, etc. Be aware that the ZIP package contains a filtered version of the sources: only entries that contain a Spanish word that is part of the Spanish Swadesh list are included in the download package.
For a simple example of how to parse one of the sources, please see here:
http://graf-python.readthedocs.org/en/latest/Querying%20GrAF%20graphs.html
The following Python libraries are required to process the GrAF/XML files, calculate the co-occurrence matrices and visualize the polysemy:
To visualize the graphs we use the D3.js library, but we will load this on-the-fly when we start with the visualization.
import os
import sys
import csv
import codecs
import collections
import glob
import re
import graf
import scipy.sparse
import networkx
In the first step we download and extract the data. You may change to a local "tmp" directory before the download, or just download the data to the current working directory. For this you need to install the Python library requests. You may also download and extract the data manually; the data is only downloaded for you if the file sources.csv is not found.

## Get the sources
Change to the directory where you extracted the ZIP archive that you downloaded from the QuantHistLing website:
# Download and unpack the QuantHistLing data if it is not present yet.
# NOTE(review): the absolute path below is machine-specific — change it to
# the directory where you extracted (or want to download) the data.
os.chdir("/Users/pbouda/Projects/git-github/notebooks/polysemy")
if not os.path.exists("sources.csv"):
    import requests
    import zipfile
    r = requests.get(
        "http://www.quanthistling.info/data/downloads/xml/data.zip")
    # Fail early on an HTTP error instead of writing an error page to disk
    # and trying to unzip it.
    r.raise_for_status()
    with open("data.zip", "wb") as f:
        f.write(r.content)
    # Context manager closes the archive handle (the original leaked it).
    with zipfile.ZipFile("data.zip") as z:
        z.extractall()
Now we open the file "sources.csv" and read out all the sources that are dictionaries. We will store a list of those sources in dict_sources:
# Collect the IDs of all sources of type "dictionary" from sources.csv.
# The file is tab-separated; column 0 is the source ID ("QLCID" in the
# header row, which we skip), column 1 is the type of source.
# NOTE(review): mode "rU" is Python-2-era universal-newline mode and was
# removed in Python 3.11 — switch to newline="" if you move to Python 3.
dict_sources = []
with open("sources.csv", "rU") as csv_file:  # close the handle deterministically
    for source in csv.reader(csv_file, delimiter="\t"):
        if source[0] != "QLCID" and source[1] == "dictionary":
            dict_sources.append(source[0])
# Load the Spanish body-part terms, one term per line.
with codecs.open("body-part-terms-spanish.txt", "r", "utf-8") as terms_file:
    bodyparts = terms_file.read().splitlines()
# Download the NLTK stopword lists if the Spanish one is not available yet.
if not os.path.exists(os.path.join("stopwords", "spanish")):
    import requests
    import zipfile
    r = requests.get(
        "https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip?raw=true")
    # Abort on a failed download rather than unzipping an error page.
    r.raise_for_status()
    with open("stopwords.zip", "wb") as f:
        f.write(r.content)
    # Context manager closes the archive handle (the original leaked it).
    with zipfile.ZipFile("stopwords.zip") as z:
        z.extractall()
# Read the Spanish stopword list and build one regular expression that
# matches any stopword as a whole word.
stopwords = []
with codecs.open(os.path.join("stopwords", "spanish"), "r", "utf-8") as f:
    for line in f:
        word = line.strip()
        if word:  # skip blank lines — an empty alternative would match everywhere
            stopwords.append(word)

# BUG FIX: the original pattern u"(?={0})" was a zero-width lookahead, so
# re_stopwords.sub("", ...) replaced an empty match with an empty string —
# a no-op that never removed a single stopword.  Match the words themselves,
# escaped and bounded by \b so only whole words are removed.
# NOTE(review): removing an embedded stopword can leave extra whitespace in
# the result (e.g. "el brazo" -> " brazo"); confirm whether callers should
# also strip the result before comparing against the body-part list.
re_stopwords_str = u"\\b(?:{0})\\b".format(
    u"|".join(re.escape(word) for word in stopwords))
re_stopwords = re.compile(re_stopwords_str, re.UNICODE)
import unicodedata

# Compatibility shims: xrange/unichr exist only on Python 2; on Python 3
# range/chr are the equivalents.  The original used the Python-2 names
# unconditionally and would crash on Python 3.
try:  # Python 2
    _code_points = xrange
    _to_char = unichr
except NameError:  # Python 3
    _code_points = range
    _to_char = chr

# Map every Unicode punctuation (P*) and symbol (S*) code point to None so
# str.translate() deletes them in a single pass.  The category's first
# letter is its major class, so one category() call per code point suffices
# (the original called it twice).
tbl = dict.fromkeys(
    i for i in _code_points(sys.maxunicode)
    if unicodedata.category(_to_char(i))[0] in ("P", "S"))


def remove_punctuation(text):
    """Return *text* with all Unicode punctuation and symbol characters removed."""
    return text.translate(tbl)
# Set up the GrAF parser and the containers filled by the extraction loop.
parser = graf.GraphParser()

# Maps each Spanish head word to the list of indigenous translations; one
# append per dictionary entry, so the same translation may appear twice.
spa_to_indi = collections.defaultdict(list)

# Distinct indigenous translations and Spanish head words seen so far.
indi = set()
spa = set()
# Walk every dictionary source and, per dictionary entry, collect the
# Spanish head/translation strings that are body-part terms together with
# the indigenous-language strings of the same entry.
# NOTE(review): the indentation of this block was reconstructed from a
# whitespace-mangled copy — verify the nesting against the original notebook.
for d in dict_sources:
    for f in glob.glob(os.path.join(d, "dict-*-dictinterpretation.xml")):
        #print("Parsing {0}...".format(f))
        graf_graph = parser.parse(f)
        for (node_id, node) in graf_graph.nodes.items():
            # Only "entry" nodes group a head word with its translations.
            if node_id.endswith("..entry"):
                entry_spa = set()   # Spanish body-part terms of this entry
                #spa_to_spa_tmp = list()
                entry_indi = set()  # indigenous strings of this entry, tagged with the source ID
                for e in node.out_edges:
                    if e.annotations.get_first().label == "head" or e.annotations.get_first().label == "translation":
                        # get lang
                        for n in e.to_node.links[0][0].nodes:
                            if n.annotations.get_first().label == "iso-639-3":
                                if n.annotations.get_first().features.get_value("substring") == "spa":
                                    substr = e.to_node.annotations.get_first().features.get_value("substring")
                                    # Normalize: drop punctuation/symbols, then stopwords.
                                    substr = remove_punctuation(substr)
                                    substr = re_stopwords.sub("", substr)
                                    # Keep only terms from the Spanish body-part list.
                                    if substr in bodyparts:
                                        entry_spa.add(substr)
                                    break
                                else:
                                    # Non-Spanish string: treat as indigenous.  Tag it with
                                    # the source ID so identical strings from different
                                    # dictionaries stay distinct.
                                    substr = e.to_node.annotations.get_first().features.get_value("substring")
                                    entry_indi.add(u"{0}|{1}".format(substr, d))
                                    break
                # Record every (head, translation) co-occurrence of this entry.
                if len(entry_spa) > 0:
                    for head in entry_spa:
                        for translation in entry_indi:
                            spa_to_indi[head].append(translation)
                            spa.add(head)
                            indi.add(translation)
# Freeze the sets into lists so each Spanish head / indigenous translation
# gets a stable column / row index in the co-occurrence matrix.
spa = list(spa)
indi = list(indi)

# Precompute translation -> row once: the original used indi.index(trans)
# inside the loop, an O(n) scan per translation that made the whole fill
# accidentally quadratic.
indi_row = dict((term, row) for row, term in enumerate(indi))

# Binary matrix: rows are indigenous translations, columns Spanish heads;
# a 1 means the pair co-occurred in at least one dictionary entry.
all_dicts_cooc = scipy.sparse.lil_matrix((len(indi), len(spa)))
for col, head in enumerate(spa):
    for trans in spa_to_indi[head]:
        all_dicts_cooc[indi_row[trans], col] = 1

# CSC supports fast matrix products; C^T * C counts, for every pair of
# Spanish heads, how many indigenous translations they share.
all_dicts_cooc = scipy.sparse.csc_matrix(all_dicts_cooc)
spa_similarity = all_dicts_cooc.T * all_dicts_cooc
# Build an undirected graph from the head-by-head similarity matrix.
g = networkx.Graph(spa_similarity)

# Drop heads connected to nothing but themselves.  Degree 2 presumably
# comes from the self-loop each head gets from the positive matrix
# diagonal (a self-loop counts twice toward degree) — confirm against the
# networkx version in use.
solitary = [node for node, degree in g.degree_iter() if degree == 2]
g.remove_nodes_from(solitary)

# Replace the integer node ids by the Spanish head words.
labels = {index: head for index, head in enumerate(spa)}
g2 = networkx.relabel_nodes(g, labels)
# Export the labelled graph as node-link JSON for the D3.js visualization.
from networkx.readwrite import json_graph
import json

bodyparts_json = json_graph.node_link_data(g2)
# Context manager flushes and closes the file even on error (the original
# never closed the handle returned by codecs.open).
with codecs.open("bodyparts_graph.json", "w", "utf-8") as json_file:
    json.dump(bodyparts_json, json_file)