#!/usr/bin/env python
# coding: utf-8
# # Cooccurrences of lexemes between the books of the Hebrew Bible
# # Research Question
# What does linguistic variation between the books of the Bible tell us about their origin, and about the evolution and transmission of their texts?
# # Method
# We study the co-occurrences of lexemes across the books of the Bible and represent this data as an undirected weighted graph, in which the books are the nodes.
# There is an edge between every pair of books that share at least one lexeme.
# Edges are weighted: the more lexemes a pair of books shares, the heavier the edge. However, the weight is also corrected and normalized:
#
# * *correction*: frequent lexemes contribute less to the weight than rare lexemes,
# * *normalization*: the weight contribution of a lexeme is divided by the number of lexemes in the union of two books.
#
# The initial plan was to consider only common nouns, but we also experiment with proper nouns, verbs, the combination of verbs and common nouns, and all lexemes.
# Moreover, we experiment with two methods of normalization:
#
# * *normal*: divide by the number of distinct lexemes in the union of the two books,
# * *quadratic*: as in *normal*, but divide by the square of that number.
#
# More formally:
#
# Let $B$ be the set of books in the Bible.
#
# The *support* of a lexeme $l$ is defined as $S(l) = card\{b \in B\ \vert\ l \in b\}$.
#
# The lexeme content of book $b$ is defined as $L(b) = \{l\ \vert\ l \in b\}$,
#
# and the lexeme content of two books $b_1$ and $b_2$ is defined as $L(b_1, b_2) = L(b_1)\ \cup\ L(b_2)$.
#
# The *co-occurrence* of these two books is defined as $C(b_1, b_2) = L(b_1)\ \cap\ L(b_2)$.
#
# We now define two measures for the *weight* of a co-occurrence edge between two books $b_1$ and $b_2$:
#
# $$W_1(b_1,b_2) = {\sum \{{1\over S(l)}\ \vert\ l \in C(b_1, b_2)\} \over card\,L(b_1, b_2)}$$
#
# $$W_2(b_1,b_2) = {\sum \{{1\over S(l)}\ \vert\ l \in C(b_1, b_2)\} \over (card\,L(b_1, b_2))^2}$$
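#
# To make these definitions concrete, here is a small sketch that computes $W_1$ and $W_2$ for two toy "books". The lexeme names and support counts are invented purely for illustration; they do not come from the corpus.
# In[ ]:
# Illustration only: toy lexeme sets and hypothetical support values S(l).
toy_support = {'MLK': 30, 'DBR': 35, 'XSD': 5}
toy_b1 = {'MLK', 'DBR', 'XSD'}                  # L(b1)
toy_b2 = {'MLK', 'XSD', 'NPC'}                  # L(b2)
toy_common = toy_b1 & toy_b2                    # C(b1, b2)
toy_union = toy_b1 | toy_b2                     # L(b1, b2)
toy_raw = sum(1.0 / toy_support[l] for l in toy_common)
print(toy_raw / len(toy_union))                 # W1: normal
print(toy_raw / len(toy_union) ** 2)            # W2: quadratic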
# # Compute
# Import the standard Python modules, the plotting module, and the LAF-Fabric module (``laf``), and initialize the ``laf`` processor.
# In[1]:
import sys
import collections
import matplotlib.pyplot as plt
from laf.fabric import LafFabric
fabric = LafFabric()
# Load the data, especially the features we need.
# Note that the task will be named *cooccurrences*.
# After loading we retrieve the names by which we can access the various pieces of the LAF data.
# In[2]:
fabric.load('etcbc4', '--', 'cooccurrences', {
    "xmlids": {"node": False, "edge": False},
    "features": ("otype sp lex_utf8 book", ""),
})
exec(fabric.localnames.format(var='fabric'))
# For your convenience:
#
# * *NN*: iterator of nodes in primary data order
# * *F*: feature data
#
# You can inspect the API by giving commands like ``F.*?``, ``NN??``
# In[3]:
get_ipython().run_line_magic('psearch', 'F.*')
# In[4]:
get_ipython().run_line_magic('pinfo2', 'NN')
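# As a quick sanity check we can walk the first few nodes that ``NN()`` yields and print their object type. This is just a peek, assuming the load above succeeded; the full traversal follows below.
# In[ ]:
# Peek at the first few nodes in primary data order.
for (i, n) in enumerate(NN()):
    if i >= 5:
        break
    print(i, F.otype.v(n))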
# We are going to generate data files for [Gephi](https://gephi.org), in its native XML format.
# Here we specify the subtasks and the weighting methods.
#
# * *Subtasks* correspond to the kinds of lexemes we are counting.
# * *Methods* correspond to the kind of normalization we apply: dividing by the number of distinct lexemes in the union of the two books, or by the square of that number.
#
# We also spell out the XML header of a Gephi file.
# In[3]:
tasks = {
    'noun_common': {
        '1': outfile("noun_common_1.gexf"),
        '2': outfile("noun_common_2.gexf"),
    },
    'noun_proper': {
        '1': outfile("noun_proper_1.gexf"),
        '2': outfile("noun_proper_2.gexf"),
    },
    'verb': {
        '1': outfile("verb_1.gexf"),
        '2': outfile("verb_2.gexf"),
    },
    'verb-noun_common': {
        '1': outfile("verb-noun_common_1.gexf"),
        '2': outfile("verb-noun_common_2.gexf"),
    },
    'all': {
        '1': outfile("all_1.gexf"),
        '2': outfile("all_2.gexf"),
    },
}
methods = {
    '1': lambda x, y: float(x) / y,       # normal: divide by y
    '2': lambda x, y: float(x) / y / y,   # quadratic: divide by y squared
}
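# The keys ``'1'`` and ``'2'`` of ``methods`` correspond to $W_1$ and $W_2$ above: the raw weight sum is divided once, or twice, by the combined lexeme count. A tiny check with invented numbers (a raw sum of 3.0 and a combined size of 1200):
# In[ ]:
# Illustration with invented numbers: raw weight sum 3.0, combined size 1200.
print(methods['1'](3.0, 1200))   # normal: 3.0 / 1200
print(methods['2'](3.0, 1200))   # quadratic: 3.0 / 1200 ** 2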
# The GEXF preamble: XML declaration, metadata, and the opening <graph> element.
data_header = '''<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">
<meta><creator>LAF-Fabric</creator></meta>
<graph defaultedgetype="undirected">
'''
# Initialization
# In[4]:
book_name = None
books = []
# lexemes[subtask][book][lexeme] = number of occurrences of lexeme in book
lexemes = collections.defaultdict(
    lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: 0)))
# lexeme_support_book[subtask][lexeme][book] = 1 if lexeme occurs in book
lexeme_support_book = collections.defaultdict(
    lambda: collections.defaultdict(lambda: {}))
# Walk through the relevant nodes and collect the data:
# In[5]:
for node in NN():
    this_type = F.otype.v(node)
    if this_type == "word":
        lexeme = F.lex_utf8.v(node)
        lexemes['all'][book_name][lexeme] += 1
        lexeme_support_book['all'][lexeme][book_name] = 1
        p_o_s = F.sp.v(node)
        if p_o_s == "subs":
            lexemes['noun_common'][book_name][lexeme] += 1
            lexeme_support_book['noun_common'][lexeme][book_name] = 1
            lexemes['verb-noun_common'][book_name][lexeme] += 1
            lexeme_support_book['verb-noun_common'][lexeme][book_name] = 1
        elif p_o_s == 'nmpr':
            lexemes['noun_proper'][book_name][lexeme] += 1
            lexeme_support_book['noun_proper'][lexeme][book_name] = 1
        elif p_o_s == "verb":
            lexemes['verb'][book_name][lexeme] += 1
            lexeme_support_book['verb'][lexeme][book_name] = 1
            lexemes['verb-noun_common'][book_name][lexeme] += 1
            lexeme_support_book['verb-noun_common'][lexeme][book_name] = 1
    elif this_type == "book":
        book_name = F.book.v(node)
        books.append(book_name)
        msg("{} ".format(book_name))
msg("Done")
# Organize the data per subtask and compute the edges with their weights.
# In[6]:
nodes_header = '''<nodes count="{}">\n'''.format(len(books))
for this_type in tasks:
    lexeme_support = {}
    for lexeme in lexeme_support_book[this_type]:
        lexeme_support[lexeme] = len(lexeme_support_book[this_type][lexeme])
    book_size = collections.defaultdict(lambda: 0)
    for book in lexemes[this_type]:
        book_size[book] = len(lexemes[this_type][book])
    node_data = []
    for node in range(len(books)):
        node_data.append('''<node id="{}" label="{}"/>\n'''.format(node + 1, books[node]))
    edge_id = 0
    edge_data = collections.defaultdict(lambda: [])
    for src in range(len(books)):
        for tgt in range(src + 1, len(books)):
            book_src = books[src]
            book_tgt = books[tgt]
            lexemes_src = lexemes[this_type][book_src]
            lexemes_tgt = lexemes[this_type][book_tgt]
            intersection_size = 0
            raw_weight = 0
            for lexeme in lexemes_src:
                if lexeme not in lexemes_tgt:
                    continue
                # each shared lexeme contributes 1/S(l); the factor 1000 merely
                # scales the weights into a range that is convenient in Gephi
                raw_weight += 1000.0 / lexeme_support[lexeme]
                intersection_size += 1
            combined_size = book_size[book_src] + book_size[book_tgt] - intersection_size
            edge_id += 1
            for this_method in tasks[this_type]:
                edge_data[this_method].append('''<edge id="{}" source="{}" target="{}" weight="{}"/>\n'''.
                    format(edge_id, src + 1, tgt + 1, methods[this_method](raw_weight, combined_size)))
    for this_method in tasks[this_type]:
        edges_header = '''<edges count="{}">\n'''.format(len(edge_data[this_method]))
        out_file = tasks[this_type][this_method]
        out_file.write(data_header)
        out_file.write(nodes_header)
        for node_line in node_data:
            out_file.write(node_line)
        out_file.write("</nodes>\n")
        out_file.write(edges_header)
        for edge_line in edge_data[this_method]:
            out_file.write(edge_line)
        out_file.write("</edges>\n")
        out_file.write("</graph>\n</gexf>\n")
    msg("{}: nodes: {}; edges: {}".format(this_type, len(books), edge_id))
close()
# In[10]:
get_ipython().system("head -n 100 {my_file('verb-noun_common_1.gexf')}")
# # Visualization
# The output files can be loaded into Gephi and subjected to various graph rendering algorithms.
# After some experimenting in Gephi you can obtain an instructive layout.
#
# The Python module *networkx* is also capable of graph layout; let us try its most common layout methods.
# In[8]:
get_ipython().run_line_magic('matplotlib', 'inline')
import networkx as nx
# In[9]:
g_nc1 = nx.read_gexf(my_file('verb-noun_common_1.gexf'), relabel=True)
# In[10]:
nx.draw_spring(g_nc1)
# In[11]:
nx.draw_circular(g_nc1)
# In[12]:
nx.draw_spectral(g_nc1)
# In[13]:
nx.draw_shell(g_nc1)
# In[14]:
nx.draw_random(g_nc1)
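# The default drawings show unlabelled dots. To see which node is which book, the same drawing functions accept the usual ``networkx`` keyword arguments; for instance (parameter values chosen ad hoc):
# In[ ]:
# Repeat the spring layout, now with book names as labels.
nx.draw_spring(g_nc1, with_labels=True, node_size=50, font_size=8)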