#!/usr/bin/env python
# coding: utf-8

# # Cooccurrences of lexemes between the books of the Hebrew Bible

# # Research Question
# What does linguistic variation between bible books tell us about their origin, and about the evolution and transmission of their texts?

# # Method
# We study the co-occurrences of lexemes across the books of the bible and represent this data in an undirected weighted graph, in which the books are the nodes.
# There is an edge between every pair of books that share a lexeme occurrence.
# Edges are weighted: the more lexemes are shared by a pair of books, the heavier the edge. However, the weight is also corrected and normalized:
# 
# * *correction*: frequent lexemes contribute less to the weight than rare lexemes,
# * *normalization*: the weight contribution of a lexeme is divided by the number of lexemes in the union of the two books.
# 
# The initial plan was to consider only common nouns, but we are also experimenting with nouns in general, with verbs, and with all lexemes.
# Moreover, we experiment with two measures of normalization:
# 
# * *normal*: divide by the number of distinct lexemes occurring in the two books together (the cardinality of their union),
# * *quadratic*: as in *normal*, but divide by the square of that number.
# 
# More formally:
# 
# Let $B$ be the set of books in the Bible.
# 
# The *support* of a lexeme $l$ is defined as $S(l) = card\,\{b \in B\ \vert\ l \in b\}$.
# 
# The lexeme content of a book $b$ is defined as $L(b) = \{l\ \vert\ l \in b\}$,
# 
# and the lexeme content of two books $b_1$ and $b_2$ is defined as $L(b_1, b_2) = L(b_1)\ \cup\ L(b_2)$.
# 
# The co-occurrence of those two books is defined as $C(b_1, b_2) = L(b_1)\ \cap\ L(b_2)$.
# 
# We now define two measures for the *weight* of a co-occurrence edge between two books $b_1$ and $b_2$:
# 
# $$W_1(b_1,b_2) = {\sum \{{1\over S(l)}\ \vert\ l \in C(b_1, b_2)\} \over card\,L(b_1, b_2)}$$
# 
# $$W_2(b_1,b_2) = {\sum \{{1\over S(l)}\ \vert\ l \in C(b_1, b_2)\} \over (card\,L(b_1, b_2))^2}$$

# # Compute
# Import the Python modules, the plot modules, and the LAF-Fabric module (``laf``), and initialize the ``laf`` processor.

# In[1]:


import sys
import collections
import matplotlib.pyplot as plt
from laf.fabric import LafFabric
fabric = LafFabric()


# Load the data, especially the features we need.
# Note that the task will be named *cooccurrences*.
# After loading we retrieve the names by which we can access the various pieces of the LAF data.

# In[2]:


fabric.load('etcbc4', '--', 'cooccurrences', {
    "xmlids": {"node": False, "edge": False},
    "features": ("otype sp lex_utf8 book", ""),
})
exec(fabric.localnames.format(var='fabric'))


# For your convenience:
# 
# * *NN*: iterator of nodes in primary data order
# * *F*: feature data
# 
# You can inspect the API by giving commands like ``F.*?``, ``NN??``

# In[3]:


get_ipython().run_line_magic('psearch', 'F.*')


# In[4]:


get_ipython().run_line_magic('pinfo2', 'NN')


# We are going to generate data files for [Gephi](https://gephi.org), in its native XML format.
# Here we specify the subtasks and weighting methods; a small, illustrative computation of both weight formulas follows right after this list.
# 
# * *Subtasks* correspond to the kind of lexemes we are counting.
# * *Methods* correspond to the kind of normalization that we are applying: dividing by the number of distinct lexemes in the two books, or by the square of that number.
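
# The next cell is a minimal, self-contained sketch of $W_1$ and $W_2$ on made-up lexeme sets for three hypothetical books; the book names and lexemes are invented for illustration only, and the actual computation over the ETCBC data comes further down.

# In[ ]:


# Toy data: three "books" with hand-picked lexeme sets.
toy_books = {
    'BookA': {'L1', 'L2', 'L3', 'L4'},
    'BookB': {'L2', 'L3', 'L5'},
    'BookC': {'L2', 'L6'},
}

def toy_support(lexeme):
    # S(l): the number of toy books in which the lexeme occurs
    return sum(1 for lexs in toy_books.values() if lexeme in lexs)

def toy_weights(b1, b2):
    common = toy_books[b1] & toy_books[b2]            # C(b1, b2)
    union_size = len(toy_books[b1] | toy_books[b2])   # card L(b1, b2)
    raw = sum(1 / toy_support(l) for l in common)     # sum of 1/S(l)
    return raw / union_size, raw / union_size ** 2    # W1, W2

w1, w2 = toy_weights('BookA', 'BookB')
print('W1 = {:.4f}, W2 = {:.4f}'.format(w1, w2))
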
# We also spell out the XML header of a Gephi file.

# In[3]:


tasks = {
    'noun_common': {
        '1': outfile("noun_common_1.gexf"),
        '2': outfile("noun_common_2.gexf"),
    },
    'noun_proper': {
        '1': outfile("noun_proper_1.gexf"),
        '2': outfile("noun_proper_2.gexf"),
    },
    'verb': {
        '1': outfile("verb_1.gexf"),
        '2': outfile("verb_2.gexf"),
    },
    'verb-noun_common': {
        '1': outfile("verb-noun_common_1.gexf"),
        '2': outfile("verb-noun_common_2.gexf"),
    },
    'all': {
        '1': outfile("all_1.gexf"),
        '2': outfile("all_2.gexf"),
    },
}
# Correction functions, applied per shared lexeme; below they are called with
# x = 1000 and y = the support of the lexeme.
methods = {
    '1': lambda x, y: float(x) / y,
    '2': lambda x, y: float(x) / y / y,
}
# The GEXF preamble; the matching closing tags are written at the end of each file.
data_header = '''<?xml version="1.0" encoding="UTF-8"?>
<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">
  <meta>
    <creator>LAF-Fabric</creator>
  </meta>
  <graph defaultedgetype="undirected">
'''


# Initialization

# In[4]:


book_name = None
books = []
lexemes = collections.defaultdict(
    lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: 0)))
lexeme_support_book = collections.defaultdict(
    lambda: collections.defaultdict(lambda: {}))


# Walk through the relevant nodes and collect the data:

# In[5]:


for node in NN():
    this_type = F.otype.v(node)
    if this_type == "word":
        lexeme = F.lex_utf8.v(node)
        lexemes['all'][book_name][lexeme] += 1
        lexeme_support_book['all'][lexeme][book_name] = 1
        p_o_s = F.sp.v(node)
        if p_o_s == "subs":
            lexemes['noun_common'][book_name][lexeme] += 1
            lexeme_support_book['noun_common'][lexeme][book_name] = 1
            lexemes['verb-noun_common'][book_name][lexeme] += 1
            lexeme_support_book['verb-noun_common'][lexeme][book_name] = 1
        elif p_o_s == 'nmpr':
            lexemes['noun_proper'][book_name][lexeme] += 1
            lexeme_support_book['noun_proper'][lexeme][book_name] = 1
        elif p_o_s == "verb":
            lexemes['verb'][book_name][lexeme] += 1
            lexeme_support_book['verb'][lexeme][book_name] = 1
            lexemes['verb-noun_common'][book_name][lexeme] += 1
            lexeme_support_book['verb-noun_common'][lexeme][book_name] = 1
    elif this_type == "book":
        book_name = F.book.v(node)
        books.append(book_name)
        msg("{} ".format(book_name))
msg("Done")
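
# As a quick, optional check (purely illustrative, and not needed for the rest of the pipeline) we can glance at the sizes of the data structures we have just filled:

# In[ ]:


# For each subtask: how many books were seen, how many distinct lexemes were
# registered, and how many distinct lexemes the first book contributes.
first_book = books[0]
for task_name in sorted(lexemes):
    print('{:<17} books: {:>2}  lexemes: {:>6}  in {}: {:>5}'.format(
        task_name,
        len(lexemes[task_name]),
        len(lexeme_support_book[task_name]),
        first_book,
        len(lexemes[task_name][first_book]),
    ))
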

# Sort the data according to the various subtasks, and compute the edges with their weights.

# In[6]:


nodes_header = '''    <nodes count="{}">\n'''.format(len(books))
for this_type in tasks:
    lexeme_support = {}
    for lexeme in lexeme_support_book[this_type]:
        lexeme_support[lexeme] = len(lexeme_support_book[this_type][lexeme])
    book_size = collections.defaultdict(lambda: 0)
    for book in lexemes[this_type]:
        book_size[book] = len(lexemes[this_type][book])
    node_data = []
    for node in range(len(books)):
        node_data.append('''      <node id="{}" label="{}"/>\n'''.format(node + 1, books[node]))
    edge_id = 0
    edge_data = collections.defaultdict(lambda: [])
    for src in range(len(books)):
        for tgt in range(src + 1, len(books)):
            book_src = books[src]
            book_tgt = books[tgt]
            lexemes_src = lexemes[this_type][book_src]
            lexemes_tgt = lexemes[this_type][book_tgt]
            intersection_size = 0
            weights = collections.defaultdict(lambda: 0)
            for lexeme in lexemes_src:
                if lexeme not in lexemes_tgt:
                    continue
                pre_weight = lexeme_support[lexeme]
                for this_method in tasks[this_type]:
                    weights[this_method] += methods[this_method](1000, pre_weight)
                intersection_size += 1
            combined_size = book_size[book_src] + book_size[book_tgt] - intersection_size
            edge_id += 1
            for this_method in tasks[this_type]:
                edge_data[this_method].append('''      <edge id="{}" source="{}" target="{}" weight="{}"/>\n'''.format(
                    edge_id, src + 1, tgt + 1, weights[this_method] / combined_size))
    for this_method in tasks[this_type]:
        edges_header = '''    <edges count="{}">\n'''.format(len(edge_data[this_method]))
        out_file = tasks[this_type][this_method]
        out_file.write(data_header)
        out_file.write(nodes_header)
        for node_line in node_data:
            out_file.write(node_line)
        out_file.write("    </nodes>\n")
        out_file.write(edges_header)
        for edge_line in edge_data[this_method]:
            out_file.write(edge_line)
        out_file.write("    </edges>\n")
        out_file.write("  </graph>\n</gexf>\n")
    msg("{}: nodes: {}; edges: {}".format(this_type, len(books), edge_id))
close()


# In[10]:


get_ipython().system("head -n 100 {my_file('verb-noun_common_1.gexf')}")


# # Visualization
# The output files can be loaded into Gephi and subjected to various graph rendering algorithms.
# After some playing you can get this out of it:
# 
# *(figure: Gephi rendering of the book co-occurrence graph)*
# 
# The Python module *networkx* is also capable of graph layout; let us try the most obvious methods.

# In[8]:


get_ipython().run_line_magic('matplotlib', 'inline')
import networkx as nx


# In[9]:


g_nc1 = nx.read_gexf(my_file('verb-noun_common_1.gexf'), relabel=True)


# In[10]:


nx.draw_spring(g_nc1)


# In[11]:


nx.draw_circular(g_nc1)


# In[12]:


nx.draw_spectral(g_nc1)


# In[13]:


nx.draw_shell(g_nc1)


# In[14]:


nx.draw_random(g_nc1)
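
# The drawings above show the structure of the graph but not the weights. As a rough, illustrative complement (assuming ``read_gexf`` exposes the GEXF edge weight under the ``weight`` key of the edge data), we can list the most heavily connected pairs of books directly:

# In[ ]:


# The ten heaviest edges of the verb/common-noun graph loaded above.
heaviest = sorted(
    g_nc1.edges(data=True),
    key=lambda e: e[2].get('weight', 0),
    reverse=True,
)
for b1, b2, data in heaviest[:10]:
    print('{:<20} {:<20} {:.4f}'.format(b1, b2, data.get('weight', 0)))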