import sys
import collections
import random
%load_ext autoreload
%autoreload 2
import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
from etcbc.lib import Transcription, monad_set
from etcbc.trees import Tree
# LAF-Fabric processor object; its load() call below produces the API used in this notebook.
fabric = LafFabric()
# Transcription helper; get_tag() calls its to_syriac() on the consonants of a word.
tr = Transcription()
0.00s This is LAF-Fabric 4.3.3 http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
# Load the data for this notebook. According to the load log, 'calap' is the
# source, '--' the annox and 'trees' the task name.
API=fabric.load('calap', '--', 'trees', {
"xmlids": {"node": False, "edge": False},
# Features requested for this task; the code below reads all of them through F.
# NOTE(review): presumably the first string lists node features and the
# second (empty) string edge features -- confirm against the LAF-Fabric API docs.
"features": ('''
oid otype monads
surface_consonants
psp
phrase_type
verse_label
''',''),
# Preparation hook imported from etcbc.preprocess.
"prepare": prepare,
}, verbose='NORMAL')
# NOTE(review): this exec presumably injects the API names used unqualified
# below (F, NN, msg, outfile, close, my_file) into the notebook namespace.
# The executed string comes from the library, not from user input.
exec(fabric.localnames.format(var='fabric'))
0.00s LOADING API: please wait ... 0.00s INFO: USING DATA COMPILED AT: 2014-06-27T12-29-20 0.33s LOGFILE=/Users/dirk/laf-fabric-output/calap/trees/__log__trees.txt 0.38s INFO: DATA LOADED FROM SOURCE calap AND ANNOX -- FOR TASK trees AT 2014-07-15T16-05-01
Here we define the formatting of the trees.
Not all nodes will be shown in the output.
The nodes that are shown have abbreviated names.
Nodes with `True` will be shown, nodes with `False` will be suppressed.
Suppressing a node leaves its children in place. Another way of looking at it is: we replace a node by its children.
Exception: when a node is visited twice, the second visit refers to the tree built by the first visit. In that case, we do not suppress the node.
N.B. It turns out that the `-atom` nodes are never visited twice.
We abbreviate the part-of-speech tags. We include the pos-info by inserting a unary node right above each word.
# Node types that take part in the trees, each paired with the tag used for
# it in the output. Words have an empty tag.
type_info = (
    ("word", ''),
    ("phrase_atom", 'U'),
    ("phrase", 'P'),
    ("clause_atom", 'S'),
)
# Lookup from node type to output tag.
type_table = dict(type_info)
# The node types alone, in the order in which they are listed above.
type_order = [otype for (otype, _tag) in type_info]
# Abbreviations for the part-of-speech values, keyed by the full name.
pos_table = dict(
    adjective='aj',
    adverb='av',
    conjunction='cj',
    interjection='ij',
    interrogative='ir',
    negative='ng',
    noun='n',
    preposition='pp',
    pronoun='pr',
    verb='vb',
)
# Object types that make up a tree; the first is the root type, the last the leaf type.
tree_types = ('clause_atom', 'phrase', 'phrase_atom', 'word')
(root_type, leaf_type, clause_type) = (tree_types[0], tree_types[-1], 'clause_atom')

# Build the tree relations over the loaded data. The phrase type and the
# part of speech are read from the 'phrase_type' and 'psp' features.
tree = Tree(
    API,
    otypes=tree_types,
    clause_type=None,
    ccr_feature=None,
    pt_feature='phrase_type',
    pos_feature='psp',
    mother_feature=None,
)
tree.restructure_clauses(None)
results = tree.relations()
parent = results['rparent']
sisters = results['sisters']
children = results['rchildren']
elder_sister = results['elder_sister']

# Map every root node to the label of the verse node most recently seen in
# the NN() walk.
root_verse = {}
# Start with an empty label so that a root node seen before any verse node
# does not raise NameError, and so that re-running this cell never reuses a
# label left over from a previous run.
cur_verse = ''
for n in NN():
    otype = F.otype.v(n)
    if otype == 'verse':
        cur_verse = F.verse_label.v(n)
    elif otype == root_type:
        root_verse[n] = cur_verse
msg("Ready for processing")
0.00s LOADING API with EXTRAs: please wait ... 0.00s INFO: USING DATA COMPILED AT: 2014-06-27T12-29-20 0.10s INFO: DATA LOADED FROM SOURCE calap AND ANNOX -- FOR TASK trees AT 2014-07-15T16-05-08 0.00s Start computing parent and children relations for objects of type clause_atom, phrase, phrase_atom, word 1.29s 100000 nodes 1.82s 141611 nodes: 130181 have parents and 87691 have children 1.82s Restructuring clauses: deep copying tree relations 2.86s Ready for processing
def get_tag(node):
    """Describe ``node`` for the tree writer.

    Returns the tuple ``(tag, pos, monad, text, is_word)``:

    * ``tag``: the entry of ``type_table`` for the node's type; for phrases
      (table entry ``'P'``) the value of the ``phrase_type`` feature instead.
    * ``pos``: the abbreviated part of speech from ``pos_table``.
    * ``monad``: the ``monads`` feature converted to ``int``.
    * ``text``: the surface consonants converted with ``tr.to_syriac`` and
      wrapped in double quotes.
    * ``is_word``: whether the node is a word.

    ``pos``, ``monad`` and ``text`` are ``None`` for non-word nodes.

    Raises ``KeyError`` if the node type is not in ``type_table`` or if the
    part of speech of a word is not in ``pos_table``.
    """
    otype = F.otype.v(node)
    tag = type_table[otype]
    # Decide word-ness from the table entry, before the phrase tag is
    # replaced: a phrase with an empty phrase_type must not be taken for a word.
    is_word = tag == ''
    if tag == 'P':
        tag = F.phrase_type.v(node)
    if not is_word:
        return (tag, None, None, None, False)
    pos = pos_table[F.psp.v(node)]
    monad = int(F.monads.v(node))
    text = '"{}"'.format(tr.to_syriac(F.surface_consonants.v(node)))
    return (tag, pos, monad, text, True)
def passage_roots(verse_label):
    """Return the root nodes that belong to the verse labelled ``verse_label``.

    The nodes are walked in ``NN()`` order. Collection starts at the first
    verse node whose ``verse_label`` feature equals ``verse_label`` and ends
    at the next verse node with a different label. Nodes of type
    ``root_type`` seen in between are returned as a list, in walk order.
    The list is empty when no verse has the given label.
    """
    sought = []
    in_passage = False
    for n in NN():
        otype = F.otype.v(n)
        if otype == 'verse':
            if F.verse_label.v(n) == verse_label:
                in_passage = True
            elif in_passage:
                # The passage is complete; nothing after it is collected,
                # so there is no need to visit the remaining nodes.
                break
        if in_passage and otype == root_type:
            sought.append(n)
    return sought
def showcases(cases, ofile):
    """Write a report on the trees of the nodes in ``cases`` to ``ofile``.

    ``cases`` maps a node to a description that is printed in the header of
    its report; ``ofile`` is the name passed to ``outfile()``.

    For each node a header is written (verse label, description, identifiers)
    and then, for both tree kinds ``'e'`` and ``'r'``: the words, the tree,
    its depth and the debug rendering of the tree (with a legend for ``'r'``
    only).
    """
    out = outfile(ofile)
    # Close the handle even when writing one of the trees fails.
    try:
        for snode in cases:
            out.write("\n====================\n{}\n{}\n{} bhs_id={} laf_node={}:\n".format(
                root_verse[snode], cases[snode], root_type, F.oid.v(snode), snode,
            ))
            for kind in ('e', 'r'):
                out.write("\nTree based on monad embedding {}\n\n".format(
                    "only" if kind == 'e' else " and mother+clause_constituent relation"
                ))
                (tree_rep, words_rep, bmonad) = tree.write_tree(snode, kind, get_tag, rev=False, leafnumbers=False)
                out.write("{}\n\n{}\n".format(words_rep, tree_rep))
                out.write("\nDepth={}\n".format(tree.depth(snode, kind)))
                out.write(tree.debug_write_tree(snode, kind, legenda=kind=='r'))
    finally:
        out.close()
# Write the 'r' tree of every root node to trees.txt. Each tree is preceded
# by a header line holding the verse label, node, oid, first monad and words.
msg("Writing {} trees".format(root_type))
trees = outfile("trees.txt")
verse_label = ''
s = 0           # number of trees written so far
chunk = 10000   # a progress message is issued after every `chunk` trees
sc = 0          # number of trees written since the last progress message
# Close the file even when writing one of the trees fails.
try:
    for node in NN():
        otype = F.otype.v(node)
        if otype == 'verse':
            # Remember the label for the root nodes that follow.
            verse_label = F.verse_label.v(node)
            continue
        if otype != root_type:
            continue
        # Only root nodes need their oid, so look it up after the type checks.
        oid = F.oid.v(node)
        (tree_rep, words_rep, bmonad) = tree.write_tree(node, 'r', get_tag, rev=False, leafnumbers=False)
        trees.write("\n#{}\tnode={}\toid={}\tbmonad={}\t{}\n{}\n".format(
            verse_label, node, oid, bmonad, words_rep, tree_rep,
        ))
        s += 1
        sc += 1
        if sc == chunk:
            msg("{} trees written".format(s))
            sc = 0
finally:
    trees.close()
msg("{} trees written".format(s))
11s Writing clause_atom trees 18s 10000 trees written 19s 11411 trees written
#1
# Report how many distinct nodes of the root type there are.
msg("Counting {}s ...".format(root_type))
root_nodes = set(NN(test=F.otype.v, value=root_type))
msg("There are {} {}s".format(len(root_nodes), root_type))
23s Counting clause_atoms ... 23s There are 11411 clause_atoms
#2
# Root nodes are expected to have no parent; list every one that does.
msg("Checking parents of {}s".format(root_type))
exceptions = {
    node for node in NN(test=F.otype.v, value=root_type) if node in parent
}
if not exceptions:
    msg("No {} has a parent".format(root_type))
else:
    msg("{} {}s have a parent:".format(len(exceptions), root_type))
    for n in sorted(exceptions):
        p = parent[n]
        # Show type, node and monads for both the root node and its parent.
        report = "{} {} [{}] has {} parent {} [{}]".format(
            root_type, n, F.monads.v(n),
            F.otype.v(p), p, F.monads.v(p)
        )
        msg(report)
25s Checking parents of clause_atoms 25s 13 clause_atoms have a parent: 25s clause_atom 53963 [257] has phrase parent 65481 [257,261,268] 25s clause_atom 54180 [1259-1260] has phrase parent 66141 [1252,1259-1260] 25s clause_atom 61168 [36003-36005] has phrase parent 87851 [35993,35995-36005] 25s clause_atom 61185 [36057-36058] has phrase parent 87888 [36053,36057-36058] 25s clause_atom 61204 [36115-36117] has phrase parent 87929 [36110-36111,36115-36117] 25s clause_atom 61254 [36317-36319] has phrase parent 88069 [36312-36313,36317-36319] 25s clause_atom 61449 [37049] has phrase parent 88611 [37049,37056] 25s clause_atom 63071 [43736] has phrase parent 93449 [43736,43744-43745,43750-43751] 25s clause_atom 64144 [48344] has phrase parent 96589 [48344,48346-48347] 25s clause_atom 64146 [48346-48347] has phrase parent 96589 [48344,48346-48347] 25s clause_atom 64178 [48471-48472] has phrase parent 96675 [48465-48466,48471-48472] 25s clause_atom 64655 [50599] has phrase parent 98073 [50592-50594,50599] 25s clause_atom 64664 [50627-50628] has phrase parent 98093 [50620-50621,50627-50628]
#3 (again a check on #1)
# Collect, per type, the tree nodes that have neither a parent nor an elder
# sister although they are not of the root type.
msg("Checking the types of root nodes ...")
exceptions = collections.defaultdict(list)
sn = 0  # number of root-type nodes seen
for node in NN():
    otype = F.otype.v(node)
    if otype not in type_table:
        continue
    if otype == root_type:
        sn += 1
    elif not (node in parent or node in elder_sister):
        exceptions[otype].append(node)

if exceptions:
    msg("Top nodes which are not {}s:".format(root_type))
    for t in sorted(exceptions):
        msg("{}: {}x".format(t, len(exceptions[t])), withtime=False)
else:
    msg("All top nodes are {}s".format(root_type))
msg("{} {}s seen".format(sn, root_type))

# For exceptional nodes of the clause type, show the root they belong to and its verse.
for c in exceptions[clause_type]:
    (s, st) = tree.get_root(c, 'e')
    v = root_verse[s]
    msg("{}={}, {}={}={}, verse={}".format(clause_type, c, root_type, st, s, v), withtime=False)
29s Checking the types of root nodes ... 29s Top nodes which are not clause_atoms: phrase: 32x 29s 11411 clause_atoms seen
#4, 5
def get_top(kind, rel, rela, multi):
    """Report which node types are reached by following links from ``kind`` nodes.

    Starting from all nodes of type ``kind``, the links in ``rel`` and
    ``rela`` are followed repeatedly. A node that has no entry in either
    table is an end point. For every type found among the end points, a
    message states how many end points of that type were reached.

    ``rel`` and ``rela`` map a node to its linked node(s). ``multi`` tells
    whether the values are collections of nodes (true) or single nodes (false).
    """
    start_nodes = set(NN(test=F.otype.v, value=kind))
    msg("Starting from {} nodes ...".format(kind))

    visited = set()
    end_points = set()
    frontier = start_nodes
    while frontier:
        upcoming = set()
        for node in frontier:
            if node in visited:
                continue
            visited.add(node)
            linking = [table for table in (rel, rela) if node in table]
            if not linking:
                end_points.add(node)
                continue
            for table in linking:
                if multi:
                    upcoming.update(table[node])
                else:
                    upcoming.add(table[node])
        frontier = upcoming

    top_types = collections.Counter(F.otype.v(t) for t in end_points)
    for (otype, amount) in top_types.items():
        msg("From {} {} nodes reached {} {} nodes".format(len(start_nodes), kind, amount, otype), withtime=False)
# Walk the embedding trees upwards from the leaves and downwards from the roots.
msg("Embedding trees")
get_top(kind=leaf_type, rel=tree.eparent, rela={}, multi=False)
get_top(kind=root_type, rel=tree.echildren, rela={}, multi=True)
# The same for the restructured trees, where sister links are followed as well.
msg("Restructd trees")
get_top(kind=leaf_type, rel=tree.rparent, rela=tree.elder_sister, multi=False)
get_top(kind=root_type, rel=tree.rchildren, rela=tree.sisters, multi=True)
msg("Done")
32s Embedding trees 32s Starting from word nodes ... From 53920 word nodes reached 11398 clause_atom nodes From 53920 word nodes reached 32 phrase nodes 33s Starting from clause_atom nodes ... From 11411 clause_atom nodes reached 53864 word nodes 33s Restructd trees 33s Starting from word nodes ... From 53920 word nodes reached 11398 clause_atom nodes From 53920 word nodes reached 32 phrase nodes 33s Starting from clause_atom nodes ... From 11411 clause_atom nodes reached 53864 word nodes 33s Done
#7
# For both tree kinds, count the parent links per (child type, parent type) pair.
msg("Which types embed which types and how often? ...")
for kind in ('e', 'r'):
    # Use a name of its own for the table: the module-level `parent` is read
    # by other checks in this notebook and must not be rebound here.
    parent_links = tree.eparent if kind == 'e' else tree.rparent
    kindrep = 'embedding' if kind == 'e' else 'restructd'
    plinked_types = collections.Counter(
        (F.otype.v(c), F.otype.v(p)) for (c, p) in parent_links.items()
    )
    msg("Found {} parent ({}) links between types".format(len(parent_links), kindrep))
    for lt in sorted(plinked_types):
        msg("{}: {}x".format(lt, plinked_types[lt]), withtime=False)
35s Which types embed which types and how often? ... 35s Found 130181 parent (embedding) links between types ('clause_atom', 'phrase'): 13x ('phrase', 'clause_atom'): 34863x ('phrase_atom', 'clause_atom'): 53x ('phrase_atom', 'phrase'): 41332x ('word', 'clause_atom'): 1x ('word', 'phrase_atom'): 53919x 35s Found 130181 parent (restructd) links between types ('clause_atom', 'phrase'): 13x ('phrase', 'clause_atom'): 34863x ('phrase_atom', 'clause_atom'): 53x ('phrase_atom', 'phrase'): 41332x ('word', 'clause_atom'): 1x ('word', 'phrase_atom'): 53919x
#11
# Compare the depths of the embedding ('e') and restructured ('r') tree of
# every root node. The r-prefixed counters cover only the trees whose two
# depths differ.
msg("Computing depths")
ntrees = 0
rntrees = 0
total_depth = dict.fromkeys(('e', 'r'), 0)
rtotal_depth = dict.fromkeys(('e', 'r'), 0)
max_depth = dict.fromkeys(('e', 'r'), 0)
rmax_depth = dict.fromkeys(('e', 'r'), 0)
for node in NN(test=F.otype.v, value=root_type):
    ntrees += 1
    this_depth = {}
    for kind in ('e', 'r'):
        this_depth[kind] = tree.depth(node, kind)
    different = this_depth['e'] != this_depth['r']
    if different:
        rntrees += 1
    for kind in ('e', 'r'):
        depth = this_depth[kind]
        max_depth[kind] = max(max_depth[kind], depth)
        total_depth[kind] += depth
        if not different:
            continue
        rmax_depth[kind] = max(rmax_depth[kind], depth)
        rtotal_depth[kind] += depth

msg("{} trees seen, of which in {} cases restructuring makes a difference in depth".format(ntrees, rntrees))
# Averages are reported only when there is something to divide by.
if ntrees > 0:
    msg("Embedding trees: max depth = {:>2}, average depth = {:.2g}".format(max_depth['e'], total_depth['e'] / ntrees))
    msg("Restructd trees: max depth = {:>2}, average depth = {:.2g}".format(max_depth['r'], total_depth['r'] / ntrees))
if rntrees > 0:
    msg("Statistics for cases where restructuring makes a difference:")
    msg("Embedding trees: max depth = {:>2}, average depth = {:.2g}".format(rmax_depth['e'], rtotal_depth['e'] / rntrees))
    msg("Restructd trees: max depth = {:>2}, average depth = {:.2g}".format(rmax_depth['r'], rtotal_depth['r'] / rntrees))
37s Computing depths 38s 11411 trees seen, of which in 0 cases restructuring makes a difference in depth 38s Embedding trees: max depth = 3, average depth = 3 38s Restructd trees: max depth = 3, average depth = 3
# Finish the task. The output of this call lists the results directory and the files in it.
# NOTE(review): presumably this also closes any handle opened with outfile() -- confirm against the LAF-Fabric API docs.
close()
40s Results directory: /Users/dirk/laf-fabric-output/calap/trees __log__trees.txt 3475 Tue Jul 15 18:05:49 2014 trees.txt 1624637 Tue Jul 15 18:05:27 2014
Here are the first lines of the output.
!head -n 25 {my_file('trees.txt')}
#1R 1,1 node=53920 oid=2856 bmonad=1 0 1 2 3 (S(CP(U(cj "ܘ")))(NP(U(n "ܡܠܟܐ"))(U(n "ܕܘܝܕ")))(VP(U(vb "ܣܐܒ")))) #1R 1,1 node=53921 oid=2857 bmonad=5 0 1 2 3 (S(CP(U(cj "ܘ")))(VP(U(vb "ܥܠ")))(PP(U(pp "ܒ")(n "ܫܢܝܐ")))) #1R 1,1 node=53922 oid=2858 bmonad=9 0 1 2 3 4 5 (S(CP(U(cj "ܘ")))(VP(U(vb "ܡܟܣܝܢ")))(VP(U(vb "ܗܘܘ")))(PP(U(pp "ܠܗ")))(PP(U(pp "ܒ")(n "ܠܒܘܫܐ")))) #1R 1,1 node=53923 oid=2859 bmonad=15 0 1 2 (S(CP(U(cj "ܘ")))(NegP(U(ng "ܠܐ")))(VP(U(vb "ܫܚܢ")))) #1R 1,2 node=53924 oid=2860 bmonad=18 0 1 2 3 (S(CP(U(cj "ܘ")))(VP(U(vb "ܐܡܪܘ")))(PP(U(pp "ܠܗ")))(NP(U(n "ܥܒܕܘܗܝ")))) #1R 1,2 node=53925 oid=2861 bmonad=22 0 1 2 3 4 5 6 7 8 (S(InjP(U(ij "ܗܐ")))(NP(U(n "ܥܒܕܝܟ"))(U(pp "ܩܕܡܝܟ")))(VP(U(vb "ܢܒܥܘܢ")))(PP(U(pp "ܠ")(n "ܡܪܢ"))(U(n "ܡܠܟܐ")))(NP(U(aj "ܥܠܝܡܬܐ"))(U(aj "ܒܬܘܠܬܐ")))) #1R 1,2 node=53926 oid=2862 bmonad=31 0 1 2 3 (S(CP(U(cj "ܘ")))(VP(U(vb "ܬܩܘܡ")))(PP(U(pp "ܩܕܡ")(n "ܡܠܟܐ")))) #1R 1,2 node=53927 oid=2863 bmonad=35 0 1 2 3 (S(CP(U(cj "ܘ")))(VP(U(vb "ܬܗܘܐ")))(PP(U(pp "ܠܗ")))(NP(U(aj "ܡܫܡܫܢܝܬܐ"))))