# Notebook script: build trees over the 'calap' corpus with LAF-Fabric and the
# ETCBC ``Tree`` helper, write them to ``trees.txt``, and run a series of
# sanity checks (#1 .. #11) on the resulting parent/child/sister relations.
#
# The IPython magics of the notebook are spelled as ``get_ipython()`` calls so
# that this file is syntactically valid Python; it still has to be run inside
# IPython, which is what provides ``get_ipython``.

import sys
import collections
import random

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
from etcbc.lib import Transcription, monad_set
from etcbc.trees import Tree

fabric = LafFabric()
tr = Transcription()

API = fabric.load('calap', '--', 'trees', {
    "xmlids": {"node": False, "edge": False},
    "features": (''' oid otype monads surface_consonants psp phrase_type verse_label ''', ''),
    "prepare": prepare,
}, verbose='NORMAL')
# Injects the LAF-Fabric API names used below (F, NN, msg, outfile, my_file,
# close, ...) into the notebook namespace. The executed text comes from the
# laf package itself, not from external input.
exec(fabric.localnames.format(var='fabric'))

# (object type, tree tag) pairs. The empty tag marks words; the 'P' tag is
# replaced by the node's phrase_type in get_tag().
type_info = (
    ("word", ''),
    ("phrase_atom", 'U'),
    ("phrase", 'P'),
    ("clause_atom", 'S'),
)
type_table = dict(type_info)
type_order = [t[0] for t in type_info]

# Abbreviations for the values of the psp (part of speech) feature.
pos_table = {
    'adjective': 'aj',
    'adverb': 'av',
    'conjunction': 'cj',
    'interjection': 'ij',
    'interrogative': 'ir',
    'negative': 'ng',
    'noun': 'n',
    'preposition': 'pp',
    'pronoun': 'pr',
    'verb': 'vb',
}

tree_types = ('clause_atom', 'phrase', 'phrase_atom', 'word')
(root_type, leaf_type, clause_type) = (tree_types[0], tree_types[-1], 'clause_atom')

tree = Tree(
    API,
    otypes=tree_types,
    clause_type=None,
    ccr_feature=None,
    pt_feature='phrase_type',
    pos_feature='psp',
    mother_feature=None,
)
tree.restructure_clauses(None)
results = tree.relations()
parent = results['rparent']
sisters = results['sisters']
children = results['rchildren']
elder_sister = results['elder_sister']

# Map every root node to the label of the most recently seen verse node.
# cur_verse starts as '' (like verse_label in the tree-writing loop below) so
# that a root node preceding the first verse node cannot raise a NameError.
root_verse = {}
cur_verse = ''
for n in NN():
    otype = F.otype.v(n)
    if otype == 'verse':
        cur_verse = F.verse_label.v(n)
    elif otype == root_type:
        root_verse[n] = cur_verse
msg("Ready for processing")


def get_tag(node):
    """Return the tuple ``(tag, pos, monad, text, is_word)`` for *node*.

    This function is passed to ``tree.write_tree`` as its tag callback.

    ``tag`` is looked up in ``type_table`` by the node's otype; for phrases
    (tag ``'P'``) it is replaced by the node's ``phrase_type`` feature. A node
    counts as a word when the resulting tag is the empty string. For words,
    ``pos`` is the abbreviated part of speech, ``monad`` the integer value of
    the ``monads`` feature and ``text`` the double-quoted Syriac rendering of
    ``surface_consonants``; for all other nodes these three are ``None``.

    Raises ``KeyError`` if the otype is not in ``type_table`` or, for words,
    if the part of speech is not in ``pos_table``.
    """
    tag = type_table[F.otype.v(node)]
    if tag == 'P':
        tag = F.phrase_type.v(node)
    is_word = tag == ''
    if not is_word:
        return (tag, None, None, None, is_word)
    pos = pos_table[F.psp.v(node)]
    monad = int(F.monads.v(node))
    text = '"{}"'.format(tr.to_syriac(F.surface_consonants.v(node)))
    return (tag, pos, monad, text, is_word)


def passage_roots(verse_label):
    """Return the list of ``root_type`` nodes inside the verse *verse_label*.

    Nodes are visited in ``NN()`` order. Collecting starts at the first verse
    node whose label equals *verse_label* and stops at the first later verse
    node with a different label. An empty list is returned when no verse
    matches.
    """
    sought = []
    in_passage = False
    for n in NN():
        otype = F.otype.v(n)
        if otype == 'verse':
            if F.verse_label.v(n) == verse_label:
                in_passage = True
            elif in_passage:
                # Left the passage: nothing further can match, stop scanning.
                break
        elif in_passage and otype == root_type:
            sought.append(n)
    return sought


def showcases(cases, ofile):
    """Write a detailed report for every root node in *cases* to *ofile*.

    *cases* maps root nodes to a description that is printed in the header of
    each entry; *ofile* is the name handed to ``outfile``. For each node both
    the embedding tree (``'e'``) and the restructured tree (``'r'``) are
    written, followed by the tree depth and the ``debug_write_tree`` output.
    The output handle is closed even when writing fails halfway.
    """
    out = outfile(ofile)
    try:
        for snode in cases:
            out.write("\n====================\n{}\n{}\n{} bhs_id={} laf_node={}:\n".format(
                root_verse[snode], cases[snode], root_type, F.oid.v(snode), snode,
            ))
            for kind in ('e', 'r'):
                out.write("\nTree based on monad embedding {}\n\n".format(
                    "only" if kind == 'e' else " and mother+clause_constituent relation"
                ))
                (tree_rep, words_rep, bmonad) = tree.write_tree(
                    snode, kind, get_tag, rev=False, leafnumbers=False,
                )
                out.write("{}\n\n{}\n".format(words_rep, tree_rep))
                out.write("\nDepth={}\n".format(tree.depth(snode, kind)))
                out.write(tree.debug_write_tree(snode, kind, legenda=kind == 'r'))
    finally:
        out.close()


# Write the restructured tree of every root node to trees.txt, one header line
# (verse label, node, oid, first monad, words) followed by the tree itself.
msg("Writing {} trees".format(root_type))
trees = outfile("trees.txt")
verse_label = ''
s = 0          # total number of trees written
chunk = 10000  # report progress every `chunk` trees
sc = 0         # trees written since the last progress message
try:
    for node in NN():
        otype = F.otype.v(node)
        if otype == 'verse':
            verse_label = F.verse_label.v(node)
            continue
        if otype != root_type:
            continue
        oid = F.oid.v(node)
        (tree_rep, words_rep, bmonad) = tree.write_tree(
            node, 'r', get_tag, rev=False, leafnumbers=False,
        )
        trees.write("\n#{}\tnode={}\toid={}\tbmonad={}\t{}\n{}\n".format(
            verse_label, node, oid, bmonad, words_rep, tree_rep,
        ))
        s += 1
        sc += 1
        if sc == chunk:
            msg("{} trees written".format(s))
            sc = 0
finally:
    # Close the handle even when a tree cannot be written.
    trees.close()
msg("{} trees written".format(s))

#1 count the root nodes
msg("Counting {}s ...".format(root_type))
msg("There are {} {}s".format(len(set(NN(test=F.otype.v, value=root_type))), root_type))

#2 root nodes must not have a parent
msg("Checking parents of {}s".format(root_type))
exceptions = {node for node in NN(test=F.otype.v, value=root_type) if node in parent}
if not exceptions:
    msg("No {} has a parent".format(root_type))
else:
    msg("{} {}s have a parent:".format(len(exceptions), root_type))
    for n in sorted(exceptions):
        p = parent[n]
        msg("{} {} [{}] has {} parent {} [{}]".format(
            root_type, n, F.monads.v(n), F.otype.v(p), p, F.monads.v(p)
        ))

#3 (again a check on #1) every node without parent and elder sister must be a root
msg("Checking the types of root nodes ...")
exceptions = collections.defaultdict(list)
sn = 0
for node in NN():
    otype = F.otype.v(node)
    if otype not in type_table:
        continue
    if otype == root_type:
        sn += 1
    if node not in parent and node not in elder_sister and otype != root_type:
        exceptions[otype].append(node)
if not exceptions:
    msg("All top nodes are {}s".format(root_type))
else:
    msg("Top nodes which are not {}s:".format(root_type))
    for t in sorted(exceptions):
        msg("{}: {}x".format(t, len(exceptions[t])), withtime=False)
msg("{} {}s seen".format(sn, root_type))

# .get() instead of indexing, so the defaultdict does not grow an empty entry;
# separate names for the unpacked pair, so the tree counter `s` is not clobbered.
for c in exceptions.get(clause_type, ()):
    (root, root_kind) = tree.get_root(c, 'e')
    v = root_verse[root]
    msg("{}={}, {}={}={}, verse={}".format(clause_type, c, root_type, root_kind, root, v), withtime=False)


#4, 5
def get_top(kind, rel, rela, multi):
    """Follow *rel* and *rela* from all *kind* nodes and report the end points.

    Starting from every node of object type *kind*, the mappings *rel* and
    *rela* are followed repeatedly. A node that occurs as key in neither
    mapping is a top node. With *multi* true the mapping values are iterables
    of nodes (e.g. children), otherwise single nodes (e.g. a parent). For each
    object type found among the top nodes one message with the count is
    emitted through ``msg``. Nothing is returned.
    """
    seen = set()
    top_nodes = set()
    start_nodes = set(NN(test=F.otype.v, value=kind))
    frontier = start_nodes
    msg("Starting from {} nodes ...".format(kind))
    while frontier:
        next_frontier = set()
        for node in frontier:
            if node in seen:
                continue
            seen.add(node)
            is_top = True
            for relation in (rel, rela):
                if node not in relation:
                    continue
                is_top = False
                if multi:
                    next_frontier.update(relation[node])
                else:
                    next_frontier.add(relation[node])
            if is_top:
                top_nodes.add(node)
        frontier = next_frontier
    top_types = collections.Counter(F.otype.v(t) for t in top_nodes)
    for (top_type, amount) in top_types.items():
        msg("From {} {} nodes reached {} {} nodes".format(
            len(start_nodes), kind, amount, top_type,
        ), withtime=False)


msg("Embedding trees")
get_top(leaf_type, tree.eparent, {}, False)
get_top(root_type, tree.echildren, {}, True)
msg("Restructd trees")
get_top(leaf_type, tree.rparent, tree.elder_sister, False)
get_top(root_type, tree.rchildren, tree.sisters, True)
msg("Done")

#7 count parent links per (child type, parent type) pair
msg("Which types embed which types and how often? ...")
for kind in ('e', 'r'):
    # Use a separate name: rebinding `parent` here would replace the
    # module-level relation the checks above rely on.
    parent_links = tree.eparent if kind == 'e' else tree.rparent
    kindrep = 'embedding' if kind == 'e' else 'restructd'
    plinked_types = collections.Counter(
        (F.otype.v(c), F.otype.v(p)) for (c, p) in parent_links.items()
    )
    msg("Found {} parent ({}) links between types".format(len(parent_links), kindrep))
    for lt in sorted(plinked_types):
        msg("{}: {}x".format(lt, plinked_types[lt]), withtime=False)

#11 depth statistics, overall and for the trees whose depth changes by restructuring
msg("Computing depths")
ntrees = 0
rntrees = 0
total_depth = {'e': 0, 'r': 0}
rtotal_depth = {'e': 0, 'r': 0}
max_depth = {'e': 0, 'r': 0}
rmax_depth = {'e': 0, 'r': 0}
for node in NN(test=F.otype.v, value=root_type):
    ntrees += 1
    this_depth = {kind: tree.depth(node, kind) for kind in ('e', 'r')}
    different = this_depth['e'] != this_depth['r']
    if different:
        rntrees += 1
    for kind in ('e', 'r'):
        max_depth[kind] = max(max_depth[kind], this_depth[kind])
        total_depth[kind] += this_depth[kind]
        if different:
            rmax_depth[kind] = max(rmax_depth[kind], this_depth[kind])
            rtotal_depth[kind] += this_depth[kind]
msg("{} trees seen, of which in {} cases restructuring makes a difference in depth".format(ntrees, rntrees))
if ntrees > 0:
    msg("Embedding trees: max depth = {:>2}, average depth = {:.2g}".format(max_depth['e'], total_depth['e'] / ntrees))
    msg("Restructd trees: max depth = {:>2}, average depth = {:.2g}".format(max_depth['r'], total_depth['r'] / ntrees))
if rntrees > 0:
    msg("Statistics for cases where restructuring makes a difference:")
    msg("Embedding trees: max depth = {:>2}, average depth = {:.2g}".format(rmax_depth['e'], rtotal_depth['e'] / rntrees))
    msg("Restructd trees: max depth = {:>2}, average depth = {:.2g}".format(rmax_depth['r'], rtotal_depth['r'] / rntrees))

close()

# Show the first lines of the generated file (notebook: !head -n 25 ...).
# IPython expands the {…} expression before running the shell command.
get_ipython().system("head -n 25 {my_file('trees.txt')}")