Joint work of Kenneth Bergland, Martijn Naaijer and Dirk Roorda.
We try to find rare combinations of words and see how they occur in Thora books and in the Prophets.
import sys,os
import collections, difflib
from IPython.display import HTML, display_pretty, display_html
from difflib import SequenceMatcher
import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
fabric = LafFabric()
0.00s This is LAF-Fabric 4.5.5 API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html
source = 'etcbc'
version = '4b'
API = fabric.load(source+version, 'lexicon', 'rare_lexemes', {
"xmlids": {"node": False, "edge": False},
"features": ('''
otype
language lex g_cons g_word_utf8 trailer_utf8 phono phono_sep
sp
book chapter verse label number
''',''),
"prepare": prepare,
"primary": False,
}, verbose='NORMAL')
exec(fabric.localnames.format(var='fabric'))
0.00s LOADING API: please wait ... 0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56 0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21 4.17s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/rare_lexemes/__log__rare_lexemes.txt 13s ETCBC reference: http://laf-fabric.readthedocs.org/en/latest/texts/ETCBC-reference.html 0.00s LOADING API with EXTRAs: please wait ... 0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56 0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21 0.00s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK rare_lexemes AT 2015-12-17T10-55-18 0.00s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK rare_lexemes AT 2015-12-17T10-55-18
book_classes = dict(
legal = set('''
Exodus Leviticus Deuteronomium
'''.strip().split()),
prophets = set('''
Jesaia Jeremia
Ezechiel Hosea
Joel Amos Obadia Jona Micha Nahum Habakuk Zephania Haggai Sacharia Maleachi
'''.strip().split())
)
book_classes_index = {}
for (cl, books) in book_classes.items():
for book in books:
book_classes_index[book] = cl
It is a bit sorry to have all those bigrams involving an article, for example. So, as an option, we leave out the non content words, being words with a certain part-of-speech.
X art article
verb verb
subs noun
nmpr proper noun
advb adverb
X prep preposition
X conj conjunction
X prps personal pronoun
X prde demonstrative pronoun
X prin interrogative pronoun
X intj interjection
X nega negative particle
X inrg interrogative particle
adjv adjective
ONLY_SP = set('''
verb subs nmpr advb adjv
'''.strip().split())
grams = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
ng = 4 # the max number of members in the n-gram
msg('Collecting all <= {}-grams'.format(ng))
for c in F.otype.s('clause'):
bk = F.book.v(L.u('book', c))
words = list(L.d('word', c))
lenw = len(words)
for n in range(1, ng + 1):
for i in range(lenw - n):
this_gram = '-'.join(F.lex.v(w) if F.sp.v(w) in ONLY_SP else '*' for w in (words[j] for j in range(i, n)))
grams[n][bk][this_gram] += 1
msg('Done')
11s Collecting all <= 4-grams 16s Done
Now compute the counts for the indicated groups of books.
freqs_by_g = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
freqs_by_c = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
msg('Counting grams')
for n in grams:
ngrams = grams[n]
for bk in ngrams:
cl = book_classes_index.get(bk, 'bible')
these_grams = ngrams[bk]
for g in these_grams:
freqs_by_g[n][g][cl] += these_grams[g]
freqs_by_c[n][cl][g] += these_grams[g]
msg('Done')
18s Counting grams 19s Done
We set a threshold below which grams are considered rare and another threshold above which grams are considered abundant.
Then we list all grams that are abundant in one collection of books but rare in the other one.
RARE = 1
class_a = 'legal'
class_b = 'prophets'
results = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.Counter()))
msg('Filtering')
for RARE in range(1, 10):
for n in sorted(freqs_by_g):
nfreqs_by_g = freqs_by_g[n]
for g in nfreqs_by_g:
g_info = nfreqs_by_g[g]
nc = dict((cl, g_info.get(cl, 0)) for cl in book_classes)
if nc[class_a] <= RARE and nc[class_b] <= RARE: results[RARE][n]['rr'] += 1
elif nc[class_a] <= RARE and nc[class_b] > RARE: results[RARE][n]['ra'] += 1
elif nc[class_a] > RARE and nc[class_b] <= RARE: results[RARE][n]['ar'] += 1
elif nc[class_a] > RARE and nc[class_b] > RARE: results[RARE][n]['aa'] += 1
else: results[RARE][n]['xx'] += 1
msg('Done')
for RARE in sorted(results):
print('RARE means <= {}'.format(RARE))
for n in range(1, ng + 1):
print('\t{}-grams: rr ra ar aa = {}'.format(
n,
' '.join(
'{:>5}'.format(results[RARE].get(n, {}).get(y, 0)) for y in ('rr', 'ra', 'ar', 'aa')
),
))
21s Filtering 24s Done
RARE means <= 1 1-grams: rr ra ar aa = 1866 423 88 141 2-grams: rr ra ar aa = 10124 1276 430 700 3-grams: rr ra ar aa = 29657 1933 1116 723 4-grams: rr ra ar aa = 42812 1541 1393 458 RARE means <= 2 1-grams: rr ra ar aa = 2120 262 43 93 2-grams: rr ra ar aa = 10997 808 241 484 3-grams: rr ra ar aa = 31502 927 535 465 4-grams: rr ra ar aa = 44768 603 583 250 RARE means <= 3 1-grams: rr ra ar aa = 2232 181 28 77 2-grams: rr ra ar aa = 11403 566 178 383 3-grams: rr ra ar aa = 32193 555 346 335 4-grams: rr ra ar aa = 45384 309 351 160 RARE means <= 4 1-grams: rr ra ar aa = 2307 124 24 63 2-grams: rr ra ar aa = 11643 433 135 319 3-grams: rr ra ar aa = 32525 383 262 259 4-grams: rr ra ar aa = 45616 215 246 127 RARE means <= 5 1-grams: rr ra ar aa = 2352 96 18 52 2-grams: rr ra ar aa = 11803 334 115 278 3-grams: rr ra ar aa = 32721 295 203 210 4-grams: rr ra ar aa = 45781 153 173 97 RARE means <= 6 1-grams: rr ra ar aa = 2375 81 20 42 2-grams: rr ra ar aa = 11903 283 97 247 3-grams: rr ra ar aa = 32855 245 152 177 4-grams: rr ra ar aa = 45879 123 116 86 RARE means <= 7 1-grams: rr ra ar aa = 2386 77 21 34 2-grams: rr ra ar aa = 11974 261 82 213 3-grams: rr ra ar aa = 32950 203 121 155 4-grams: rr ra ar aa = 45929 101 101 73 RARE means <= 8 1-grams: rr ra ar aa = 2407 64 19 28 2-grams: rr ra ar aa = 12029 230 82 189 3-grams: rr ra ar aa = 33018 168 107 136 4-grams: rr ra ar aa = 45980 76 82 66 RARE means <= 9 1-grams: rr ra ar aa = 2423 52 17 26 2-grams: rr ra ar aa = 12076 209 76 169 3-grams: rr ra ar aa = 33067 147 92 123 4-grams: rr ra ar aa = 46016 58 67 63
msg("Making a mapping between a passage specification and a verse node")
passage2vnode = {}
for vs in F.otype.s('verse'):
lab = (F.book.v(vs), F.chapter.v(vs), F.verse.v(vs))
passage2vnode[lab] = vs
msg("{} verses".format(len(passage2vnode)))
21m 38s Making a mapping between a passage specification and a verse node 21m 39s 23213 verses
Look for update lexemes, where in parallel phrases one lexeme is different, for example measure nouns. See for example Ezek 45:10 - Lev 19:36 - Deut 25:15.
This is a case of parallel language use, not strictly a parallel passage.
We are going to list the lexemes that make a difference between parallel passages. For every set of parallel passages we list the lexemes that do not occur in their intersection.
We print a list of all groups of parallel verses, where the lexemes that do not occur in all members of the group are highlighted.
Every lexeme that occurs in some parallel passage, but not in the intersection of the lexemes of its parallel passages will be listed in a dictionary, keyed by that lexeme, and then by the group number, and then it has the following information: passages in which it does occur, and passages in which it does not occur.
CROSSREF_TOOL = 'parallel'
CROSSREF_DB_FILE = 'crossrefdb.csv'
SHEBANQ_PATH = os.path.abspath('{}/../../../shebanq'.format(os.getcwd))
CROSSREF_DB_DIR = '{}/static/docs/tools/{}/files'.format(SHEBANQ_PATH, CROSSREF_TOOL)
CROSSREF_DB_PATH = '{}/{}'.format(CROSSREF_DB_DIR, CROSSREF_DB_FILE)
msg(CROSSREF_DB_PATH)
PRETTY_PAIRS = 'pairs.html'
DIFF_LEX = 'difflex.csv'
21m 43s /Users/dirk/SURFdrive/current/demos/github/shebanq/static/docs/tools/parallel/files/crossrefdb.csv
loc_tpl = 'https://rawgit.com/etcbc/laf-fabric-nbs/master/lingvar/{}'
HTML('''
<a target="_blank" href="{}">parallel_pairs</a>
<a target="_blank" href="{}">difference lexemes</a>
'''.format(loc_tpl.format(PRETTY_PAIRS), loc_tpl.format(DIFF_LEX)))
msg('Reading crossrefs database')
n = 0
parallel_pairs_proto = collections.defaultdict(lambda: set())
group_index = {}
group_number = 0
with open(CROSSREF_DB_PATH) as f:
for line in f:
n += 1
if n == 1: continue
(bkx, chx, vsx, bky, chy, vsy, rd) = line.rstrip('\n').split('\t')
vx = passage2vnode[(bkx, chx, vsx)]
vy = passage2vnode[(bky, chy, vsy)]
gx = group_index.get(vx, None)
gy = group_index.get(vy, None)
if gx == None and gy == None:
group_number += 1
group_index[vx] = group_number
group_index[vy] = group_number
elif gx == None:
group_index[vx] = gy
elif gy == None:
group_index[vy] = gx
elif gx != gy:
update = [x for x in group_index if group_index[x] == gx]
for x in update: group_index[x] = gy
msg('{} entries read'.format(n))
for (x, n) in group_index.items(): parallel_pairs_proto[n].add(x)
parallel_pairs = [sorted(parallel_pairs_proto[n]) for n in parallel_pairs_proto]
parallel_pairs_proto = None
msg('Gathered {} sets of parallel verses'.format(len(parallel_pairs)))
21m 52s Reading crossrefs database 21m 52s 14354 entries read 21m 52s Gathered 1235 sets of parallel verses
msg('Building a difference lexeme table')
intersections = []
unions = []
for p in parallel_pairs:
clex = None
ulex = collections.defaultdict(lambda: set())
for x in p:
xlex = {F.lex.v(w) for w in L.d('word', x)}
clex = xlex if clex == None else clex & xlex
for lex in xlex:
ulex[lex].add(x)
intersections.append(clex)
unions.append(ulex)
msg('{} groups in table'.format(len(intersections)))
21m 59s Building a difference lexeme table 22m 00s 1235 groups in table
css = '''
<style type="text/css">
table.t {
width: 100%;
border-collapse: collapse;
direction: rtl;
border: 2px solid #aaaaaa;
}
td.t {
border: 2px solid #aaaaaa;
font-family: Ezra SIL, SBL Hebrew, Verdana, sans-serif;
font-size: x-large;
line-height: 1.7;
text-align: right;
direction: rtl;
}
td.vl {
font-family: Verdana, Arial, sans-serif;
font-size: small;
text-align: right;
color: #aaaaaa;
width: 10%;
direction: ltr;
border-left: 2px solid #aaaaaa;
border-right: 2px solid #aaaaaa;
}
span.m {
background-color: #aaaaff;
}
span.f {
background-color: #ffaaaa;
}
span.x {
background-color: #ffffaa;
color: #bb0000;
}
span.delete {
background-color: #ffaaaa;
}
span.insert {
background-color: #aaffaa;
}
span.replace {
background-color: #ffff00;
}
</style>
'''
html_file_tpl = '''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>{}</title>
{}
</head>
<body>
<h1>Table of groups</h1>
{}
<h1>Groups</h1>
{}
</body>
</html>'''
# we want to sort passages in such a way that the verses in 1 Kings 19-26 are put before anything else
def print_chunk(i, clex, members):
result = ['''
<a name="c_{i}">Group {i}</a>
<table class="t">
'''.format(i=i)]
for v in members:
lab = '{} {}:{}'.format(F.book.v(v), F.chapter.v(v), F.verse.v(v))
text = ''.join(
(F.g_word_utf8.v(w)\
if F.lex.v(w) in clex \
else '<span class="replace">{}</span>'.format(F.g_word_utf8.v(w))
)+F.trailer_utf8.v(w) for w in L.d('word', v)
)
result.append('''
<tr class="t"><td class="vl">{rb}</td><td class="t">{rl}</td></tr>
'''.format(
rb=lab,
rl=text,
))
result.append('''</table>
''')
return ''.join(result)
def index_chunk(i, clex, members):
verse_labels = ', '.join('{} {}:{}'.format(F.book.v(v), F.chapter.v(v), F.verse.v(v)) for v in members)
return '<p><b>{i}</b> <a href="#c_{i}">{vl}</a></p>\n'.format(
vl = verse_labels, i=i,
)
msg('Pretty printing the table')
allgeni_html = []
allgenh_html = []
for (i, p) in enumerate(parallel_pairs):
clex = intersections[i]
allgeni_html.append(index_chunk(i, clex, p))
allgenh_html.append(print_chunk(i, clex, p))
outf = open('pairs.html', 'w')
outf.write(html_file_tpl.format(
'Pairs',
css,
''.join(allgeni_html),
''.join(allgenh_html),
))
outf.close()
msg('Done')
22m 21s Pretty printing the table 22m 21s Done
diff_pairs = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
for (i, p) in enumerate(parallel_pairs):
clex = intersections[i]
ulex = unions[i]
for lex in ulex:
if lex in clex: continue
diff_pairs[lex][i]['has'] = ulex[lex]
diff_pairs[lex][i]['hasnot'] = set(p) - ulex[lex]
outf = open('difflex.csv', 'w')
outf.write('lexeme\tgroup\tsize\thas?\tbook\tchapter\tverse\n')
for lex in sorted(diff_pairs):
for i in sorted(diff_pairs[lex]):
lnp = len(diff_pairs[lex][i]['has']) + len(diff_pairs[lex][i]['hasnot'])
for v in sorted(diff_pairs[lex][i]['has']):
outf.write('{}\t{}\t{}\t+\t{}\t{}\t{}\n'.format(
lex, i, lnp, F.book.v(v), F.chapter.v(v), F.verse.v(v),
))
for v in sorted(diff_pairs[lex][i]['hasnot']):
outf.write('{}\t{}\t{}\t-\t{}\t{}\t{}\n'.format(
lex, i, lnp, F.book.v(v), F.chapter.v(v), F.verse.v(v),
))
outf.close()
The result is a tab-separated file with fields:
lexeme
group number
has?
book
chapter
verse
We list the lexemes that occur in at least one group of parallel passages where the lexeme in question does not occur in all members of the group. For all such lexemes and for all such groups and for all members of such groups we make an entry. The entry has + in field has? if the lexeme occurs in that passage, else it has -.
See below for the lines corresponding to the first 4 lexemes.