Various ways to list the lexeme base of individual chapters in the Hebrew Bible.
The lexeme base of a passage is the set of lexemes that occurs in that passage.
We define a function, lexbase(passages, excluded=xpassages), that produces a file of the lexemes that occur in a given list of passages and do not occur in another given list of passages.
If you have LAF-Fabric working and downloaded this notebook, you can call this function yourself in order to generate lexeme bases of arbitrary passages.
We also produce standard files with the lexeme bases of individual books, chapters and verses in the Bible.
The output files are organized as follows:
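Each line is a comma-separated record. After any passage columns (book, chapter, verse), the fields are:
the lex feature (the lexeme in transliteration, including any trailing / [ = characters),
the frequency of the lexeme in the whole text,
the lex_utf8 feature (the lexeme in Hebrew as it occurs in the ETCBC text database),
the g_entry_heb feature (the vocalized lexeme as it is listed in the ETCBC lexicon),
the sp feature (part of speech),
the gloss feature.
import sys, collections, re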
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
# Create the LAF-Fabric processor object; the data is loaded through it below.
fabric = LafFabric()
0.00s This is LAF-Fabric 4.5.0 API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html
# Version of the ETCBC data to work with; it is appended to the source name.
version = '4b'

# Load specification handed to LAF-Fabric: no XML ids, the node features
# listed below (and no edge features), the ETCBC preparation steps, and no
# primary data.
load_spec = {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
otype
lex lex_utf8 g_entry_heb
sp gloss
book chapter verse
''',''),
    "prepare": prepare,
    "primary": False,
}
source_name = 'etcbc{}'.format(version)
fabric.load(source_name, 'lexicon', 'lexemes', load_spec)

# Execute the code template provided by LAF-Fabric.
# NOTE(review): presumably this is what defines the API names used further on
# (F, L, outfile, my_file) -- confirm against the LAF-Fabric API reference.
exec(fabric.localnames.format(var='fabric'))
0.00s LOADING API: please wait ... 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T13-46-20 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T14-07-34 3.67s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/__log__lexemes.txt 14s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK lexemes AT 2015-05-27T15-43-40
import os

# Locations of the two output directories, resolved through my_file().
csvdir = my_file('csv')
passagedir = my_file('passage')

# Create the directories from Python instead of through the `%mkdir -p` shell
# magic: this also works outside IPython, on systems without a POSIX `mkdir`,
# and for paths that contain spaces. Existing directories are left alone.
os.makedirs(csvdir, exist_ok=True)
os.makedirs(passagedir, exist_ok=True)
# Pattern for one passage specification: a book name, an optional chapter
# range list, an optional colon and an optional verse range list, e.g.
# "Genesis", "Genesis 3-4,10" or "Numeri 1-3:10-15".
# A raw string is used so that `\s` is passed to the regex engine untouched
# instead of being an invalid string escape sequence.
passage_pat = re.compile(r'^\s*([A-Za-z0-9_]+)\s*([0-9,-]*)\s*:?\s*([0-9,-]*)\s*$')
# Lexical information per lexeme, keyed by the value of the lex feature:
# the tuple (lex_utf8, g_entry_heb, sp, gloss) of the first word encountered.
lex_info = {}
# Occurrence counters nested as book -> chapter -> verse -> Counter of lexemes.
lex_section = {}
# Number of occurrences of every lexeme over all verses visited below.
lex_count = collections.Counter()

for v in F.otype.s('verse'):
    # The section labels are looked up once per verse.
    bk = F.book.v(L.u('book', v))
    ch = F.chapter.v(L.u('chapter', v))
    vs = F.verse.v(v)
    for w in L.d('word', v):
        lex = F.lex.v(w)
        if lex not in lex_info:
            lex_info[lex] = tuple(
                feature.v(w)
                for feature in (F.lex_utf8, F.g_entry_heb, F.sp, F.gloss)
            )
        # Walk down to the counter of this verse, creating levels on demand.
        chapters_of_book = lex_section.setdefault(bk, {})
        verses_of_chapter = chapters_of_book.setdefault(ch, {})
        if vs not in verses_of_chapter:
            verses_of_chapter[vs] = collections.Counter()
        verses_of_chapter[vs][lex] += 1
        lex_count[lex] += 1
def verse_index():
    """Build a lookup table from section labels to nodes.

    Every node yielded by ``F.verse.s()`` is stored under the book label,
    chapter label and verse label that the features give for it.

    Returns:
        A nested dict ``{book: {chapter: {verse: node}}}``.
    """
    index = {}
    for node in F.verse.s():
        book_label = F.book.v(L.u('book', node))
        chapter_label = F.chapter.v(L.u('chapter', node))
        verse_label = F.verse.v(node)
        chapters_of_book = index.setdefault(book_label, {})
        verses_of_chapter = chapters_of_book.setdefault(chapter_label, {})
        verses_of_chapter[verse_label] = node
    return index


# The index is built once and shared by the passage parsing functions.
vindex = verse_index()
def parse_passages(passages):
    """Collect the lexemes of a ``|``-separated list of passages.

    Args:
        passages: one or more passage specifications separated by ``|``;
            surrounding whitespace of the whole string and of every
            individual specification is ignored.

    Returns:
        The union of the sets that ``parse_passage`` returns for the
        individual specifications.
    """
    specs = (piece.strip() for piece in passages.strip().split('|'))
    return set().union(*(parse_passage(spec) for spec in specs))
def parse_ranges(rangespec, kind, passage, source, subsources=None):
    """Expand a list of numbers and ranges into a set of number strings.

    Args:
        rangespec: comma-separated items, each a single number (``7``) or
            an inclusive range (``3-5``). The empty string means
            "everything that is available".
        kind: word used in messages for the thing being numbered,
            e.g. ``'chapter'`` or ``'verse'``.
        passage: the full passage specification, used in messages only.
        source: mapping whose keys are number strings. When ``subsources``
            is given, its values must be mappings keyed by number strings
            as well.
        subsources: optional collection of keys of ``source``. When given,
            numbers are checked against ``source[subsource]`` for every
            subsource rather than against ``source`` itself.

    Returns:
        A set of number strings. For an empty ``rangespec`` these are all
        keys of ``source`` (or of the selected subsources). Otherwise they
        are all numbers named by the specification, including numbers for
        which a "No such ..." warning has been printed. With an empty
        ``subsources`` collection nothing is returned.

    Invalid items are reported with ``print`` and skipped; nothing is raised.
    """
    if rangespec == '':
        if subsources is None:
            return set(source)
        available = set()
        for subsource in subsources:
            if subsource in source:
                available.update(source[subsource])
        return available

    numbers = set()
    for r in rangespec.split(','):
        (b, dash, e) = r.partition('-')
        if not dash:
            # A single number is the range from that number to itself.
            e = b
        # isdecimal() rather than isdigit(): the latter also accepts
        # characters such as superscript digits, which int() rejects.
        if not (b.isdecimal() and e.isdecimal()):
            print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))
            continue
        b = int(b)
        e = int(e)
        if b > e:
            # A reversed range selects nothing; say so instead of staying silent.
            print('Warning: Empty {} range: [{}] in [{}]'.format(kind, r, passage))
        for c in range(b, e + 1):
            crep = str(c)
            if subsources is None:
                if crep not in source:
                    print('Warning: No such {}: {} ([{}] in [{}])'.format(kind, crep, r, passage))
                numbers.add(crep)
            else:
                for subsource in subsources:
                    if subsource not in source or crep not in source[subsource]:
                        print('Warning: No such {}: {}:{} ([{}] in [{}])'.format(kind, subsource, crep, r, passage))
                    numbers.add(crep)
    return numbers
def parse_passage(passage):
    """Return the set of lexemes that occur in one passage.

    Args:
        passage: a single passage specification as accepted by
            ``passage_pat``: a book name, optionally followed by a chapter
            range list and, after an optional colon, a verse range list.

    Returns:
        The set of ``lex`` feature values of all words in the selected
        verses. The set is empty when the specification does not match
        ``passage_pat`` or names a book that is not in ``vindex``; in both
        cases an error message is printed and nothing is raised.
    """
    match = passage_pat.match(passage)
    if match is None:
        print('Error: Not a valid passage: {}'.format(passage))
        return set()
    (book, chapterspec, versespec) = match.group(1, 2, 3)
    if book not in vindex:
        print('Error: Not a valid book: {} in {}'.format(book, passage))
        return set()

    book_index = vindex[book]
    chapters = parse_ranges(chapterspec, 'chapter', passage, book_index)
    verses = parse_ranges(versespec, 'verse', passage, book_index, chapters)

    lexemes = set()
    for (ch, verse_nodes) in book_index.items():
        # The parsed sets may contain numbers that do not exist in the book,
        # so the index is walked and filtered rather than the parsed sets.
        if ch not in chapters:
            continue
        for (vs, vnode) in verse_nodes.items():
            if vs not in verses:
                continue
            for w in L.d('word', vnode):
                lexemes.add(F.lex.v(w))
    return lexemes
def lexbase(passages, excluded=None):
    """Write the lexeme base of a list of passages to a CSV file.

    The lexeme base consists of the lexemes that occur in ``passages`` and,
    when ``excluded`` is given, do not occur in ``excluded``. The lexemes
    are written to ``passage/<passages>[ minus <excluded>].csv`` (with every
    ``:`` replaced by ``_``), ordered by descending overall frequency and
    then by lexeme. The first 20 lines are echoed to standard output.

    Each line has the fields lexeme, overall frequency, lex_utf8,
    g_entry_heb, sp and gloss; all fields except the frequency are enclosed
    in double quotes.

    Args:
        passages: ``|``-separated passage specifications to include.
        excluded: optional ``|``-separated passage specifications whose
            lexemes are removed from the result.

    Returns:
        None.
    """
    lexemes = parse_passages(passages)
    if excluded is not None:
        lexemes -= parse_passages(excluded)

    if excluded is None:
        fileid = '{}'.format(passages)
    else:
        fileid = '{} minus {}'.format(passages, excluded)
    filename = 'passage/{}.csv'.format(fileid.replace(':', '_'))

    limit = 20
    nlex = len(lexemes)
    shown = min(nlex, limit)

    of = outfile(filename)
    try:
        print('==== {} ==== showing {} of {} lexemes here ===='.format(fileid, shown, nlex))
        ordered = sorted(lexemes, key=lambda x: (-lex_count[x], x))
        for (i, lx) in enumerate(ordered):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[lx]
            # The third field gets its opening quote here; without it the
            # rows are not well-formed CSV.
            line = '"{}",{},"{}","{}","{}","{}"\n'.format(
                lx, lex_count[lx], l_utf8, l_vc, l_sp, l_gl,
            )
            of.write(line)
            if i < limit:
                sys.stdout.write(line)
    finally:
        # Close the file even when writing a row fails.
        of.close()
    print('See {}\n'.format(my_file(filename)))
Here are some examples of the flexibility with which you can call the lexbase function.
# A single chapter; leaving out `excluded` is the same as passing None.
lexbase('Genesis 2')
# A chapter, without the lexemes of the preceding chapter.
lexbase('Genesis 2', excluded='Genesis 1')
# Chapter ranges and single chapters can be combined with commas.
lexbase('Genesis 3-4,10', excluded='Genesis 1-2')
# Whole books.
lexbase('Exodus', excluded='Genesis')
# Several passages separated by |, with verse ranges after the colon.
lexbase('Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4', excluded='Chronica_I|Chronica_II')
==== Genesis 2 ==== showing 20 of 131 lexemes here ==== "W",51004,ו","וְ","conj","and" "H",30386,ה","הַ","art","the" "L",20447,ל","לְ","prep","to" "B",15767,ב","בְּ","prep","in" ">T",11017,את","אֵת","prep","<object marker>" "MN",7681,מן","מִן","prep","from" "JHWH/",6828,יהוה/","יהוה","nmpr","YHWH" "<L",5869,על","עַל","prep","upon" ">L",5521,אל","אֶל","prep","to" ">CR",5500,אשׁר","אֲשֶׁר","conj","<relative>" "KL/",5495,כל/","כֹּל","subs","whole" ">MR[",5378,אמר[","אָמַר","verb","say" "L>",5249,לא","לֹא","nega","not" "KJ",4483,כי","כִּי","conj","that" "HJH[",3561,היה[","הָיָה","verb","be" "K",2965,כ","כְּ","prep","as" "<FH[",2629,עשׂה[","עָשָׂה","verb","make" ">LHJM/",2601,אלהים/","אֱלֹהִים","subs","god(s)" "BW>[",2570,בוא[","בֹּוא","verb","come" ">RY/",2504,ארץ/","אֶרֶץ","subs","earth" See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2.csv ==== Genesis 2 minus Genesis 1 ==== showing 20 of 88 lexemes here ==== "JHWH/",6828,יהוה/","יהוה","nmpr","YHWH" "L>",5249,לא","לֹא","nega","not" "BW>[",2570,בוא[","בֹּוא","verb","come" ">JC/",2186,אישׁ/","אִישׁ","subs","man" "HLK[",1554,הלך[","הָלַךְ","verb","walk" "HW>",1409,הוא","הוּא","prps","he" ">B/",1226,אב/","אָב","subs","father" "LQX[",965,לקח[","לָקַח","verb","take" "<LH[",890,עלה[","עָלָה","verb","ascend" "CM/",876,שׁם/","שֵׁם","subs","name" "MWT[",835,מות[","מוּת","verb","die" "CM",834,שׁם","שָׁם","advb","there" ">KL[",817,אכל[","אָכַל","verb","eat" ">JN/",788,אין/","אַיִן","subs","<NEG>" ">CH/",781,אשׁה/","אִשָּׁה","subs","woman" ">LH",747,אלה","אֵלֶּה","prde","these" "R>C/",613,ראשׁ/","רֹאשׁ","subs","head" "FJM[",609,שׂים[","שִׂים","verb","put" "Z>T",604,זאת","זֹאת","prde","this" "MH",587,מה","מָה","prin","what" See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2 minus Genesis 1.csv ==== Genesis 3-4,10 minus Genesis 1-2 ==== showing 20 of 248 lexemes here ==== "BN/",4937,בן/","בֵּן","subs","son" "JD/",1635,יד/","יָד","subs","hand" "<D",1298,עד","עַד","prep","unto" 
"CM<[",1168,שׁמע[","שָׁמַע","verb","hear" "<JR/",1093,עיר/","עִיר","subs","town" "JCB[",1081,ישׁב[","יָשַׁב","verb","sit" "<M",1071,עם","עִם","prep","with" ">M",1068,אם","אִם","conj","if" "CWB[",1037,שׁוב[","שׁוּב","verb","return" "JD<[",991,ידע[","יָדַע","verb","know" "<JN/",892,עין/","עַיִן","subs","eye" "CLX[",861,שׁלח[","שָׁלַח","verb","send" ">T==",848,את==","אֵת","prep","together with" "GM",769,גם","גַּם","advb","even" ">TH",747,אתה","אַתָּה","prps","you" "H=",743,ה=","הֲ","inrg","<interrogative>" ">XR/",718,אחר/","אַחַר","subs","after" "DRK/",706,דרך/","דֶּרֶךְ","subs","way" "MYRJM/",681,מצרים/","מִצְרַיִם","nmpr","Egypt" "QWM[",664,קום[","קוּם","verb","arise" See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 3-4,10 minus Genesis 1-2.csv ==== Exodus minus Genesis ==== showing 20 of 631 lexemes here ==== "MCH=/",766,משׁה=/","מֹשֶׁה","nmpr","Moses" "QDC/",469,קדשׁ/","קֹדֶשׁ","subs","holiness" ">HRWN/",347,אהרון/","אַהֲרֹון","nmpr","Aaron" "LWJ/",296,לוי/","לֵוִי","adjv","Levite" "KTB[",231,כתב[","כָּתַב","verb","write" "JHWCW</",218,יהושׁוע/","יְהֹושׁוּעַ","nmpr","Joshua" "JC<[",205,ישׁע[","יָשַׁע","verb","help" ">BD[",193,אבד[","אָבַד","verb","perish" "LXM[",171,לחם[","לָחַם","verb","fight" "XKMH/",157,חכמה/","חָכְמָה","subs","wisdom" "FMX[",154,שׂמח[","שָׂמַח","verb","rejoice" "<DH/",149,עדה/","עֵדָה","subs","gathering" "MCKN/",140,משׁכן/","מִשְׁכָּן","subs","dwelling-place" "XWMH/",133,חומה/","חֹומָה","subs","wall" "XYJ/",124,חצי/","חֲצִי","subs","half" "JCR/",119,ישׁר/","יָשָׁר","adjv","right" "QDWC/",116,קדושׁ/","קָדֹושׁ","adjv","holy" "QVR[",115,קטר[","קָטַר","verb","smoke" "CQR/",113,שׁקר/","שֶׁקֶר","subs","lie" "<MWD/",111,עמוד/","עַמּוּד","subs","pillar" See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Exodus minus Genesis.csv ==== Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4 minus Chronica_I|Chronica_II ==== showing 20 of 52 lexemes here ==== "G>L[",103,גאל[","גָּאַל","verb","redeem" 
"CJT[",85,שׁית[","שִׁית","verb","put" "MKR[",80,מכר[","מָכַר","verb","sell" "ZR/",70,זר/","זָר","adjv","strange" "<D=/",69,עד=/","עֵד","subs","witness" "N<RH/",63,נערה/","נַעֲרָה","subs","girl" "RXL=/",47,רחל=/","רָחֵל","nmpr","Rachel" "XJQ/",38,חיק/","חֵיק","subs","lap" "SJNJ=/",35,סיני=/","סִינַי","nmpr","Sinai" "L>H/",34,לאה/","לֵאָה","nmpr","Leah" "RXM/",31,רחם/","רֶחֶם","subs","womb" "TJMN/",23,תימן/","תֵּימָן","subs","south" "N<L/",22,נעל/","נַעַל","subs","sandal" "N<MJ=/",21,נעמי=/","נָעֳמִי","nmpr","Naomi" "CKN/",20,שׁכן/","שָׁכֵן","subs","inhabitant" "XQQ[",19,חקק[","חָקַק","verb","engrave" "DGL/",14,דגל/","דֶּגֶל","subs","banner" "G>LH/",14,גאלה/","גְּאֻלָּה","subs","right of buying back" "KHNH/",14,כהנה/","כְּהֻנָּה","subs","priesthood" "BRQ=/",13,ברק=/","בָּרָק","nmpr","Barak" See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Numeri 1-3_10-15|Judices 5_1,3,5,7,9|Ruth 4 minus Chronica_I|Chronica_II.csv
Here we produce several lexeme files for books and chapters.
There are normal and incremental output files. In a normal output file, you find all lexemes for the indicated chapters and verses. In an incremental file, you find per indicated passage the lexemes that are new with respect to the previous passages (either the previous verses in the chapter, or the previous chapters in the book).
You can download the files as they have been generated by my LAF-Fabric installation via my SURFdrive: version 4 version 4b
# Write all lexemes with their overall frequency and lexical information,
# most frequent first; lexemes with equal frequency keep the order in which
# they were first counted.
outf = outfile("csv/all_lexemes.csv")
for (l, f) in lex_count.most_common():
    outf.write('"{}",{},"{}","{}","{}","{}"\n'.format(l, f, *lex_info[l]))
outf.close()
# Per book, write five files:
#   <book>.csv             all lexemes of the book
#   <book>_per_ch.csv      all lexemes per chapter
#   <book>_per_ch_inc.csv  per chapter, the lexemes not seen in earlier chapters
#   <book>_per_vs.csv      all lexemes per verse
#   <book>_per_vs_inc.csv  per verse, the lexemes not seen in earlier verses
#                          of the same chapter
for bk in sorted(lex_section):
    outfb = outfile("csv/{}.csv".format(bk))
    outfc = outfile("csv/{}_per_ch.csv".format(bk))
    outfci = outfile("csv/{}_per_ch_inc.csv".format(bk))
    outfv = outfile("csv/{}_per_vs.csv".format(bk))
    outfvi = outfile("csv/{}_per_vs_inc.csv".format(bk))
    try:
        bk_lex = set()
        # Chapter and verse labels are strings; sort them numerically.
        for ch in sorted(lex_section[bk], key=int):
            ch_lex = set()
            for vs in sorted(lex_section[bk][ch], key=int):
                for l in sorted(lex_section[bk][ch][vs]):
                    (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
                    f = lex_count[l]
                    line = '"{}",{},{},"{}",{},"{}","{}","{}","{}"\n'.format(
                        bk, ch, vs, l, f, l_utf8, l_vc, l_sp, l_gl,
                    )
                    outfv.write(line)
                    if l not in ch_lex:
                        ch_lex.add(l)
                        outfvi.write(line)
                    # bk_lex is deliberately NOT updated here: doing so
                    # would mark every lexeme as already seen before the
                    # chapter loop below runs, leaving the incremental
                    # per-chapter file empty.
            for l in sorted(ch_lex):
                (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
                f = lex_count[l]
                line = '"{}",{},"{}",{},"{}","{}","{}","{}"\n'.format(
                    bk, ch, l, f, l_utf8, l_vc, l_sp, l_gl,
                )
                outfc.write(line)
                if l not in bk_lex:
                    # First chapter of the book in which this lexeme occurs.
                    bk_lex.add(l)
                    outfci.write(line)
        for l in sorted(bk_lex):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
            f = lex_count[l]
            line = '"{}","{}",{},"{}","{}","{}","{}"\n'.format(
                bk, l, f, l_utf8, l_vc, l_sp, l_gl,
            )
            outfb.write(line)
    finally:
        # Close all five files of this book, also when writing fails.
        outfb.close()
        outfc.close()
        outfci.close()
        outfv.close()
        outfvi.close()