Notebook

Lexemes¶

Various ways to list the lexeme base of individual chapters in the Hebrew Bible.

The lexeme base of a passage is the set of lexemes that occurs in that passage.

We define a function, lexbase(passages, excluded=xpassages), that produces a file of the lexemes that occur in a given list of passages and do not occur in another given list of passages.

If you have LAF-Fabric working and downloaded this notebook, you can call this function yourself in order to generate lexeme bases of arbitrary passages.

We also produce standard files with the lexeme bases of individual books, chapters and verses in the Bible.

Output¶

The output files are organized as follows:

all files are comma separated text files that can be imported in a spreadsheet application such as OpenOffice or Excel;
every line corresponds to a lexeme in the lexeme base and contains the following information:
- lexeme (unique identifier in transcription, containing / [ = characters),
- frequency (number of occurrences of this lexeme in the whole Hebrew Bible),
- lex_utf8 feature (the lexeme in Hebrew as it occurs in the ETCBC text database),
- g_entry_heb feature (the vocalized lexeme as it is listed in the ETCBC lexicon),
- sp feature (part of speech),
- gloss feature.

In [1]:

import os
import sys, collections, re

from laf.fabric import LafFabric
from etcbc.preprocess import prepare
# Create the LAF-Fabric processor object; the data loading in the next cell goes through it.
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.5.0
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html

In [2]:

# Data version to work with; it is interpolated into the source name 'etcbc{version}'.
version = '4b'
# Load the source together with the 'lexicon' annotation package and register
# this notebook as task 'lexemes' (the log output of this cell reports these names).
# The "features" entry lists the features this notebook reads through F below.
# NOTE(review): the second, empty string of the "features" tuple presumably lists
# edge features (none requested) -- confirm against the LAF-Fabric API reference.
fabric.load('etcbc{}'.format(version), 'lexicon', 'lexemes', {
    "xmlids": {"node": False, "edge": False},
    "features": ('''
        otype
        lex lex_utf8 g_entry_heb
        sp gloss
        book chapter verse
    ''',''),
    "prepare": prepare,
    "primary": False,
})
# Execute the code template held in fabric.localnames.
# NOTE(review): this is presumably what defines the names used later without any
# import or assignment in this notebook (F, L, my_file, outfile) -- confirm.
exec(fabric.localnames.format(var='fabric'))

  0.00s LOADING API: please wait ... 
  0.65s INFO: USING DATA COMPILED AT: 2015-05-04T13-46-20
  0.65s INFO: USING DATA COMPILED AT: 2015-05-04T14-07-34
  3.67s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/__log__lexemes.txt
    14s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK lexemes AT 2015-05-27T15-43-40

In [3]:

# Output directories for the standard files ('csv') and for the files written
# by lexbase() ('passage'), both resolved through my_file().
csvdir = my_file('csv')
passagedir = my_file('passage')
# Create them with the standard library instead of the `%mkdir -p {path}` shell
# alias: the alias interpolates the path unquoted (breaks on spaces) and is not
# available on every platform. exist_ok=True keeps the cell re-runnable.
os.makedirs(csvdir, exist_ok=True)
os.makedirs(passagedir, exist_ok=True)

# Passage syntax

    passages      = | separated list of passage
    passage       = bookname (chapterranges | (chapterranges : verseranges))
    chapterranges = empty | (, separated list of numberrange)
    verseranges   = empty | (, separated list of numberrange)
    numberrange   = number | (number - number)

Example: `Numeri 1-3:10-15|Ruth 4` selects verses 10-15 in each of the chapters 1-3 of Numeri, plus the whole of Ruth 4.

In [9]:

# Pattern for a single passage: group 1 is the book name, group 2 the (possibly
# empty) chapter range list, group 3 the (possibly empty) verse range list that
# may follow an optional ':'.
# Written as a raw string so that \s reaches the regex engine unchanged instead
# of being an invalid string escape (which newer Python versions warn about).
passage_pat = re.compile(r'^\s*([A-Za-z0-9_]+)\s*([0-9,-]*)\s*:?\s*([0-9,-]*)\s*$')

# A single pass over all verse nodes fills three module-level indexes:
#   lex_info    : lexeme -> (lex_utf8, g_entry_heb, sp, gloss), taken from the
#                 first word encountered with that lexeme
#   lex_section : book -> chapter -> verse -> Counter of the lexemes in that verse
#   lex_count   : lexeme -> number of word occurrences over all verses visited
lex_info = {}
lex_section = {}
lex_count = collections.Counter()
for verse_node in F.otype.s('verse'):
    book_name = F.book.v(L.u('book', verse_node))
    chapter_label = F.chapter.v(L.u('chapter', verse_node))
    verse_label = F.verse.v(verse_node)
    for word_node in L.d('word', verse_node):
        lexeme = F.lex.v(word_node)
        if lexeme not in lex_info:
            lex_info[lexeme] = (
                F.lex_utf8.v(word_node),
                F.g_entry_heb.v(word_node),
                F.sp.v(word_node),
                F.gloss.v(word_node),
            )
        # Create the nested book/chapter/verse entries only when a word is seen.
        book_entry = lex_section.setdefault(book_name, {})
        chapter_entry = book_entry.setdefault(chapter_label, {})
        if verse_label not in chapter_entry:
            chapter_entry[verse_label] = collections.Counter()
        chapter_entry[verse_label][lexeme] += 1
        lex_count[lexeme] += 1

def verse_index():
    """Return a nested lookup book -> chapter -> verse -> verse node.

    Iterates over the nodes yielded by ``F.verse.s()``; the book and chapter
    labels are read from the nodes that ``L.u`` returns for 'book' and
    'chapter', the verse label from the node itself.
    """
    index = {}
    for node in F.verse.s():
        book_label = F.book.v(L.u('book', node))
        chapter_label = F.chapter.v(L.u('chapter', node))
        verse_label = F.verse.v(node)
        chapters_of_book = index.setdefault(book_label, {})
        verses_of_chapter = chapters_of_book.setdefault(chapter_label, {})
        verses_of_chapter[verse_label] = node
    return index

# Build the verse lookup once, when this cell runs; parse_passage reads this global.
vindex = verse_index()

def parse_passages(passages):
    """Return the union of the lexeme sets of all passages in a specification.

    ``passages`` is a string of passage specifications separated by ``|``.
    Surrounding whitespace is removed from the whole string and from each
    part before the part is handed to ``parse_passage``.
    """
    parts = [part.strip() for part in passages.strip().split('|')]
    return set().union(*(parse_passage(part) for part in parts))

def parse_ranges(rangespec, kind, passage, source, subsources=None):
    """Expand a comma separated list of numbers and number ranges into a set of labels.

    Parameters:
        rangespec:  string such as ``'3-4,10'``; may be empty.
        kind:       word used in messages (the callers pass 'chapter' or 'verse').
        passage:    the full passage specification, used in messages only.
        source:     dict whose keys are the valid labels when ``subsources`` is
                    None; otherwise a dict of dicts, where ``source[s]`` holds
                    the valid labels below the entry ``s``.
        subsources: None, or an iterable of keys of ``source`` to look below.

    Returns:
        A set of labels as strings.
        - Empty ``rangespec``: all keys of ``source``, or (with ``subsources``)
          all keys found below those subsources that exist in ``source``.
        - Otherwise: every number in the given ranges, also those that do not
          exist in ``source``; for those a warning is printed.

    Malformed ranges (non-numeric parts, or a start greater than the end) are
    reported with an error message and contribute nothing to the result.
    """
    if rangespec == '':
        if subsources is None:
            return set(source.keys())
        numbers = set()
        for subsource in subsources:
            if subsource in source:
                numbers |= set(source[subsource].keys())
        return numbers

    numbers = set()
    for r in rangespec.split(','):
        comps = r.split('-', 1)
        # A single number is treated as the range number-number.
        b = comps[0]
        e = comps[-1]
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, on which int() raises ValueError.
        if not (b.isdecimal() and e.isdecimal()):
            print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))
            continue
        b = int(b)
        e = int(e)
        if b > e:
            # A reversed range would otherwise be silently empty.
            print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))
            continue
        for c in range(b, e + 1):
            crep = str(c)
            if subsources is None:
                if crep not in source:
                    print('Warning: No such {}: {} ([{}] in [{}])'.format(kind, crep, r, passage))
            else:
                for subsource in subsources:
                    if subsource not in source or crep not in source[subsource]:
                        print('Warning: No such {}: {}:{} ([{}] in [{}])'.format(kind, subsource, crep, r, passage))
            # The number is kept even when a warning was issued.
            numbers.add(crep)
    return numbers
    
def parse_passage(passage):
    """Return the set of lexemes occurring in a single passage.

    ``passage`` is matched against ``passage_pat`` to obtain a book name, a
    chapter specification and a verse specification. Both specifications are
    expanded with ``parse_ranges``; the resulting verse labels are applied to
    every selected chapter. An unparsable passage or an unknown book is
    reported with an error message and yields an empty set.
    """
    match = passage_pat.match(passage)
    if match is None:
        print('Error: Not a valid passage: {}'.format(passage))
        return set()

    book, chapterspec, versespec = match.groups()
    if book not in vindex:
        print('Error: Not a valid book: {} in {}'.format(book, passage))
        return set()

    book_index = vindex[book]
    chapters = parse_ranges(chapterspec, 'chapter', passage, book_index)
    verses = parse_ranges(versespec, 'verse', passage, book_index, chapters)

    # Verse nodes whose chapter and verse labels were both selected.
    vnodes = {
        vnode
        for (ch, chapter_index) in book_index.items() if ch in chapters
        for (vs, vnode) in chapter_index.items() if vs in verses
    }
    return {F.lex.v(w) for v in vnodes for w in L.d('word', v)}
        
def lexbase(passages, excluded=None, limit=20):
    """Write the lexeme base of ``passages`` to a CSV file and preview it.

    Parameters:
        passages: passage specification (parts separated by ``|``) whose
                  lexemes are collected with ``parse_passages``.
        excluded: optional passage specification; lexemes occurring in it are
                  removed from the result.
        limit:    maximum number of lines echoed to standard output
                  (the file always receives all lines).

    The file is written through ``outfile`` as ``passage/<spec>.csv``, where
    ``<spec>`` is the passage specification (plus ``' minus <excluded>'``)
    with ``:`` replaced by ``_``. Each line holds the lexeme, its count in
    ``lex_count`` and the four values stored for it in ``lex_info``; lines
    are ordered by descending count, then by lexeme.
    """
    lexemes = parse_passages(passages)
    if excluded is not None:
        lexemes -= parse_passages(excluded)
    fileid = '{}{}'.format(
        passages,
        '' if excluded is None else ' minus {}'.format(excluded)
    )
    filename = 'passage/{}.csv'.format(fileid.replace(':', '_'))
    of = outfile(filename)
    try:
        nlex = len(lexemes)
        shown = min(nlex, limit)
        print('==== {} ==== showing {} of {} lexemes here ===='.format(fileid, shown, nlex))
        ordered = sorted(lexemes, key=lambda x: (-lex_count[x], x))
        for (i, lx) in enumerate(ordered):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[lx]
            # Every text field is enclosed in double quotes; the count is bare.
            # (The third field used to miss its opening quote.)
            line = '"{}",{},"{}","{}","{}","{}"\n'.format(lx, lex_count[lx], l_utf8, l_vc, l_sp, l_gl)
            of.write(line)
            if i < limit:
                sys.stdout.write(line)
    finally:
        # Close the file even when a lexeme lookup or a write fails.
        of.close()
    print('See {}\n'.format(my_file(filename)))

Examples¶

Here are some examples of the flexibility with which you can call the lexbase function.

In [11]:

# All lexemes of Genesis 2, nothing excluded.
lexbase('Genesis 2', excluded=None)
# Lexemes of Genesis 2 that do not occur in Genesis 1.
lexbase('Genesis 2', excluded='Genesis 1')
# Chapter ranges and single chapters can be combined with commas.
lexbase('Genesis 3-4,10', excluded='Genesis 1-2')
# A bare book name selects all its chapters and verses.
lexbase('Exodus', excluded='Genesis')
# Several passages are joined with '|'; verse ranges after ':' apply to every selected chapter.
lexbase('Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4', excluded='Chronica_I|Chronica_II')

==== Genesis 2 ==== showing 20 of 131 lexemes here ====
"W",51004,ו","וְ","conj","and"
"H",30386,ה","הַ","art","the"
"L",20447,ל","לְ","prep","to"
"B",15767,ב","בְּ","prep","in"
">T",11017,את","אֵת","prep","<object marker>"
"MN",7681,מן","מִן","prep","from"
"JHWH/",6828,יהוה/","יהוה","nmpr","YHWH"
"<L",5869,על","עַל","prep","upon"
">L",5521,אל","אֶל","prep","to"
">CR",5500,אשׁר","אֲשֶׁר","conj","<relative>"
"KL/",5495,כל/","כֹּל","subs","whole"
">MR[",5378,אמר[","אָמַר","verb","say"
"L>",5249,לא","לֹא","nega","not"
"KJ",4483,כי","כִּי","conj","that"
"HJH[",3561,היה[","הָיָה","verb","be"
"K",2965,כ","כְּ","prep","as"
"<FH[",2629,עשׂה[","עָשָׂה","verb","make"
">LHJM/",2601,אלהים/","אֱלֹהִים","subs","god(s)"
"BW>[",2570,בוא[","בֹּוא","verb","come"
">RY/",2504,ארץ/","אֶרֶץ","subs","earth"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2.csv

==== Genesis 2 minus Genesis 1 ==== showing 20 of 88 lexemes here ====
"JHWH/",6828,יהוה/","יהוה","nmpr","YHWH"
"L>",5249,לא","לֹא","nega","not"
"BW>[",2570,בוא[","בֹּוא","verb","come"
">JC/",2186,אישׁ/","אִישׁ","subs","man"
"HLK[",1554,הלך[","הָלַךְ","verb","walk"
"HW>",1409,הוא","הוּא","prps","he"
">B/",1226,אב/","אָב","subs","father"
"LQX[",965,לקח[","לָקַח","verb","take"
"<LH[",890,עלה[","עָלָה","verb","ascend"
"CM/",876,שׁם/","שֵׁם","subs","name"
"MWT[",835,מות[","מוּת","verb","die"
"CM",834,שׁם","שָׁם","advb","there"
">KL[",817,אכל[","אָכַל","verb","eat"
">JN/",788,אין/","אַיִן","subs","<NEG>"
">CH/",781,אשׁה/","אִשָּׁה","subs","woman"
">LH",747,אלה","אֵלֶּה","prde","these"
"R>C/",613,ראשׁ/","רֹאשׁ","subs","head"
"FJM[",609,שׂים[","שִׂים","verb","put"
"Z>T",604,זאת","זֹאת","prde","this"
"MH",587,מה","מָה","prin","what"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2 minus Genesis 1.csv

==== Genesis 3-4,10 minus Genesis 1-2 ==== showing 20 of 248 lexemes here ====
"BN/",4937,בן/","בֵּן","subs","son"
"JD/",1635,יד/","יָד","subs","hand"
"<D",1298,עד","עַד","prep","unto"
"CM<[",1168,שׁמע[","שָׁמַע","verb","hear"
"<JR/",1093,עיר/","עִיר","subs","town"
"JCB[",1081,ישׁב[","יָשַׁב","verb","sit"
"<M",1071,עם","עִם","prep","with"
">M",1068,אם","אִם","conj","if"
"CWB[",1037,שׁוב[","שׁוּב","verb","return"
"JD<[",991,ידע[","יָדַע","verb","know"
"<JN/",892,עין/","עַיִן","subs","eye"
"CLX[",861,שׁלח[","שָׁלַח","verb","send"
">T==",848,את==","אֵת","prep","together with"
"GM",769,גם","גַּם","advb","even"
">TH",747,אתה","אַתָּה","prps","you"
"H=",743,ה=","הֲ","inrg","<interrogative>"
">XR/",718,אחר/","אַחַר","subs","after"
"DRK/",706,דרך/","דֶּרֶךְ","subs","way"
"MYRJM/",681,מצרים/","מִצְרַיִם","nmpr","Egypt"
"QWM[",664,קום[","קוּם","verb","arise"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 3-4,10 minus Genesis 1-2.csv

==== Exodus minus Genesis ==== showing 20 of 631 lexemes here ====
"MCH=/",766,משׁה=/","מֹשֶׁה","nmpr","Moses"
"QDC/",469,קדשׁ/","קֹדֶשׁ","subs","holiness"
">HRWN/",347,אהרון/","אַהֲרֹון","nmpr","Aaron"
"LWJ/",296,לוי/","לֵוִי","adjv","Levite"
"KTB[",231,כתב[","כָּתַב","verb","write"
"JHWCW</",218,יהושׁוע/","יְהֹושׁוּעַ","nmpr","Joshua"
"JC<[",205,ישׁע[","יָשַׁע","verb","help"
">BD[",193,אבד[","אָבַד","verb","perish"
"LXM[",171,לחם[","לָחַם","verb","fight"
"XKMH/",157,חכמה/","חָכְמָה","subs","wisdom"
"FMX[",154,שׂמח[","שָׂמַח","verb","rejoice"
"<DH/",149,עדה/","עֵדָה","subs","gathering"
"MCKN/",140,משׁכן/","מִשְׁכָּן","subs","dwelling-place"
"XWMH/",133,חומה/","חֹומָה","subs","wall"
"XYJ/",124,חצי/","חֲצִי","subs","half"
"JCR/",119,ישׁר/","יָשָׁר","adjv","right"
"QDWC/",116,קדושׁ/","קָדֹושׁ","adjv","holy"
"QVR[",115,קטר[","קָטַר","verb","smoke"
"CQR/",113,שׁקר/","שֶׁקֶר","subs","lie"
"<MWD/",111,עמוד/","עַמּוּד","subs","pillar"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Exodus minus Genesis.csv

==== Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4 minus Chronica_I|Chronica_II ==== showing 20 of 52 lexemes here ====
"G>L[",103,גאל[","גָּאַל","verb","redeem"
"CJT[",85,שׁית[","שִׁית","verb","put"
"MKR[",80,מכר[","מָכַר","verb","sell"
"ZR/",70,זר/","זָר","adjv","strange"
"<D=/",69,עד=/","עֵד","subs","witness"
"N<RH/",63,נערה/","נַעֲרָה","subs","girl"
"RXL=/",47,רחל=/","רָחֵל","nmpr","Rachel"
"XJQ/",38,חיק/","חֵיק","subs","lap"
"SJNJ=/",35,סיני=/","סִינַי","nmpr","Sinai"
"L>H/",34,לאה/","לֵאָה","nmpr","Leah"
"RXM/",31,רחם/","רֶחֶם","subs","womb"
"TJMN/",23,תימן/","תֵּימָן","subs","south"
"N<L/",22,נעל/","נַעַל","subs","sandal"
"N<MJ=/",21,נעמי=/","נָעֳמִי","nmpr","Naomi"
"CKN/",20,שׁכן/","שָׁכֵן","subs","inhabitant"
"XQQ[",19,חקק[","חָקַק","verb","engrave"
"DGL/",14,דגל/","דֶּגֶל","subs","banner"
"G>LH/",14,גאלה/","גְּאֻלָּה","subs","right of buying back"
"KHNH/",14,כהנה/","כְּהֻנָּה","subs","priesthood"
"BRQ=/",13,ברק=/","בָּרָק","nmpr","Barak"
See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Numeri 1-3_10-15|Judices 5_1,3,5,7,9|Ruth 4 minus Chronica_I|Chronica_II.csv

Standard lexeme files¶

Here we produce several lexeme files for books and chapters.

Output kind¶

There are normal and incremental output files. In a normal output file, you find all lexemes for the indicated chapters and verses. In an incremental file, you find per indicated passage the lexemes that are new with respect to the previous passages (either the previous verses in the chapter, or the previous chapters in the book).

Output files¶

all_lexemes.csv contains a listing of all lexemes, ordered by frequency
book.csv contains a listing of all lexemes in that book
book_per_ch.csv contains a listing of all lexemes in that book, organized by chapter
book_per_ch_inc.csv contains a listing of all lexemes in that book, organized by chapter, where each chapter lists only the lexemes that did not occur in previous chapters of that book
book_per_vs.csv contains a listing of all lexemes in that book, organized by chapter and then by verse
book_per_vs_inc.csv contains a listing of all lexemes in that book, organized by chapter and then by verse, where each verse lists only the lexemes that did not occur in previous verses of that same chapter

Output location¶

You can download the files as they have been generated by my LAF-Fabric installation via my SURFdrive: version 4 version 4b

In [5]:

# Write every lexeme with its overall frequency and its lexical information,
# most frequent first. Lexemes with equal frequency are ordered by their
# identifier, so that the file does not depend on dictionary iteration order
# and uses the same ordering as lexbase().
outf = outfile("csv/all_lexemes.csv")
try:
    for (l, f) in sorted(lex_count.items(), key=lambda x: (-x[1], x[0])):
        (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
        outf.write('"{}",{},"{}","{}","{}","{}"\n'.format(
            l, f, l_utf8, l_vc, l_sp, l_gl,
        ))
finally:
    # Close the file even when a lookup or a write fails.
    outf.close()

In [6]:

# For every book write five files:
#   <book>.csv             all lexemes of the book
#   <book>_per_ch.csv      lexemes per chapter
#   <book>_per_ch_inc.csv  per chapter, only lexemes not seen in earlier chapters
#   <book>_per_vs.csv      lexemes per verse
#   <book>_per_vs_inc.csv  per verse, only lexemes not seen in earlier verses of the chapter
for bk in sorted(lex_section):
    outfb = outfile("csv/{}.csv".format(bk))
    outfc = outfile("csv/{}_per_ch.csv".format(bk))
    outfci = outfile("csv/{}_per_ch_inc.csv".format(bk))
    outfv = outfile("csv/{}_per_vs.csv".format(bk))
    outfvi = outfile("csv/{}_per_vs_inc.csv".format(bk))
    try:
        bk_lex = set()  # lexemes seen so far in this book
        for ch in sorted(lex_section[bk], key=lambda x: int(x)):
            ch_lex = set()  # lexemes seen so far in this chapter
            for vs in sorted(lex_section[bk][ch], key=lambda x: int(x)):
                for l in sorted(lex_section[bk][ch][vs]):
                    (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
                    f = lex_count[l]
                    line = '"{}",{},{},"{}",{},"{}","{}","{}","{}"\n'.format(
                        bk, ch, vs, l, f, l_utf8, l_vc, l_sp, l_gl,
                    )
                    outfv.write(line)
                    if l not in ch_lex:
                        ch_lex.add(l)
                        outfvi.write(line)
            for l in sorted(ch_lex):
                (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
                f = lex_count[l]
                line = '"{}",{},"{}",{},"{}","{}","{}","{}"\n'.format(
                    bk, ch, l, f, l_utf8, l_vc, l_sp, l_gl,
                )
                outfc.write(line)
                # bk_lex must only be extended here, after the chapter is complete.
                # Adding lexemes already in the verse loop made this test always
                # fail, which left the incremental per-chapter file empty.
                if l not in bk_lex:
                    bk_lex.add(l)
                    outfci.write(line)
        for l in sorted(bk_lex):
            (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]
            f = lex_count[l]
            line = '"{}","{}",{},"{}","{}","{}","{}"\n'.format(
                bk, l, f, l_utf8, l_vc, l_sp, l_gl,
            )
            outfb.write(line)
    finally:
        # Close all five files of this book, also when writing failed halfway.
        outfb.close()
        outfc.close()
        outfci.close()
        outfv.close()
        outfvi.close()

In [ ]: