The source data is a file with manual annotations by the LingVar group on a file with participle occurrences. This file derives from a set of qdf-files, filtered to contain only participle occurrences with context information, and then enriched by hand with a complex custom annotation.
This notebook unravels the hand-made annotations, distributes the relevant parts over several more columns, and finally transforms most columns into sets of level columns.
This levelling of a column means that if a column has a limited set of values, say `aap`, `noot`, `mies`, we replace it by three columns, `has_aap`, `has_noot`, `has_mies`, having only boolean values.
The code is organized in a set of successive stages, each having a limited task.
import sys, os
import collections
import re, csv
We replace the book information, originally given as label, by full information: book number, book name, book acronym.
# Lookup table from the book label used in the source data to a pair
# (full book name, three-character book acronym).
books = dict(
    AMOS=('amos', 'amo'),
    CAN=('song of songs', 'sol'),
    DAN=('daniel', 'dan'),
    DEUT=('deuteronomy', 'deu'),
    ESR=('ezra', 'ezr'),
    EST=('esther', 'est'),
    EXO=('exodus', 'exo'),
    EZE=('ezekiel', 'eze'),
    GEN=('genesis', 'gen'),
    HAB=('habakkuk', 'hab'),
    HAG=('haggai', 'hag'),
    HOS=('hosea', 'hos'),
    ICHR=('1 chronicles', '1ch'),
    IICHR=('2 chronicles', '2ch'),
    IIKON=('2 kings', '2ki'),
    IISA=('2 samuel', '2sa'),
    IKON=('1 kings', '1ki'),
    IOB=('job', 'job'),
    ISAM=('1 samuel', '1sa'),
    JER=('jeremiah', 'jer'),
    JES=('isaiah', 'isa'),
    JOE=('joel', 'joe'),
    # NOTE(review): 'jona' is possibly meant to be 'jonah' -- confirm before changing,
    # since the value ends up in the output data.
    JONA=('jona', 'jon'),
    JOZ=('joshua', 'jos'),
    LEV=('leviticus', 'lev'),
    MAL=('malachi', 'mal'),
    MICH=('micah', 'mic'),
    NAH=('nahum', 'nah'),
    NEH=('nehemiah', 'neh'),
    NUM=('numbers', 'num'),
    OBAD=('obadiah', 'oba'),
    PRO=('proverbs', 'pro'),
    PS=('psalms', 'psa'),
    QOH=('qoheleth', 'qoh'),
    RICHT=('judges', 'jud'),
    RUTH=('ruth', 'rut'),
    THR=('lamentations', 'lam'),
    ZACH=('zechariah', 'zec'),
    ZEP=('zephaniah', 'zep'),
)
# Sanity check: report how many books the table knows about.
print("{} books".format(len(books)))
39 books
Here is the code to go from stage to stage. Nothing fancy, just maintaining a bunch of global variables. Every stage has an input and output file handle, a place to drop error messages, and a set of current column headers, with index, so that we can refer to columns by name instead of by number.
Successive stages transform the data, and may or may not add columns.
# Directory (below the user's home directory) holding the per-stage CSV files.
base_dir = os.path.expanduser('~') + '/Dropbox/laf-fabric-output/etcbc4b/participle'
# File name pattern; the placeholder receives the stage number (or '_final').
filepat = 'participia_compleet_r{}.csv'
# Column headers per finished stage; stage 0 (the raw input) has 17 generic
# headers C01 .. C17.
start_column_names = {0: tuple('C{:>02d}'.format(c) for c in range(1, 18))}
# Headers of the stage in progress and their name -> position index;
# both are filled in when a stage starts.
column_names = None
column_index = None
def infile(f):
    """Open file *f* inside ``base_dir`` for reading and return the handle.

    The handle is fed to ``csv.reader``, so it is opened with ``newline=''``
    as the csv module requires; otherwise line endings embedded in quoted
    fields are translated before the reader sees them.
    The caller is responsible for closing the handle.
    """
    return open('{}/{}'.format(base_dir, f), newline='')
def outfile(f):
    """Open file *f* inside ``base_dir`` for writing and return the handle.

    The handle is fed to ``csv.writer``, so it is opened with ``newline=''``
    as the csv module requires; otherwise platforms that translate ``\\n``
    on write turn the writer's ``\\r\\n`` line terminator into ``\\r\\r\\n``.
    An existing file is truncated. The caller is responsible for closing
    the handle.
    """
    return open('{}/{}'.format(base_dir, f), mode='w', newline='')
def msg(m):
    """Write the string *m*, followed by a newline, to stderr and flush at once."""
    stream = sys.stderr
    stream.write(m + '\n')
    stream.flush()
# Passage label: two-digit book number, book label, chapter, verse and a
# trailing sequence number, e.g. "01 GEN 01,02.07".
passage_pat = re.compile(r'([0-9]{2})\s*([A-Z_]+)\s*([0-9]+),([0-9]+)\.([0-9]+)')
# Collected problems: stage number -> category -> list of (row, value) pairs.
errors = collections.defaultdict(lambda: collections.defaultdict(list))
# State of the stage in progress; everything is (re)initialised by stage_start.
sourcef = targetf = None
data = new_data = None
nrows = levels = None
# Number of the current stage; 0 means that no stage has run yet.
the_stage = 0
def error(cat, r, f):
    """Record a problem of category *cat* for the current stage.

    The pair (*r*, *f*) is appended to the list kept in ``errors`` under the
    current stage number and the category; callers pass a row number and the
    offending field value.
    """
    stage_errors = errors[the_stage]
    stage_errors[cat].append((r, f))
def make_passage(m):
    """Return the first five groups of match object *m* as a tuple."""
    return m.group(1, 2, 3, 4, 5)
def stage_start(nr=None):
    """Begin a processing stage and set up the module-level stage state.

    *nr* is the stage number to start; when omitted, the stage after the
    current one is started. The stage reads the file produced by stage
    ``nr - 1`` and writes the file of stage ``nr``.

    Side effects: sets ``the_stage``; loads ``column_names`` and
    ``column_index`` from the headers recorded for the previous stage;
    resets the error list of this stage, ``nrows`` and ``levels``; opens
    ``sourcef``/``targetf`` and wraps them in ``data`` (csv reader) and
    ``new_data`` (csv writer). The handles are closed by ``stage_end``.

    Raises ``KeyError`` if no headers were recorded for the previous stage.
    """
    global the_stage
    global sourcef, targetf
    global column_names, column_index
    global data, new_data
    global nrows, levels

    if nr is None:
        the_stage += 1
    else:
        the_stage = nr
    msg("===BEGIN==STAGE {}=====".format(the_stage))

    # Work on a mutable copy: stages rename, insert and delete headers.
    column_names = list(start_column_names[the_stage - 1])
    column_index = dict((name, e) for (e, name) in enumerate(column_names))
    sfile = filepat.format(the_stage - 1)
    tfile = filepat.format(the_stage)
    msg('Column names before:\n{}'.format(', '.join(column_names)))
    msg('Reading participle text data stage {} ({} => {}) ...'.format(the_stage, sfile, tfile))

    # Start with a clean slate, so re-running a stage does not pile up errors.
    errors[the_stage] = collections.defaultdict(list)

    sourcef = infile(sfile)
    try:
        targetf = outfile(tfile)
    except OSError:
        # Do not leave the source handle dangling when the target cannot be opened.
        sourcef.close()
        raise
    data = csv.reader(sourcef)
    new_data = csv.writer(targetf)
    nrows = 0
    levels = collections.defaultdict(collections.Counter)
def stage_end(last=False):
    """Finish the current stage and report on the file it produced.

    Records the current ``column_names`` as the headers of this stage,
    refreshes the module-level ``column_index``, closes the stage's source
    and target files, and then re-reads the freshly written stage file to

    * count the distinct values (levels) per column, accumulated in ``levels``;
    * count how many rows have which number of fields;
    * when *last* is true, copy the rows, preceded by a header row, to the
      ``_final`` file.

    The level summary is printed to stdout; row statistics and the errors
    collected during this stage go to stderr through ``msg``.
    """
    global targetf
    # Fix: this used to read ``global columnindex``, so the assignment below
    # only created a local and the module-level index was never refreshed.
    global column_index

    start_column_names[the_stage] = tuple(column_names)
    column_index = dict((name, e) for (e, name) in enumerate(column_names))
    targetf.close()
    sourcef.close()

    tfile = filepat.format(the_stage)
    ffile = filepat.format('_final')
    ncols = len(column_names)
    row_lengths = collections.Counter()

    # Read back what the stage has just written.
    targetf = infile(tfile)
    finalf = None
    try:
        final_writer = None
        if last:
            finalf = outfile(ffile)
            final_writer = csv.writer(finalf)
            final_writer.writerow(column_names)
        for row in csv.reader(targetf):
            row_lengths[len(row)] += 1
            for (e, field) in enumerate(row):
                levels[e][field] += 1
            if final_writer is not None:
                final_writer.writerow(row)
    finally:
        # Close the handles even when reading or writing fails half-way.
        targetf.close()
        if finalf is not None:
            finalf.close()

    show_n = 20
    for e in sorted(levels):
        valueset = levels[e]
        lnv = len(valueset)
        examples = ' '.join(str(x) for x in sorted(valueset)[0:show_n])
        rest = ' ... {} more'.format(lnv - show_n) if lnv > show_n else ''
        # A row with more fields than headers must not crash the report;
        # the mismatch itself is flagged as ERROR in the row statistics below.
        name = column_names[e] if e < ncols else 'column {}'.format(e)
        print("{:<15} has {:>5} levels ({}{})".format(name, lnv, examples, rest))

    msg("{:>5} total rows x {:>3} header columns".format(nrows, ncols))
    # Most frequent row length first.
    for (rl, rw) in sorted(row_lengths.items(), key=lambda x: (-x[1], x[0])):
        msg("{:>5} body rows x {:>3} actual columns ({})".format(rw, rl, 'OK' if rl == ncols else 'ERROR'))
    if errors:
        my_errors = errors[the_stage]
        for cat in sorted(my_errors):
            msg("Error: {} ({}x)".format(cat, len(my_errors[cat])))
            for (r, f) in my_errors[cat]:
                msg("\t{:>5}: {}".format(r, f))
    msg("===END====STAGE {}=====".format(the_stage))
def show_col(colname):
    """Print the values of column *colname* with their frequencies.

    The values come from ``levels``; the most frequent value is listed
    first, ties are ordered by value. The module-level ``column_index`` is
    rebuilt from ``column_names`` before the column is looked up.
    """
    global column_index
    column_index = {name: pos for (pos, name) in enumerate(column_names)}
    print("Start levels of column {}".format(colname))
    counts = levels[column_index[colname]]
    for val in sorted(counts, key=lambda v: (-counts[v], v)):
        print("{:<10}: {:>5}x".format(val, counts[val]))
    print("End levels of column {}".format(colname))
# Stage 1: give the generic columns meaningful names and drop unused columns.
stage_start(nr=1)

# Old header -> new header.
rencols = {
    'C02': 'state',
    'C03': 'k',
    'C04': 'domain',
    'C06': 'phrf',
    'C07': 'carc',
    'C09': 'conj',
    'C11': 'neg',
    'C12': 'vstem',
    'C14': 'lex',
    'C16': 'clause',
    'C17': 'comment',
}
# Columns to delete, identified by their old headers.
delcols = '''C05 C17'''

for (old, new) in rencols.items():
    column_names[column_index[old]] = new

# Delete from right to left, so that the positions still to be handled stay valid.
delcols_sorted = [column_index[x] for x in delcols.split()]
delcols_sorted.sort(reverse=True)
for dc in delcols_sorted:
    column_names[dc:dc + 1] = []

for row in data:
    for dc in delcols_sorted:
        row[dc:dc + 1] = []
    nrows += 1
    new_data.writerow(row)

stage_end()
===BEGIN==STAGE 1===== Column names before: C01, C02, C03, C04, C05, C06, C07, C08, C09, C10, C11, C12, C13, C14, C15, C16, C17 Reading participle text data stage 1 (participia_compleet_r0.csv => participia_compleet_r1.csv) ... 9651 total rows x 15 header columns 9651 body rows x 15 actual columns (OK) ===END====STAGE 1=====
C01 has 1294 levels (Aa Aa' Aav Ab'-2-v Ab-1- Ab-1-iv Ab-1-v Ab-2- Ab-2-v Ab-d2-v Ab-dj-v Ab-dj2-v Abv Ad Ad' Ad'-j-v Ad'v Ad-Pj- Ad-Pj-'v Ad-Pj-v ... 1274 more) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) domain has 98 levels (? ?? ??? ???Q ???QQQ ??N ??ND ??NDN ??NDND ??NQ ??NQQ ??Q ??QNQ ??QQ ?N ?ND ?NQ ?NQN ?NQND ?NQNQ ... 78 more) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 62 levels ( <D <D->CR <D-C <D-H <L->CR <L-DJ <L-H <L-KJ >CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more) C10 has 16 levels ( >CR >M B-H C DJ H J<N/->CR K->CR KJ KMW LM<N MN MN-DJ W W-/>CR) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif) C13 has 2 levels (act pas) lex has 906 levels (<BD[ <BR=[ <BR[ <CQ[ <CR[ <CT[ <DH=[ <DH[ <DP[ <DR==[ <DR[ <FH[ <FR[ <GB[ <JN[ <KR[ <LH[ <LL[ <LM[ <LP[ ... 886 more) C15 has 9651 levels (01 GEN 01,02.07 01 GEN 01,06.07 01 GEN 01,11.07 01 GEN 01,11.14 01 GEN 01,12.07 01 GEN 01,12.14 01 GEN 01,21.07 01 GEN 01,26.07 01 GEN 01,28.07 01 GEN 01,29.07 01 GEN 01,29.14 01 GEN 01,30.07 01 GEN 02,09.07 01 GEN 02,10.07 01 GEN 02,11.07 01 GEN 02,13.07 01 GEN 02,14.07 01 GEN 03,05.07 01 GEN 03,05.14 01 GEN 03,06.07 ... 
9631 more) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Strip `w` and `>` from the new typ1 and typ3 columns and fill the new has_wav and has_alef columns (stored as `t1_wav` and `t1_alef`) accordingly.
stage_start(nr=2)
# Stage 2: clean the lexeme, expand the passage label into seven columns and
# split the manual annotation field into typ1, t1_wav, t1_alef, typ2, typ3.
bookabbs = set()
typ1 = column_index['C01']
bookl = column_index['C15']
lex = column_index['lex']

for row in data:
    # --- lexeme: is expected to end in '[', which is removed
    lexval = row[lex]
    if lexval[-1:] != '[':
        error('lexeme not ending on [', nrows, lexval)
    row[lex] = lexval.strip('[')

    # --- passage label: replace the single field by seven fields
    match = passage_pat.match(row[bookl])
    if match is None:
        # the row keeps its single passage field; stage_end will show the
        # deviating row length
        error('Unrecognized passage', nrows, row[bookl])
    else:
        (booknum, bookabb, chapter, verse, seqnum) = match.groups()
        bookabbs.add(bookabb)
        (book_name, book_acro) = books[bookabb]
        chapnum = int(chapter)
        versenum = int(verse)
        vlabel = '{}{}.{}'.format(book_acro, chapnum, versenum)
        row[bookl:bookl + 1] = [
            vlabel,
            int(booknum),
            book_name,
            book_acro,
            chapnum,
            versenum,
            int(seqnum),
        ]

    # --- manual field: drop quotes, split on '-' into exactly three parts
    if row[typ1].count('-') > 2:
        error('More than 2 - in manual field', nrows, row[typ1])
    unquoted = row[typ1].replace("'", '').replace('"', '')
    # the padding guarantees that there are at least three parts
    row[typ1:typ1 + 1] = (unquoted + '-----').split('-')[:3]

    # move every trailing 'v' of the first part to the end of the third part
    stripped = row[typ1].rstrip('v')
    row[typ1 + 2] += 'v' * (len(row[typ1]) - len(stripped))
    row[typ1] = stripped

    if row[typ1].startswith('n') and row[typ1] != 'n' and row[typ1][1] not in {'w', '>'}:
        error('n followed by stray characters in manual field', nrows, '-'.join(row[typ1:typ1 + 2]))

    # 'w' and '>' in the first or third part become two flag columns, which
    # are inserted directly after the first part ...
    has_wav = int('w' in row[typ1] or 'w' in row[typ1 + 2])
    has_alef = int('>' in row[typ1] or '>' in row[typ1 + 2])
    row[typ1:typ1 + 1] = [
        row[typ1].replace('w', '').replace('>', ''),
        has_wav,
        has_alef,
    ]
    # ... so the third part has moved to position typ1 + 4
    row[typ1 + 4] = row[typ1 + 4].replace('w', '').replace('>', '')

    # normalise a few irregular values of the first part
    if row[typ1] == 'N':
        row[typ1] = 'n'
    elif row[typ1] == 'b2c':
        row[typ1] = 'b'
    elif row[typ1].startswith('bijzin?nVv'):
        row[typ1] = 'n'
        row[typ1 + 3] = 'Vv'

    new_data.writerow(row)
    nrows += 1

# Headers: same expansions as applied to the rows above.
column_names[bookl:bookl + 1] = ('vlabel', 'booknum', 'bookname', 'bookacro', 'chapter', 'verse', 'seqnum')
column_names[typ1:typ1 + 1] = ('typ1', 't1_wav', 't1_alef', 'typ2', 'typ3')
stage_end()
===BEGIN==STAGE 2===== Column names before: C01, state, k, domain, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, C15, clause Reading participle text data stage 2 (participia_compleet_r1.csv => participia_compleet_r2.csv) ... 9651 total rows x 25 header columns 9651 body rows x 25 actual columns (OK) ===END====STAGE 2=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2 has 367 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 347 more) typ3 has 130 levels ( : :a :e D Dv Dv: Dv:a E E:a Ei: Ev Ev:c Ev:s Nl O O: O:a O:a/qa O:e ... 110 more) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) domain has 98 levels (? ?? ??? ???Q ???QQQ ??N ??ND ??NDN ??NDND ??NQ ??NQQ ??Q ??QNQ ??QQ ?N ?ND ?NQ ?NQN ?NQND ?NQNQ ... 78 more) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 62 levels ( <D <D->CR <D-C <D-H <L->CR <L-DJ <L-H <L-KJ >CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more) C10 has 16 levels ( >CR >M B-H C DJ H J<N/->CR K->CR KJ KMW LM<N MN MN-DJ W W-/>CR) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif) C13 has 2 levels (act pas) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 
19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Move a leading `a` in typ2 into a new feature has_a after typ2 and remove it from typ2, which is then called typ2strip-a.
Count the `v`-s in typ3, add this as a new field #_v after typ3, and strip the `v`-s from column typ3, which is then called typ3strip-v.
After processing, analyse the column levels again, especially column typ2strip-a.
# Stage 3: level the domain column, split a leading 'a' off typ2 and count
# and strip the 'v'-s of typ3.
stage_start(nr=3)

# Frequencies of the stripped typ2 values, per typ1 value (p, h and k only).
levels_0 = collections.defaultdict(collections.Counter)
typ1 = column_index['typ1']
typ2 = column_index['typ2']
typ3 = column_index['typ3']
dom = column_index['domain']
dlvs = ['D', 'N', 'Q', '?']

for row in data:
    # domain: per domain letter 1 if it is the last character, 0.5 if it is
    # the last but one, else 0; followed by the length of the domain string
    d = row[dom]
    lend = len(d)
    final = d[-1]
    prefinal = d[-2] if lend > 1 else None
    these_dlvs = [1 if final == lv else 0.5 if prefinal == lv else 0 for lv in dlvs]
    row[dom:dom + 1] = these_dlvs + [lend]

    # typ2: only when typ1 is h or k, leading 'a'-s are stripped and flagged
    f2 = row[typ2]
    f3 = 0
    if row[typ1] in {'h', 'k'} and f2.startswith('a'):
        f2 = f2.lstrip('a')
        f3 = 1
    row[typ2] = f2
    row.insert(typ2 + 1, f3)

    # typ3 has moved one position to the right by the insertion above
    nvs = row[typ3 + 1].count('v')
    row[typ3 + 1] = row[typ3 + 1].replace('v', '')
    row.insert(typ3 + 2, nvs)

    if row[typ1] in {'p', 'h', 'k'}:
        levels_0[row[typ1]][row[typ2]] += 1
    new_data.writerow(row)
    nrows += 1

# Headers: same expansions as applied to the rows, in the same order.
column_names[dom:dom + 1] = ['dom_{}'.format(x) for x in dlvs] + ['dom_emb']
column_names[typ2:typ2 + 1] = ('typ2strip-a', 't2_a')
column_names[typ3 + 1:typ3 + 2] = ('typ3strip-v', 't3#v')
stage_end()
#show_col('typ1')
#show_col('typ3strip-v')
#for val0 in sorted(levels_0):
# print("Levels of typ2strip-a if typ1 is {}:".format(val0))
# for (val, occ) in sorted(levels_0[val0].items(), key=lambda x: (-x[1], x[0])):
# print("\t{:<5} occurs {:>5}x".format(val, occ))
===BEGIN==STAGE 3===== Column names before: typ1, t1_wav, t1_alef, typ2, typ3, state, k, domain, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 3 (participia_compleet_r2.csv => participia_compleet_r3.csv) ... 9651 total rows x 31 header columns 9651 body rows x 31 actual columns (OK) ===END====STAGE 3=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_a has 2 levels (0 1) typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more) t3#v has 6 levels (0 1 2 3 4 5) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 62 levels ( <D <D->CR <D-C <D-H <L->CR <L-DJ <L-H <L-KJ >CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more) C10 has 16 levels ( >CR >M B-H C DJ H J<N/->CR K->CR KJ KMW LM<N MN MN-DJ W W-/>CR) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif) C13 has 2 levels (act pas) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 
19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Make levels for typ2strip-a, but simplify the values.
Only for the cases where typ1 is `p`.
# Stage 4: add one indicator column per simplified typ2 value.
stage_start(nr=4)

# Raw typ2 value = comma separated simplified value(s), one entry per line.
trans = '''
H = HNH
Hs = BLJ
> = EJN
>s = EJN
< = OWD
J = JC
P> = EJN
<s = OWD
PB = BLJ
1> = EJN
Js = JC
r> = EJN
r>s = EJN
>< = EJN,OWD
1>s = EJN
B = BLJ
Pb> = EJN
hB = BLJ
hJ = JC
'''
# Raw typ2 value -> set of simplified values.
trans_table = {}
for entry in trans.split('\n'):
    if entry == '':
        continue
    (raw, simple) = entry.strip().split('=')
    trans_table[raw.strip()] = set(simple.strip().split(','))

# All simplified values; each of them becomes a column.
t2_levels = set().union(*trans_table.values())
t2_level_sorted = sorted(t2_levels)
ll = len(t2_levels)

typ1 = column_index['typ1']
typ2 = column_index['typ2strip-a']

for row in data:
    if row[typ1] != 'p':
        # only rows with typ1 p are levelled; all others get zeroes
        flags = [0] * ll
    else:
        val = row[typ2]
        these_levels = trans_table.get(val)
        if these_levels is None:
            error('Unrecognized level for typ2strip-a', nrows, val)
            flags = ['?'] * ll
        else:
            flags = [int(x in these_levels) for x in t2_level_sorted]
    # the new columns go directly after typ2strip-a
    row[typ2 + 1:typ2 + 1] = flags
    nrows += 1
    new_data.writerow(row)

column_names[typ2 + 1:typ2 + 1] = list(map('t2_{}'.format, t2_level_sorted))
stage_end()
#show_col('typ3strip-v')
===BEGIN==STAGE 4===== Column names before: typ1, t1_wav, t1_alef, typ2strip-a, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 4 (participia_compleet_r3.csv => participia_compleet_r4.csv) ... 9651 total rows x 36 header columns 9651 body rows x 36 actual columns (OK) ===END====STAGE 4=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_BLJ has 2 levels (0 1) t2_EJN has 2 levels (0 1) t2_HNH has 2 levels (0 1) t2_JC has 2 levels (0 1) t2_OWD has 2 levels (0 1) t2_a has 2 levels (0 1) typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more) t3#v has 6 levels (0 1 2 3 4 5) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 62 levels ( <D <D->CR <D-C <D-H <L->CR <L-DJ <L-H <L-KJ >CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more) C10 has 16 levels ( >CR >M B-H C DJ H J<N/->CR K->CR KJ KMW LM<N MN MN-DJ W W-/>CR) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif) C13 has 2 levels (act pas) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 
19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Deal with verbal stems, column vstem and the next one, which gives the values `act` and `pas`.
If vstem has `qal` and the next one has `pas`, modify `qal` to `qalp`.
We use this stage for determining the levels, and later we do the actual levelling.
# Stage 5: merge the column after vstem into vstem and drop it.
stage_start(nr=5)
vstem = column_index['vstem']

for row in data:
    val = row[vstem]
    # take the next column out of the row; it only survives as the 'p' suffix
    val2 = row.pop(vstem + 1)
    if (val, val2) == ('qal', 'pas'):
        val = 'qalp'
    row[vstem] = val
    nrows += 1
    new_data.writerow(row)

# Drop the header of the removed column as well.
column_names[vstem + 1:vstem + 2] = []
stage_end()
#show_col('vstem')
===BEGIN==STAGE 5===== Column names before: typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 5 (participia_compleet_r4.csv => participia_compleet_r5.csv) ... 9651 total rows x 35 header columns 9651 body rows x 35 actual columns (OK) ===END====STAGE 5=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_BLJ has 2 levels (0 1) t2_EJN has 2 levels (0 1) t2_HNH has 2 levels (0 1) t2_JC has 2 levels (0 1) t2_OWD has 2 levels (0 1) t2_a has 2 levels (0 1) typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more) t3#v has 6 levels (0 1 2 3 4 5) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 62 levels ( <D <D->CR <D-C <D-H <L->CR <L-DJ <L-H <L-KJ >CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more) C10 has 16 levels ( >CR >M B-H C DJ H J<N/->CR K->CR KJ KMW LM<N MN MN-DJ W W-/>CR) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 
19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Deal with conjunctions, column conj and the next one. If there is a value in the next one, take that, otherwise the value in the conj column. Then apply a translation table.
The actual leveling occurs in the next stage.
# Stage 6: merge the conj column with its successor column and translate
# the resulting raw conjunction into a short level name.
stage_start(nr=6)
conj = column_index['conj']

# Translation table: raw conjunction string -> level name.
# The empty raw value gets the explicit level 'empty'.
# None marks conjunctions without a level of their own; they are written
# as an empty value.
trans_conj = {
    '': 'empty',
    'W': 'w',
    'H': 'h',
    'KJ': 'ki',
    '>CR': 'acr',
    'W-/H': 'h',
    '>M': 'im',
    'DJ': 'di',
    'K->CR': 'kacer',
    'W-/>M': 'im',
    'C': 'c',
    'W-/KJ': 'ki',
    'L-H': 'h',
    '>T->CR': 'acr',
    'K-H': 'h',
    'PN': 'pn',
    'W-/W': 'w',
    '<D': 'ad',
    'K-L-QBL/-DJ': 'di',
    '<L-H': 'h',
    '>W': 'ow',
    'KJ->M': 'im',
    '>XR/->CR': 'acr',
    'KJ-/>M': 'im',
    'LW': None,
    'W-/>T-H': 'h',
    'J<N/': None,
    'K-DJ': 'di',
    'L->CR': 'acr',
    'LM<N': None,
    'MN-H': 'h',
    'W-/>CR': 'acr',
    '<D->CR': 'acr',
    '<D-C': 'c',
    '<D-H': 'h',
    '<L->CR': 'acr',
    '<L-DJ': 'di',
    '<L-KJ': 'ki',
    '>CR-/W': 'w',
    '>L-H': 'h',
    '>T-H': 'h',
    'B-C': 'c',
    'B-H': 'h',
    'B-VRM/': None,
    'H-/W': 'w',
    'J<N/->CR': 'acr',
    'K-C': 'c',
    'K-PH/->CR': 'acr',
    'KJ-/LWL>': None,
    'KMW': None,
    'LWL>': None,
    'MN': None,
    'MN-DJ': 'di',
    'MN-L-BD/-H': 'h',
    'TXT/->CR': 'acr',
    'W-/<L-H': 'h',
    # no trailing blank here: 'h ' would count as a separate level next to 'h'
    'W-/>L-H': 'h',
    'W-/B-KL/-DJ': 'di',
    'W-/DJ': 'di',
    'W-/L->CR': 'acr',
    'W-/L-H': 'h',
    'W-/LW': None,
    'W-/W-/W': 'w',
}

for row in data:
    val = row[conj]
    val2 = row[conj+1]
    # a value in the successor column overrides the conj column
    if val2 != '':
        val = val2
    # a raw value that is missing from the table raises KeyError
    row[conj] = trans_conj[val] or ''
    del row[conj+1:conj+2]
    nrows += 1
    new_data.writerow(row)

# the header loses the successor column as well
del column_names[conj+1:conj+2]
stage_end()
===BEGIN==STAGE 6===== Column names before: typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 6 (participia_compleet_r5.csv => participia_compleet_r6.csv) ... 9651 total rows x 34 header columns 9651 body rows x 34 actual columns (OK) ===END====STAGE 6=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_BLJ has 2 levels (0 1) t2_EJN has 2 levels (0 1) t2_HNH has 2 levels (0 1) t2_JC has 2 levels (0 1) t2_OWD has 2 levels (0 1) t2_a has 2 levels (0 1) typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more) t3#v has 6 levels (0 1 2 3 4 5) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more) C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more) conj has 14 levels ( acr ad c di empty h h im kacer ki ow pn w) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 
19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Deal with carc.
There should be no empty carcs.
In some cases we do not take the value in the carc column, but the value in the next column:
- 200 or 201: take the value of the next column (which may be empty).
There should not be other three digit values starting with 2.
The result is empty, 0, or a two or three digit number.
Here are the rules for processing the resulting values:
- (empty value, coming from the next column): carc1 = chain, no levels in other columns
- 0: carc1 = txto, no levels in other columns
- 10 - 16: carc1 = rela, carc2 = second digit, no level in carc3
- 50 - 74: carc1 = infc, no levels in other columns
- any other two digit value: reported as an error, no levels in any column
- 999: carc1 = q, no levels in other columns
- any other three digit value: carc1, carc2, carc3 = first, second, third digit
stage_start(nr=7)
# Stage 7 body: replace the carc column and its successor column by the
# three columns carc1, carc2, carc3.
carc = column_index['carc']


def _split_carc(code, rownum):
    """Return (carc1, carc2, carc3) for an already resolved carc code.

    rownum is only used for reporting a strange two digit code via error().
    """
    if code == '':
        return ('chain', '', '')
    if code == '0':
        return ('txto', '', '')
    number = int(code)
    if 10 <= number <= 16:
        return ('rela', code[1], '')
    if 50 <= number <= 74:
        return ('infc', '', '')
    if len(code) == 2:
        error('Strange carc with two digits', rownum, code)
        return ('', '', '')
    if code == '999':
        return ('q', '', '')
    # remaining codes: one digit per output column
    return (code[0], code[1], code[2])


for row in data:
    code = str(row[carc])
    if code == '':
        error('Empty carc', nrows, code)
    if len(code) == 3 and code[0] == '2':
        # only 200 and 201 are expected in the 200 range
        if code[1:] not in {'00', '01'}:
            error('Strange carc in 200 range', nrows, code)
        # for the 200 range the real code lives in the successor column
        code = str(row[carc+1])
    row[carc:carc+2] = _split_carc(code, nrows)
    nrows += 1
    new_data.writerow(row)

# two header columns make room for three
column_names[carc:carc+2] = ('carc1', 'carc2', 'carc3')
stage_end()
===BEGIN==STAGE 7===== Column names before: typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 7 (participia_compleet_r6.csv => participia_compleet_r7.csv) ... 9651 total rows x 35 header columns 9651 body rows x 35 actual columns (OK) ===END====STAGE 7=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_BLJ has 2 levels (0 1) t2_EJN has 2 levels (0 1) t2_HNH has 2 levels (0 1) t2_JC has 2 levels (0 1) t2_OWD has 2 levels (0 1) t2_a has 2 levels (0 1) typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more) t3#v has 6 levels (0 1 2 3 4 5) state has 4 levels (: :a :c :e) k has 4 levels ( #NAAM? +K +K=) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP) carc1 has 12 levels (1 3 4 5 6 7 8 chain infc q rela txto) carc2 has 9 levels ( 0 1 2 3 4 5 6 7) carc3 has 9 levels ( 0 1 2 3 4 5 6 7) conj has 14 levels ( acr ad c di empty h h im kacer ki ow pn w) neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/) vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 
19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)
Now deal with several features of which we isolate known levels, strip those parts from the value, and leave the remainder in its original column.
If a column never has a remainder, we may remove that column as well:
in the feature spec below, set keep to False.
# Stage 8: replace columns by one 0/1 indicator column per known level.
stage_start(nr=8)


def _levels(spec, reverse=True):
    """Return the distinct whitespace-separated levels in spec, sorted.

    Duplicates are dropped, so that no indicator column is generated twice.
    """
    return sorted(set(spec.split()), reverse=reverse)


# Feature spec per column:
#   lvs    - the levels that each get their own 0/1 column
#   keep   - False: the column holds exactly one level (or none); it is
#                   replaced by its indicator columns
#            True:  the levels are parts of the value; they are stripped from
#                   it and the remainder stays in the original column
#   lvname - optional prefix for the indicator columns (default: column name)
#
# The features are listed from the rightmost column to the leftmost one.
# The columns are expanded in place, using the positions in column_index,
# so a column must be expanded before any column to its left.
# NOTE(review): this relies on column_index not changing during the stage.
features = collections.OrderedDict((
    ('vstem', {
        'lvs': _levels('''qal qalp hif nif piel peal pual hit hof haf pael htpa hsht htpe pasq tif shaf'''),
        'keep': False,
    }),
    ('neg', {
        'lvs': _levels('''>JN/ >L= BLJ/ L> MN->JN/'''),
        'keep': False,
    }),
    ('conj', {
        'lvs': _levels('''acr ad c di empty h im kacer ki ow pn w'''),
        'keep': False,
    }),
    ('carc3', {
        'lvs': _levels('''0 1 2 3 4 5 6 7'''),
        'keep': False,
    }),
    ('carc2', {
        'lvs': _levels('''0 1 2 3 4 5 6 7'''),
        'keep': False,
    }),
    ('carc1', {
        'lvs': _levels('''1 3 4 5 6 7 8 chain infc q rela txto'''),
        'keep': False,
    }),
    ('phrf', {
        'lvs': _levels('''AdjP AdvP DPrP NP PP PPrP PrNP VP'''),
        'keep': False,
    }),
    ('k', {
        'lvs': _levels('''#NAAM? +K +K='''),
        'keep': False,
    }),
    ('state', {
        'lvs': _levels(''': :a :c :e'''),
        'keep': False,
    }),
    ('typ3strip-v', {
        'lvs': _levels('''V PC S O E D Nl Nhl''', reverse=False),
        'keep': True,
        'lvname': 't3',
    }),
))

colnums = [column_index[feat] for feat in features]
clevels = [features[feat]['lvs'] for feat in features]
keep = [features[feat]['keep'] for feat in features]

for row in data:
    for (cn, lvs, kept) in zip(colnums, clevels, keep):
        val = row[cn]
        if kept:
            # the levels are parts of the value: flag each part that occurs
            # and strip it, leaving the remainder in the column
            flags = []
            for lv in lvs:
                if lv in val:
                    val = val.replace(lv, '')
                    flags.append(1)
                else:
                    flags.append(0)
            row[cn:cn+1] = [val] + flags
        else:
            # the value is a single level: compare it as a whole, otherwise
            # e.g. 'qalp' would also set the 'qal' indicator and every state
            # would set the ':' indicator; stray surrounding blanks are ignored
            whole = val.strip()
            row[cn:cn+1] = [1 if whole == lv else 0 for lv in lvs]
    new_data.writerow(row)
    nrows += 1

# expand the header in the same way and in the same order as the rows
for feat in features:
    spec = features[feat]
    cn = column_index[feat]
    lvname = spec.get('lvname', feat)
    level_names = ['{}_{}'.format(lvname, x) for x in spec['lvs']]
    column_names[cn:cn+1] = ([feat] if spec['keep'] else []) + level_names
stage_end(last=True)
#for feat in features:
#    if features[feat]['keep']: show_col(feat)
===BEGIN==STAGE 8===== Column names before: typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc1, carc2, carc3, conj, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause Reading participle text data stage 8 (participia_compleet_r7.csv => participia_compleet_r8.csv) ... 9651 total rows x 112 header columns 9651 body rows x 112 actual columns (OK) ===END====STAGE 8=====
typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r) t1_wav has 2 levels (0 1) t1_alef has 2 levels (0 1) typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more) t2_BLJ has 2 levels (0 1) t2_EJN has 2 levels (0 1) t2_HNH has 2 levels (0 1) t2_JC has 2 levels (0 1) t2_OWD has 2 levels (0 1) t2_a has 2 levels (0 1) typ3strip-v has 26 levels ( : :/Hit3 :/qa :` :a :a/qa :a/qa1 :c :c/qa :e :s ; i i: i:a i:c n n: n:a ... 6 more) t3_D has 2 levels (0 1) t3_E has 2 levels (0 1) t3_Nhl has 2 levels (0 1) t3_Nl has 2 levels (0 1) t3_O has 2 levels (0 1) t3_PC has 2 levels (0 1) t3_S has 2 levels (0 1) t3_V has 2 levels (0 1) t3#v has 6 levels (0 1 2 3 4 5) state_:e has 2 levels (0 1) state_:c has 2 levels (0 1) state_:a has 2 levels (0 1) state_: has 1 levels (1) k_+K= has 2 levels (0 1) k_+K has 2 levels (0 1) k_#NAAM? has 2 levels (0 1) dom_D has 3 levels (0 0.5 1) dom_N has 3 levels (0 0.5 1) dom_Q has 3 levels (0 0.5 1) dom_? 
has 3 levels (0 0.5 1) dom_emb has 7 levels (1 2 3 4 5 6 7) phrf_VP has 2 levels (0 1) phrf_PrNP has 2 levels (0 1) phrf_PPrP has 2 levels (0 1) phrf_PP has 2 levels (0 1) phrf_NP has 2 levels (0 1) phrf_DPrP has 2 levels (0 1) phrf_AdvP has 2 levels (0 1) phrf_AdjP has 2 levels (0 1) carc1_txto has 2 levels (0 1) carc1_rela has 2 levels (0 1) carc1_q has 2 levels (0 1) carc1_infc has 2 levels (0 1) carc1_chain has 2 levels (0 1) carc1_8 has 2 levels (0 1) carc1_7 has 2 levels (0 1) carc1_6 has 2 levels (0 1) carc1_5 has 2 levels (0 1) carc1_4 has 2 levels (0 1) carc1_3 has 2 levels (0 1) carc1_1 has 2 levels (0 1) carc2_7 has 2 levels (0 1) carc2_6 has 2 levels (0 1) carc2_5 has 2 levels (0 1) carc2_4 has 2 levels (0 1) carc2_3 has 2 levels (0 1) carc2_2 has 2 levels (0 1) carc2_1 has 2 levels (0 1) carc2_0 has 2 levels (0 1) carc3_7 has 2 levels (0 1) carc3_6 has 2 levels (0 1) carc3_5 has 2 levels (0 1) carc3_4 has 2 levels (0 1) carc3_3 has 2 levels (0 1) carc3_2 has 2 levels (0 1) carc3_1 has 2 levels (0 1) carc3_0 has 2 levels (0 1) conj_w has 2 levels (0 1) conj_pn has 2 levels (0 1) conj_ow has 2 levels (0 1) conj_ki has 2 levels (0 1) conj_kacer has 2 levels (0 1) conj_im has 2 levels (0 1) conj_h has 2 levels (0 1) conj_h has 2 levels (0 1) conj_empty has 2 levels (0 1) conj_di has 2 levels (0 1) conj_c has 2 levels (0 1) conj_ad has 2 levels (0 1) conj_acr has 2 levels (0 1) neg_MN->JN/ has 2 levels (0 1) neg_L> has 2 levels (0 1) neg_BLJ/ has 2 levels (0 1) neg_>L= has 2 levels (0 1) neg_>JN/ has 2 levels (0 1) vstem_tif has 2 levels (0 1) vstem_shaf has 2 levels (0 1) vstem_qalp has 2 levels (0 1) vstem_qal has 2 levels (0 1) vstem_pual has 2 levels (0 1) vstem_piel has 2 levels (0 1) vstem_peal has 2 levels (0 1) vstem_pasq has 2 levels (0 1) vstem_pael has 2 levels (0 1) vstem_nif has 2 levels (0 1) vstem_htpe has 2 levels (0 1) vstem_htpa has 2 levels (0 1) vstem_hsht has 2 levels (0 1) vstem_hof has 2 levels (0 1) vstem_hit has 2 levels (0 1) 
vstem_hif has 2 levels (0 1) vstem_haf has 2 levels (0 1) lex has 906 levels (<BD <BR <BR= <CQ <CR <CT <DH <DH= <DP <DR <DR== <FH <FR <GB <JN <KR <LH <LL <LM <LP ... 886 more) vlabel has 6823 levels (1ch1.46 1ch10.3 1ch10.4 1ch10.5 1ch10.8 1ch11.10 1ch11.15 1ch11.2 1ch11.21 1ch11.23 1ch11.25 1ch11.39 1ch11.4 1ch11.5 1ch11.6 1ch12.1 1ch12.16 1ch12.19 1ch12.2 1ch12.24 ... 6803 more) booknum has 39 levels (1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 ... 19 more) bookname has 39 levels (1 chronicles 1 kings 1 samuel 2 chronicles 2 kings 2 samuel amos daniel deuteronomy esther exodus ezekiel ezra genesis habakkuk haggai hosea isaiah jeremiah job ... 19 more) bookacro has 39 levels (1ch 1ki 1sa 2ch 2ki 2sa amo dan deu est exo eze ezr gen hab hag hos isa jer job ... 19 more) chapter has 147 levels (1 10 101 102 103 104 105 106 107 108 109 11 110 111 112 113 114 115 116 118 ... 127 more) verse has 91 levels (1 10 11 115 118 12 121 13 130 132 14 140 141 15 150 157 158 16 162 165 ... 71 more) seqnum has 10 levels (14 21 28 35 42 49 56 63 7 70) clause has 8984 levels ([**XBRT <PC>] [>CH <Su>] [>L >XTH <Co>] [*<BD <PC>] [>DMTW <Ob>] [*<BR <PC>] [<LJNW <Co>] [TMJD <Ti>] [*<BR <PC>] [<LJW <Co>] [*<BR <PC>] [HW> W-CLC M>WT H->JC <Su>] [*<BR <PC>] [JM <Ob>] [*<BR <PC>] [L--SXR <Co>] [*<BRJM <PC>] [<L PNJ H-MLK <Co>] [*<BRJM <PC>] [<LJW <Co>] [*<BRJM <PC>] [>NXNW <Su>] [M-BJT_LXM JHWDH / <D JRKTJ HR >PRJM <sp><Co>] [*<BRJM <PC>] [B-->RY <Co>] [*<FH <PC>] [>LH <Ob>] [*<FH <PC>] [CMJM W->RY <Ob>] [*<FH <PC>] [CXR <Ob>] [<JPH <Ob>] [*<FH <PC>] [GDLWT <..>] [*<FH <PC>] [GDLWT <Ob>] [*<FH <PC>] [GDLWT <Ob>] [<D >JN XQR <Co>] [*<FH <PC>] [GDLWT <Ob>] [B-MYRJM <Lo>] [*<FH <PC>] [KL <Ob>] [*<FH <PC>] [KL >LH <Ob>] ... 8964 more)