#!/usr/bin/env python
# coding: utf-8
#
#
#
#
#
# # CALAP
# **About CALAP**. CALAP stands for Computer-Assisted Linguistic Analysis of the Peshitta.
# CALAP was a [project](https://openaccess.leidenuniv.nl/handle/1887/10866) at the University of Leiden.
# The [Peshitta](http://en.wikipedia.org/wiki/Peshitta) is a collection of Syriac texts. According to Wikipedia it is the standard version of the Bible in churches of the Syriac tradition. Resources can be found on [peshitta.org](http://www.peshitta.org).
#
# The text we use below comes from the [Peshitta Institute Leiden](http://www.hum.leiden.edu/religion/research/peshitta-institute/peshitta-institute.html), and has been prepared as an EMDROS database, which is now held by the [ETCBC](http://www.godgeleerdheid.vu.nl/etcbc).
#
# From there it has been converted to [LAF](http://www.iso.org/iso/catalogue_detail.htm?csnumber=37326)
# by Dirk Roorda, and this notebook accesses this LAF data by means of
# [LAF-Fabric](http://laf-fabric.readthedocs.org/en/latest/).
#
# The LAF-data of the CALAP project has been archived at DANS:
# [DOI 10.17026/dans-zv9-w9d2](http://dx.doi.org/10.17026/dans-zv9-w9d2)
# # Text from features
# Here comes the plain text of the CALAP data.
#
# The CALAP database only contains the surface consonants as textual representation.
# In[1]:
import sys
import collections
from etcbc.lib import Transcription
from laf.fabric import LafFabric
# Create the LAF-Fabric processor object; everything below works through it.
fabric = LafFabric()
# In[2]:
# Load the data needed by this notebook.
# NOTE(review): the positional arguments look like (source, annotation package,
# task name, load specification), with '--' meaning "no extra annotation
# package" -- confirm against the LAF-Fabric API documentation.
fabric.load('calap', '--', 'plain', {
# Neither the node nor the edge XML identifier mapping is requested.
"xmlids": {"node": False, "edge": False},
# Two-part tuple of whitespace-separated feature names; the second part is
# empty (presumably node features first, then edge features -- TODO confirm).
"features": ('''
otype
surface_consonants
psp
book chapter verse verse_label
''',''),
# Ask for the primary data as well; it is read further down via P.data(...).
"primary": True,
})
# Execute the code string that LAF-Fabric supplies in `fabric.localnames`, with
# the variable name 'fabric' filled in. The names used below without being
# defined in this file (F, NN, P, outfile, my_file, close) presumably come
# into scope through this step.
exec(fabric.localnames.format(var='fabric'))
# In[3]:
# Write the Syriac rendering of every word to a single file, each followed by
# a space, and collect the distinct characters that occur in the
# surface_consonants values along the way.
plain_file = outfile("calap_plain.txt")
tr = Transcription()
catalog = set()
for word_node in F.otype.s('word'):
    consonants = F.surface_consonants.v(word_node)
    # Adds each character of the value to the catalog.
    catalog.update(consonants)
    plain_file.write(tr.to_syriac(consonants) + ' ')
plain_file.close()
# Show the inventory of characters encountered, in sorted order.
print(','.join(sorted(catalog)))
# This file does not have newlines; it is a blob of consonant transcriptions for each word, separated by spaces.
# ## Passage indicators
# If you want books, chapters and verses marked, you can achieve it in the following way:
# In[4]:
# Write the text once more, now with passage markers: every book, chapter and
# verse node starts a new line with its marker, and each word is written as its
# Syriac rendering followed by a space. Nodes of any other type are ignored.
plainx_file = outfile("calap_plainx.txt")
the_book = the_chapter = the_verse = None
for node in NN():
    node_type = F.otype.v(node)
    if node_type == "book":
        the_book = F.book.v(node)
        # Progress indicator on stderr; the carriage return rewrites the same line.
        sys.stderr.write("\r{:>6} {:<30}".format(node, the_book))
        plainx_file.write("\n{}".format(the_book))
    elif node_type == "chapter":
        the_chapter = F.chapter.v(node)
        plainx_file.write("\n{} {}".format(the_book, the_chapter))
    elif node_type == "verse":
        the_verse = F.verse.v(node)
        plainx_file.write("\n{}:{} ".format(the_chapter, the_verse))
    elif node_type == "word":
        plainx_file.write(tr.to_syriac(F.surface_consonants.v(node)) + ' ')
# Terminate the progress line.
sys.stderr.write("\n")
plainx_file.close()
# In order to show the Syriac text, you need to install a font that has glyphs for the Syriac Unicode characters (0700 - 074F).
# For example: Estrangelo Edessa from [Meltho](http://www.bethmardutho.org/index.php/resources/fonts.html).
# In[5]:
# Run `head -n 10` in a shell to show the start of calap_plainx.txt.
# The `{...}` part sits inside a plain string literal (not an f-string), so it
# is IPython's system() that is expected to evaluate it before running the
# command; my_file(...) presumably resolves the name to the output file's path.
get_ipython().system("head -n 10 {my_file('calap_plainx.txt')}")
# If you are in an environment where you do not have this font installed, see the screenshot at the top.
#
# ## Verse list
# We can get the text in a quite different way: just read it from the *primary data*.
#
# Let us do that per verse.
# In[6]:
# For every verse node write two lines: the verse label, then the verse text.
# The text is assembled from the pieces that P.data() yields for the node and
# is converted with the same Transcription object used earlier.
verse_file = outfile("calap_verses.txt")
for verse_node in F.otype.s('verse'):
    # P.data() yields pairs; only the second member (the text) is used.
    pieces = (txt for (_, txt) in P.data(verse_node))
    the_text = tr.to_syriac(''.join(pieces))
    the_verse = F.verse_label.v(verse_node)
    verse_file.write("{}\n{}\n".format(the_verse, the_text))
verse_file.close()
# In[7]:
# Run `head -n 10` in a shell to show the start of calap_verses.txt.
# The `{...}` part sits inside a plain string literal (not an f-string), so it
# is IPython's system() that is expected to evaluate it before running the
# command; my_file(...) presumably resolves the name to the output file's path.
get_ipython().system("head -n 10 {my_file('calap_verses.txt')}")
# ## Empty words
# In the BHS there are words that have an empty representation.
#
# Let us have a closer look at the CALAP data.
# Are there empty words?
# In[8]:
# Survey the words whose surface_consonants value is the empty string.
# They are grouped by (lexeme, part of speech); for each group the verse nodes
# in which such a word occurs are recorded, one entry per occurrence.

# Number of verse labels listed per group; longer lists are cut off and
# marked with ' ...'. The same limit drives both the cut and the marker.
MAX_SHOWN_VERSES = 5

ewords = collections.defaultdict(list)
verse = None
# Walk through verse and word nodes only, remembering the most recent verse
# node so that each empty word can be attributed to it.
for i in NN(test=F.otype.v, values=['verse', 'word']):
    if F.otype.v(i) == 'verse':
        verse = i
        continue
    text = F.surface_consonants.v(i)
    if text == '':
        # The lexeme is looked up through the feature API `F`, like every
        # other feature in this file.
        # NOTE(review): `F.lexeme` presumably exists only when the `lexeme`
        # feature is requested at load time; the load call in this file does
        # not list it -- confirm the feature name and request it there.
        lex = F.lexeme.v(i)
        pos = F.psp.v(i)
        ewords[(lex, pos)].append(verse)

# Report: most frequent groups first, ties broken by part of speech and then
# by lexeme.
for (item, occs) in sorted(ewords.items(), key=lambda x: (-len(x[1]), x[0][1], x[0][0])):
    # Only look up the labels that are actually going to be printed.
    shown_labels = [F.verse_label.v(j) for j in occs[:MAX_SHOWN_VERSES]]
    print("{:>6} x {:<15} = {:>10} in {}{}".format(
        len(occs),
        item[1],
        item[0],
        "; ".join(shown_labels),
        ' ...' if len(occs) > MAX_SHOWN_VERSES else '',
    ))
if not ewords:
    print("No empty words found")
# In[9]:
# Finish the task. `close` is not defined in this file; presumably it is the
# LAF-Fabric function brought into scope by the exec of `fabric.localnames`
# (TODO confirm what it finalizes).
close()
# In[ ]: