#!/usr/bin/env python
# coding: utf-8

# In[1]:

# The '%pylab inline' magic is only available inside IPython/Jupyter, where
# get_ipython() is injected into the namespace. When this exported notebook
# is run as a plain script the name does not exist, so skip the magic rather
# than abort with a NameError.
try:
    get_ipython().run_line_magic('pylab', 'inline')
except NameError:
    pass


# In[2]:

import pandas as pd
from matplotlib import pyplot as plt
import xml.etree.ElementTree as ET


# ## Bring in the data

# In[3]:

# Maps each trop (cantillation mark) name used as a column/key in this script
# to the single Unicode character that is searched for inside a word's text.
tropnames = {
    'etnakhta': u'\u0591',
    'segol': u'\u0592',
    'shalshelet': u'\u0593',
    'katan': u'\u0594',
    'gadol': u'\u0595',
    'tipkha': u'\u0596',
    'revii': u'\u0597',
    'tsinnorit': u'\u0598',
    'pashta': u'\u0599',
    'yetiv': u'\u059a',
    'tevir': u'\u059b',
    'geresh': u'\u059c',
    'gereshmukdam': u'\u059d',
    'gershayim': u'\u059e',
    'karnepara': u'\u059f',
    'telishagedola': u'\u05a0',
    'pazer': u'\u05a1',
    'munakh': u'\u05a3',
    'mapakh': u'\u05a4',
    'merkha': u'\u05a5',
    'merkhakfula': u'\u05a6',
    'darga': u'\u05a7',
    'kadma': u'\u05a8',
    'telishaketana': u'\u05a9',
    'yerakhbenyomo': u'\u05aa',
    'sofpasuk': u'\u05c3',
    'zarka': u'\u05ae',
}
# Per Wikipedia: both marks have been wrongly named by Unicode. Zarqa/tsinnor
# corresponds to Unicode "Hebrew accent zinor", code point U+05AE (where
# "zinor" is a misspelled form of tsinnor), while tsinnorit maps to
# "Hebrew accent zarqa", code point U+0598. That is why 'zarka' is U+05AE and
# 'tsinnorit' is U+0598 in the table above.

# In[4]:

# Parse one XML file per sefer ('<sefer>.xml' in the working directory) and
# build counts[sefer][perek][pasuk] -> row dict. Each row holds one counter
# per trop name plus 'sefer', 'perek', 'pasuk' and 'wordcount'.
sfarim = ['bereshit', 'shmot', 'vayikra', 'bmidbar', 'dvarim']
counts = {}
for sefer in sfarim:
    sefercounts = counts[sefer] = {}
    root = ET.parse(sefer + '.xml').getroot()
    for perek in root.findall('.//c'):
        pereknum = int(perek.attrib['n'])
        perekcounts = sefercounts.setdefault(pereknum, {})
        for pasuk in perek.findall('v'):
            pasuknum = int(pasuk.attrib['n'])
            row = perekcounts.get(pasuknum)
            if row is None:
                # First time this pasuk number is seen in this perek.
                row = {name: 0 for name in tropnames}
                row['sefer'] = sefer
                row['pasuk'] = pasuknum
                row['perek'] = pereknum
                row['wordcount'] = 0
                perekcounts[pasuknum] = row
            words = pasuk.findall('w')
            # Accumulate, in case the same pasuk number appears again.
            row['wordcount'] += len(words)
            for wordobj in words:
                # Element.text is None when a <w> has no leading text (for
                # example when it starts with a child element); treat that
                # as an empty word instead of raising a TypeError on 'in'.
                word = wordobj.text or ''
                for trop, mark in tropnames.items():
                    # If the same trop ever appears more than once on one
                    # word, it is still counted only once here.
                    if mark in word:
                        row[trop] += 1


# In[5]:

flatcounts = [counts[s][pe][pa]
              for s in counts
              for pe in counts[s]
              for pa in counts[s][pe]]
df = pd.DataFrame(flatcounts)
df.set_index(['sefer', 'perek', 'pasuk'], inplace=True)
# Put the books in canonical order rather than whatever order the index has.
df = df.reindex(sfarim, level=0)


# In[6]:

# For pretty plot labels: the row position of perek 1, pasuk 1 of each sefer
# (the first sefer starts at row 0).
seferticklocs = [0] + [df.index.get_loc((name, 1, 1)) for name in sfarim[1:]]
seferlabels = ['Bereshit', 'Shemot', 'Vayikra', 'B\'midbar', 'D\'varim']


# ## Aggregate to inspect by sefer and perek

# In[7]:

sefergroup = df.groupby(level='sefer')
# groupby may return its keys in sorted (alphabetical) order, but the bar
# charts label the rows positionally with seferlabels, which is in canonical
# order. Reindex explicitly so rows and labels are guaranteed to line up.
sefermeans = sefergroup.mean().reindex(sfarim)
sefersums = sefergroup.sum().reindex(sfarim)


# In[8]:

# I didn't end up using these, but they're here
perekgroup = df.groupby(level=['sefer', 'perek'], sort=False)
perekmeans = perekgroup.mean()
perekmeans = perekmeans.reindex(sfarim, level=0)  # sort=False isn't working, so...
perekmeans.dropna(inplace=True)  # workaround for https://github.com/pydata/pandas/issues/9344
pereksums = perekgroup.sum()
pereksums = pereksums.reindex(sfarim, level=0)  # sort=False isn't working, so...
pereksums.dropna(inplace=True)  # workaround for https://github.com/pydata/pandas/issues/9344


def _label_torah_axis(title):
    """Apply the shared x ticks, title and y label to the current figure.

    Ticks are placed at ``seferticklocs`` and labelled with ``seferlabels``;
    the y label is always 'Average count per pasuk'.
    """
    plt.xticks(seferticklocs, seferlabels, fontsize='large')
    plt.title(title, fontsize='x-large')
    plt.xlabel('')
    plt.ylabel('Average count per pasuk', fontsize='large')


def _save_figure(stem, formats=('svg', 'png')):
    """Save the current figure as ``<stem>.<fmt>`` for each requested format."""
    for fmt in formats:
        plt.savefig(stem + '.' + fmt, transparent=True)


# ## Telishas

# In[38]:

# sefermeans is already one row per sefer, so no grouping argument is needed
# (the original passed by='sefer', which has no meaning for a bar plot).
ax = sefermeans[['telishaketana', 'telishagedola']].plot(kind='bar', figsize=(9, 6))
# other candidates: 'shalshelet', 'merkhakfula'
plt.xticks(range(len(seferlabels)), seferlabels, fontsize='large', rotation=0)
plt.title('Telishas per pasuk by sefer', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Average count per pasuk', fontsize='large')
patches, _ = ax.get_legend_handles_labels()
ax.legend(patches, ['Telisha Ketana', 'Telisha Gedola'], loc='upper left', framealpha=0.9)
_save_figure('telisha_bar')


# In[39]:

(sefermeans['telishaketana'] / sefermeans['telishagedola']).plot(kind='bar', figsize=(9, 6))
plt.xticks(range(len(seferlabels)), seferlabels, fontsize='large', rotation=0)
plt.title('Ratio of telisha ketana to telisha gedola by sefer', fontsize='x-large')
plt.xlabel('')
plt.ylabel('Telisha Ketana:Telisha Gedola', fontsize='large')
_save_figure('telisha_ratios')


# ## Moving average plots

# In[9]:

# Centered 500-pasuk moving average of every column. pd.rolling_mean() no
# longer exists in pandas; DataFrame.rolling(...).mean() is its replacement.
rolling = df.rolling(500, center=True).mean()


# In[40]:

rolling.plot(y=['telishaketana', 'telishagedola'], figsize=(9, 6))
_label_torah_axis('Trop occurence across the Torah')
plt.legend(['Telisha Ketana', 'Telisha Gedola'], loc='upper left', framealpha=0.9)
_save_figure('telisha')


# In[41]:

rolling.plot(y=['munakh', 'katan', 'pashta', 'mapakh', 'revii'], figsize=(9, 6))
_label_torah_axis('Trop occurence across the Torah')
plt.legend(['Munakh', 'Katan', 'Pashta', 'Mapakh', 'Revi\'i'], loc='upper left', framealpha=0.9)
_save_figure('common_trop')


# In[36]:

# Trying to make an area plot to see how all these compare to the total
# number of munakhs. Experimental: no custom legend and nothing is saved.
rolling.plot(y=['katan', 'pashta', 'mapakh', 'revii'], kind='area', figsize=(9, 6))
_label_torah_axis('Trop occurence across the Torah')


# In[42]:

rolling.plot(y=['zarka', 'segol'], figsize=(9, 6))
_label_torah_axis('Trop occurence across the Torah')
# The second series is 'segol'; the label previously read 'Zegol'.
plt.legend(['Zarka', 'Segol'], loc='upper left', framealpha=0.9)
# Only the PNG is written for this figure (the SVG export was disabled).
_save_figure('zarkasegol', formats=('png',))


# In[43]:

rolling.plot(y=['tevir', 'darga'], figsize=(9, 6))
_label_torah_axis('Trop occurence across the Torah')
plt.legend(['Tevir', 'Darga'], loc='upper left', framealpha=0.9)
_save_figure('dargatevir')


# In[44]:

rolling.plot(y=['etnakhta'], figsize=(9, 6), legend=False)
_label_torah_axis('Etnakhta occurence across the Torah')
_save_figure('etnakhta')


# In[45]:

# Etnakhta on the left axis, word count on a secondary (right) axis.
ax = rolling[['etnakhta', 'wordcount']].plot(secondary_y=['wordcount'], figsize=(9, 6), legend=False)
plt.xticks(seferticklocs, seferlabels, fontsize='large')  # this doesn't seem to be setting the font size
ax.grid(axis='x')
ax.set_title('Etnakhta occurence and word count across the Torah', fontsize='x-large')
ax.set_xlabel('')
ax.set_ylabel('Average trop count per pasuk', fontsize='large')
ax.right_ax.set_ylabel('Average word count per pasuk', fontsize='large')
plt.legend((ax.get_lines()[0], ax.right_ax.get_lines()[0]),
           ['Etnakhta', 'Word count (right)'], loc='lower left', framealpha=0.9)
_save_figure('etnakhtawordcount')


# In[34]:

# Flag psukim with more than five words: 1.0 if wordcount > 5, else 0.0.
# (DataFrame.ix has been removed from pandas; a vectorised comparison gives
# the same float column without any indexer.)
df['fivewords'] = (df['wordcount'] > 5).astype(float)


# In[35]:

# Recompute the moving average so that it includes the new 'fivewords' column.
rolling = df.rolling(500, center=True).mean()


# In[37]:

rolling.plot(y=['etnakhta', 'fivewords'], figsize=(9, 6))
_label_torah_axis('Etnakhta occurence and word count across the Torah')
plt.legend(['Etnakhta', 'Psukim with > 5 words'], loc='lower left', framealpha=0.9)
_save_figure('etnakhtafivewords')


# ## Pasuk length/word counts

# In[111]:

# Mean of every column for each distinct pasuk length.
bywordcount = df.groupby('wordcount').aggregate('mean')
# axis must be passed by keyword; the positional form is gone in pandas 2.
bywordcount = bywordcount.drop('fivewords', axis=1)


# In[151]:

df['wordcount'].hist(bins=20)
plt.title('Wordcount distribution', fontsize='x-large')
plt.xlabel('Number of words', fontsize='large')
plt.ylabel('Number of psukim', fontsize='large')
# Save the current figure (the word-count histogram drawn just before this).
plt.savefig('wordcountdist.svg', transparent=True)
plt.savefig('wordcountdist.png', transparent=True)


# In[144]:

# Export a csv to bring into a (still-in-progress) D3 graph to turn trop on
# and off. The slice is positional (rows 1..31 of bywordcount), which is what
# the original bywordcount[1:32] did on an integer index; .iloc makes that
# explicit. NOTE(review): if rows for word counts 1..31 were intended, use
# .loc[1:31] instead - confirm against the data.
# The 'bywordcount' directory must already exist.
bywordcount.iloc[1:32].to_csv('bywordcount/bywordcount.csv')


# In[109]:

bywordcount.plot(y='munakh', figsize=(9, 6), xlim=(0, 32))


# ## Correlation tables

# In[46]:

# Correlate trop columns only. axis is passed by keyword because the
# positional form was removed in pandas 2.
dfforcorrs = df.drop(['wordcount', 'fivewords'], axis=1)


# In[20]:

dfforcorrs.corr().to_csv('rawcorr.csv')


# In[21]:

# pd.rolling_mean() no longer exists in pandas; use DataFrame.rolling().mean().
rollingforcorrs = dfforcorrs.rolling(500, center=True).mean()
rollingforcorrs.corr().to_csv('rollingcorr.csv')


# ## Looking for anomalies
# The bare expressions below display their value in a notebook; when run as a
# script they are evaluated and discarded.

# In[12]:

df.loc['bmidbar'].query('etnakhta > 1')[['etnakhta']]


# In[23]:

# this seems to occur when there's a k'tiv/kri on the last word of a pasuk.
df.query('sofpasuk == 0')[['sofpasuk']]


# In[24]:

df.query('tsinnorit > 0')[['tsinnorit']]


# ## Histograms and other plots

# In[25]:

df['wordcount'].hist()


# In[26]:

perekmeans[['munakh', 'pashta']].hist()


# In[27]:

perekmeans[['telishaketana', 'telishagedola']].hist()


# In[28]:

# this is a little off from https://en.wikipedia.org/wiki/Munach#Total_occurrences
df['munakh'].sum()


# ## Output to CSV

# In[29]:

df.to_csv('trop.csv')