In [1]:
import glob
import re
import bs4
import cltk
import nltk
import json

import numpy as np
from collections import defaultdict, Counter
from nltk.tokenize.punkt import PunktLanguageVars

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram   

from sklearn.feature_extraction.text import TfidfVectorizer
from cltk.stop.latin.stops import STOPS_LIST

import itertools

stop_words = STOPS_LIST + ['quod', 'vel', 'sunt', 'hoc', 'vero', 'sit', 'sol', 'que', 'esse']

%matplotlib inline
In [2]:
from cltk.corpus.utils.importer import CorpusImporter

corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_text_latin_library')
corpus_importer.import_corpus('latin_pos_lemmata_cltk')
corpus_importer.import_corpus('latin_models_cltk')
INFO:CLTK:Pulling latest 'latin_text_latin_library' from 'https://github.com/cltk/latin_text_latin_library.git'.
ERROR:CLTK:Git pull of 'https://github.com/cltk/latin_text_latin_library.git' failed: ''Error when fetching: fatal: unable to access 'https://github.com/cltk/latin_text_latin_library.git/': Could not resolve host: github.com' returned with exit code 2'
INFO:CLTK:Pulling latest 'latin_pos_lemmata_cltk' from 'https://github.com/cltk/latin_pos_lemmata_cltk.git'.
ERROR:CLTK:Git pull of 'https://github.com/cltk/latin_pos_lemmata_cltk.git' failed: ''Error when fetching: fatal: unable to access 'https://github.com/cltk/latin_pos_lemmata_cltk.git/': Could not resolve host: github.com' returned with exit code 2'
INFO:CLTK:Pulling latest 'latin_models_cltk' from 'https://github.com/cltk/latin_models_cltk.git'.
ERROR:CLTK:Git pull of 'https://github.com/cltk/latin_models_cltk.git' failed: ''Error when fetching: fatal: unable to access 'https://github.com/cltk/latin_models_cltk.git/': Could not resolve host: github.com' returned with exit code 2'
In [113]:
"""This corpus comes from the Thesaurus Musicarum Latinarum.
If you want to replicate these results, you should grab the
corpus with a recursive wget or similar."""

corpus_files = glob.glob('corpus/html/*.html')
In [4]:
soups = [bs4.BeautifulSoup(open(file), 'lxml') for file in corpus_files]
In [137]:
def process_soup(soup):
    "Takes a bs4 BeautifulSoup object and returns a dict with document content and metadata."
    corpus_item = {}
    
    header = soup.find('p')

    original = " ".join([n for n in soup.findAll(text=True)])
    
    body = "".join([p.text for p in soup.findAll('p')[1:]])
    body = join_hyphens(body)  # must run before the newlines are stripped
    body = remove_newlines(body)
    body = remove_doublespace(body)
    body = remove_editorial_apparatus(body)
    
    century_re = re.compile(r'.*www\.chmtl\.indiana\.edu/tml/(\w+\W*\w+)/.*')
    
    try:
        comments = "".join(soup.findAll(text=lambda text: isinstance(text, bs4.Comment)))
        century = century_re.match(comments).groups()[0]
        corpus_item['century'] = century
    except AttributeError:
        # No century path found in the HTML comments
        corpus_item['century'] = 'nd'

    corpus_item['body'] = body.strip()
    corpus_item['id'] = soup.title.text.split(' ')[0]
    
    match = re.search(r"Author:\s(.*)", original)
    
    if match:
        result = match.group(1)
    else:
        result = ""
    
    corpus_item['author'] = result
    
    match = re.search(r"Title:\s(.*)", original)
    
    if match:
        result = match.group(1)
    else:
        result = ""
        
    corpus_item['title'] = result
    
    return corpus_item

def remove_editorial_apparatus(text):
    "Removes all text enclosed in square brackets."
    pattern = re.compile(r'\[.+?\]')
    return pattern.sub('', text)

def remove_newlines(text):
    "Replaces newlines with spaces."
    return text.replace('\n', ' ')

def remove_doublespace(text):
    "Removes any doublespaces."
    return text.replace('  ', ' ')

def join_hyphens(text):
    "Joins hyphens used at ends of lines."
    return text.replace('-\n', '')

def my_tokenize(document):
    "Tokenizes a document, represented as a string using Punkt."
    p = PunktLanguageVars()
    tokens = p.word_tokenize(document)
    tokens = [x.lower() for x in tokens if x not in (',', ';', '.', "'", '"',':',')','(', '|' , '||' )]
    return tokens
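
As a quick sanity check, here is an invented snippet (not from the corpus) run through the cleaning helpers and the tokenizer:

In [ ]:
# Invented example string, for illustration only
example = "Musica est [ed. cod. B] scientia bene modu-\nlandi, ut ait Augustinus."

cleaned = join_hyphens(example)
cleaned = remove_newlines(cleaned)
cleaned = remove_editorial_apparatus(cleaned)
cleaned = remove_doublespace(cleaned)

print(cleaned)
print(my_tokenize(cleaned))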
In [124]:
corpus = []

for soup in soups:
    try:
        corpus_item = process_soup(soup)    
        corpus.append(corpus_item)
    except:
        continue
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-124-2717f0198c2d> in <module>()
      1 corpus = []
      2 
----> 3 for soup in soups:
      4     try:
      5         corpus_item = process_soup(soup)

NameError: name 'soups' is not defined
In [19]:
# Save to disk 

with open('corpus.json', 'w') as fp:
    json.dump(corpus, fp)
In [3]:
# Load pre-parsed corpus

with open('corpus.json', 'r') as fp:
    corpus = json.load(fp)
In [4]:
len(corpus)
Out[4]:
807
In [5]:
lengths = [len(x['body']) for x in corpus]
plt.hist(lengths, bins=15)
plt.title('Histogram of document lengths in TML')
plt.xlabel('Length of doc (chars)')
plt.ylabel('Count')
plt.show()
In [14]:
# Write texts stripped of boilerplate for use externally 

import unidecode

for item in corpus:
    with open('./corpus/body/{}.txt'.format(item['id']), 'w') as f:
        f.write(unidecode.unidecode(item['body']))
In [6]:
def prepare_id_info_map(corpus):
    "Builds a dict mapping each treatise id to its metadata (everything but the body)."
    id_info_map = {}
    for item in corpus:
        item_copy = item.copy()
        del item_copy['body']
        id_info_map[item['id']] = item_copy
    return id_info_map
In [7]:
id_info_map = prepare_id_info_map(corpus)
In [8]:
id_info_map['AARIH1']
Out[8]:
{'author': 'Aaron, Petrus',
 'century': '16th',
 'id': 'AARIH1',
 'title': 'De institutione harmonica, liber primus'}

Topic Modeling with gensim

In [142]:
import gensim
from cltk.stem.lemma import LemmaReplacer
In [143]:
# Code that's useful for preparing the corpus for analysis with gensim.
# Vector space models sometimes perform better when stopwords etc. are filtered out
# I lemmatize the Latin for the fun of it. Note this prep() function is not used
# in the text reuse part of this notebook

lemmatizer = LemmaReplacer('latin')

def filter_stopwords(tokens):
    "Filters stopwords from a list of tokens."
    return [token for token in tokens if token not in stop_words]

def filter_shortwords(tokens, short_size=2):
    "Removes tokens of length short_size or less from a list of tokens."
    return [token for token in tokens if len(token) > short_size]

def prep(document):
    "A convenience function which applies a series of cleaning operations to a document represented as a string."
    tokenized = my_tokenize(document)
    stopped = filter_stopwords(tokenized)
    shorted = filter_shortwords(stopped)
    lemmatized = lemmatizer.lemmatize(shorted)
    return lemmatized
In [144]:
documents = [prep(doc['body']) for doc in corpus]
In [145]:
dictionary = gensim.corpora.Dictionary(documents)
In [29]:
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
len(bow_corpus)
Out[29]:
807
In [30]:
# Save the sparse matrix corresponding to the BoW representation of the corpus

gensim.corpora.MmCorpus.serialize('tml.mm', bow_corpus)
In [31]:
# Retrieve the sparse matrix corresponding to the BoW representation of the corpus

bow_corpus = gensim.corpora.MmCorpus('tml.mm')
In [32]:
# Make a tf-idf model for the corpus
tfidf = gensim.models.TfidfModel(bow_corpus, normalize=True)

# Get vectors full of tf-idf scores for the words in the BoW representation
tfidf_corpus = tfidf[bow_corpus]
In [33]:
# Now do LSI with that 

lsi = gensim.models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=150)
In [41]:
lsi.print_topics(10)
Out[41]:
['0.141*"tonus" + 0.137*"longus" + 0.135*"brevis" + 0.131*"proportio" + 0.128*"diapente" + 0.122*"diapason" + 0.120*"diatessaron" + 0.116*"semibreves" + 0.109*"tempus" + 0.107*"imperfectus"',
 '-0.257*"semibreves" + -0.249*"brevis" + -0.248*"longus" + -0.230*"semibrevis" + -0.179*"prolatio" + -0.174*"tempus" + -0.171*"imperfectus" + -0.153*"ligo1" + -0.150*"pungo" + -0.147*"minimus"',
 '0.362*"proportio" + 0.209*"numerus" + 0.154*"terminus" + -0.152*"finalis" + 0.131*"multiplex" + 0.102*"hypaton" + 0.100*"diezeugmenon" + -0.099*"antiphona" + 0.095*"meson" + 0.090*"diapason"',
 '0.273*"meson" + 0.259*"diezeugmenon" + 0.252*"hypaton" + -0.228*"proportio" + 0.193*"meses" + 0.190*"hypate" + 0.184*"hyperboleon" + 0.172*"lichanos" + 0.159*"paranete" + 0.154*"neo1"',
 '0.189*"dyapente" + 0.144*"diapente" + 0.134*"tonus" + 0.130*"dyapason" + 0.124*"discantus" + 0.120*"ascendo" + 0.117*".a." + 0.113*".d." + 0.107*".c." + 0.106*"dyatessaron"',
 '0.363*".a." + 0.345*".d." + 0.303*".c." + 0.277*".b." + 0.231*".f." + 0.200*".g." + 0.198*".e." + -0.168*"dyapente" + -0.123*"dyapason" + -0.116*"discantus"',
 '-0.208*"tropus" + 0.171*".a." + 0.161*".d." + 0.161*".c." + 0.160*"dyapente" + -0.160*"diapente" + -0.152*"diatessaron" + -0.146*"diapason" + 0.142*".b." + 0.130*"dyapason"',
 '-0.248*"dyapente" + -0.224*"prolatio" + 0.200*"ligo1" + -0.194*"dyapason" + -0.172*"dyatessaron" + 0.169*"discantus" + 0.150*"tenor" + 0.148*"organum" + -0.135*"minimus" + 0.134*"longus"',
 '0.253*"dyapente" + -0.251*"prolatio" + 0.211*"dyapason" + 0.170*"dyatessaron" + -0.161*"tenor" + 0.152*"ligo1" + 0.127*"longus" + -0.114*"diapente" + -0.107*"diatessaron" + -0.107*"contrapunctus"',
 '0.267*"tenor" + 0.203*"antiphona" + 0.198*"discantus" + 0.194*"dominus." + 0.183*"dominus" + 0.175*"contrapunctus" + 0.133*"responsorium" + -0.119*"mutatio" + 0.113*"amen." + 0.105*"duodecimus"']

Focusing on a subset

This is some code to restrict the sample to a certain set of parameters.

TODO: Make a proper query interface for corpus entries
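
Until then, a small helper along these lines (a hypothetical select() function, not part of the pipeline above) would cover the common filters:

In [ ]:
# Hypothetical sketch of the TODO above: filter corpus entries by century and body length
def select(corpus, century=None, min_size=0, max_size=float('inf')):
    "Return corpus entries matching an optional century and a body-length range."
    return [t for t in corpus
            if (century is None or t['century'] == century)
            and min_size < len(t['body']) < max_size]

# e.g. select(corpus, century='15th', min_size=25000)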

In [110]:
q_century = '15th'
q_min_size = 25000
q_max_size = 10000000

sample = [treatise for treatise in corpus
          if treatise['century'] == q_century
          and len(treatise['body']) < q_max_size
          and len(treatise['body']) > q_min_size]

# Override the 15th-century filter for this run and use the whole corpus
sample = corpus

bodys = [treatise['body'] for treatise in sample]
lens = [str(len(treatise['body'])) for treatise in sample]

ids = [treatise['id'] for treatise in sample]
titles = [treatise.get('title', 'nt')[:15] for treatise in sample]
authors = [treatise.get('author', 'na')[:15] for treatise in sample]


len(sample) , len(ids), len(titles)
Out[110]:
(807, 807, 807)
In [75]:
lengths = [len(treatise['body']) for treatise in sample]
plt.hist(lengths, bins=12)
plt.title('Histogram of text lengths (characters) in selected 15C treatises.')
plt.show()

Text Similarity - Ward clustering in tf-idf space

In [146]:
documents = [doc['body'] for doc in corpus]

tfidf = TfidfVectorizer(tokenizer=prep, stop_words=stop_words)
tfs = tfidf.fit_transform(documents)
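
As a quick check on the vectorizer, one can peek at the highest-weighted terms for a single document (a sketch; get_feature_names() is the older scikit-learn spelling, newer releases use get_feature_names_out()):

In [ ]:
# Sketch: top tf-idf terms for the first document
terms = tfidf.get_feature_names()
row = tfs[0].toarray().ravel()
for i in row.argsort()[::-1][:10]:
    print(terms[i], round(float(row[i]), 3))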
In [147]:
# bodys currently holds the whole corpus, since the 15th-century filter is overridden above

bodys = [" ".join(prep(doc)) for doc in bodys]

tfs = tfidf.transform(bodys)
tfs.shape

dist = 1 - cosine_similarity(tfs)
In [149]:
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(10, 100)) # set size
labels = [" / ".join(t) for t in zip(ids, authors, titles, lens)]
dendrogram(linkage_matrix, orientation="right", labels=labels, ax=ax)

plt.tick_params(axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)

plt.tight_layout()
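
To go from the dendrogram to concrete groupings, the Ward linkage can be cut into flat clusters (a sketch; the 20-cluster cut is an arbitrary illustrative choice):

In [ ]:
# Sketch: cut the Ward linkage into flat clusters and inspect them
from scipy.cluster.hierarchy import fcluster

cluster_ids = fcluster(linkage_matrix, t=20, criterion='maxclust')
print(Counter(cluster_ids).most_common(5))

# Treatise ids in one example cluster
print([ids[i] for i, c in enumerate(cluster_ids) if c == 1][:10])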