#!/usr/bin/env python
# coding: utf-8

# # Named Entity Recognition
#
# TAW Project
#
# 26 March 2015

# One thing we can do with large quantities of machine-processable text is to use algorithms to try "to locate and classify elements in text into pre-defined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc." ([Wikipedia](http://en.wikipedia.org/wiki/Named-entity_recognition)). This notebook demonstrates how that process can work.

# In[1]:

import os
import operator
from collections import defaultdict

import nltk

# Here, we're using the [Natural Language Toolkit](http://www.nltk.org/), which describes itself as "a leading platform for building Python programs to work with human language data."
#
# NB: `pos_tag` and `ne_chunk_sents` below rely on pretrained models; on a fresh install you may first need to fetch them with `nltk.download('punkt')`, `nltk.download('averaged_perceptron_tagger')`, `nltk.download('maxent_ne_chunker')`, and `nltk.download('words')`.

# In[2]:

WORKDIR = '/Users/libraries/Code/taw-data/'

# In[3]:

BOOKS_DATA_DIR = os.path.join(WORKDIR, 'Books')
ESSAY_DATA_DIR = os.path.join(WORKDIR, 'Essay_Contest')

# In[4]:

def glob_txt(root_dir):
    """
    Walk root_dir and, for every leaf directory named 'pages',
    concatenate the text of all files it contains.
    Return a dict mapping each 'pages' directory path to its combined text.
    """
    dirs_seen = {}
    for dir_name, subdirs, file_list in os.walk(root_dir):
        if os.path.split(dir_name)[-1] == 'pages':
            txt_globs = []
            for f in file_list:
                with open(os.path.join(dir_name, f)) as infile:
                    txt_globs.append(infile.read())
            dirs_seen[dir_name] = ' '.join(txt_globs)
    return dirs_seen

# In[5]:

directories = glob_txt(ESSAY_DATA_DIR)

# In our sample set of essays we have just over 100 entries …

# In[6]:

len(directories)

# These two functions use the tools provided by the NLTK to prepare, parse, and analyze our text data …

# In[7]:

def extract_entity_names(t):
    """
    Collect the parts of speech marked as named entities from nested
    parsed sentence trees.
    Adapted from https://gist.github.com/onyxfish/322906
    """
    entity_names = []
    if hasattr(t, 'label'):
        if t.label() == 'NE':
            # A named-entity chunk: join its (word, tag) children into one name
            entity_names.append(' '.join(child[0] for child in t))
        else:
            # An interior node: recurse into its children
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

# In[8]:

def run_ner(text):
    """
    Run a number of NLTK text-processing functions in sequence.
    Return the set of unique named entities recognized in the given text.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Once the text is broken up into sentences and words, we use a default
    # 'tagger' to assign a part of speech to each token
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # binary=True labels chunks simply as 'NE' rather than as
    # PERSON, ORGANIZATION, GPE, and so on
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return set(entity_names)

# Store the results so we can inspect them …

# In[9]:

results = {}
for d, text in directories.items():
    results[d] = run_ner(text)

# In[10]:

new_path = os.path.join(WORKDIR, 'analyzed')
os.makedirs(new_path, exist_ok=True)

entities = defaultdict(int)
for k, v in results.items():
    # Each key looks like .../Essay_Contest/<item>/pages, so the essay's
    # name is the parent of the 'pages' directory
    item_name = os.path.basename(os.path.dirname(k))
    print('Essay: {0}'.format(item_name))
    with open(os.path.join(new_path, '{0}-entities.txt'.format(item_name)), 'w') as outfile:
        for named_entity in v:
            entities[named_entity] += 1
            outfile.write(named_entity + '\n')

# In[11]:

for k, v in sorted(entities.items(), key=operator.itemgetter(1), reverse=True):
    print('{0}: \t {1}'.format(k, v))

# Discuss …
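# To make the discussion concrete, the cell below walks a single sentence through the same steps `run_ner` applies to each essay and prints the intermediate output. This is a minimal sketch: the sample sentence is invented for illustration and is not drawn from the essay set.

# In[ ]:

# Hypothetical sample sentence, invented for illustration
sample = "Frederick Douglass delivered a speech in Rochester, New York."

tokens = nltk.word_tokenize(sample)
tagged = nltk.pos_tag(tokens)
print(tagged)   # (word, part-of-speech) pairs

# nltk.ne_chunk is the single-sentence counterpart of ne_chunk_sents
tree = nltk.ne_chunk(tagged, binary=True)
print(extract_entity_names(tree))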
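# The entity tally above can also be expressed with `collections.Counter`, which bundles the counting and the descending-frequency sort into the standard library. A sketch of the equivalent code (here limited to the twenty most frequent entities):

# In[ ]:

from collections import Counter

# Counter.update adds each essay's set of names to the running tally;
# most_common() returns (name, count) pairs in descending order of count
entity_counts = Counter()
for names in results.values():
    entity_counts.update(names)

for name, count in entity_counts.most_common(20):
    print('{0}: \t {1}'.format(name, count))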