#!/usr/bin/env python
# coding: utf-8

# # Named Entity Recognition
#
# TAW Project
#
# 26 March 2015

# One thing we can do with large quantities of machine-processable text is to use algorithms to try "to locate and classify elements in text into pre-defined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc." ([Wikipedia](http://en.wikipedia.org/wiki/Named-entity_recognition)). This notebook demonstrates how that process can work.

# In[1]:

import os
import operator
from collections import defaultdict

import nltk

# Here, we're using the [Natural Language Toolkit](http://www.nltk.org/), which describes itself as "a leading platform for building Python programs to work with human language data."
#
# NB: `pos_tag` and `ne_chunk_sents` below rely on pretrained models; on a fresh install you may first need to fetch them with `nltk.download('punkt')`, `nltk.download('averaged_perceptron_tagger')`, `nltk.download('maxent_ne_chunker')`, and `nltk.download('words')`.

# In[2]:

WORKDIR = '/Users/libraries/Code/taw-data/'

# In[3]:

BOOKS_DATA_DIR = os.path.join(WORKDIR, 'Books')
ESSAY_DATA_DIR = os.path.join(WORKDIR, 'Essay_Contest')

# In[4]:

def glob_txt(root_dir):
    """
    Walk root_dir and, for every leaf directory named 'pages',
    concatenate the text of all files it contains.
    Return a dict mapping each 'pages' directory path to its combined text.
    """
    dirs_seen = {}
    for dir_name, subdirs, file_list in os.walk(root_dir):
        if os.path.split(dir_name)[-1] == 'pages':
            txt_globs = []
            for f in file_list:
                with open(os.path.join(dir_name, f)) as infile:
                    txt_globs.append(infile.read())
            dirs_seen[dir_name] = ' '.join(txt_globs)
    return dirs_seen

# In[5]:

directories = glob_txt(ESSAY_DATA_DIR)

# In our sample set of essays we have just over 100 entries …

# In[6]:

len(directories)

# These two functions use the tools provided by the NLTK to prepare, parse, and analyze our text data …

# In[7]:

def extract_entity_names(t):
    """
    Collect the parts of speech marked as named entities from nested
    parsed sentence trees.
    Adapted from https://gist.github.com/onyxfish/322906
    """
    entity_names = []
    if hasattr(t, 'label'):
        if t.label() == 'NE':
            # A named-entity chunk: join its (word, tag) children into one name
            entity_names.append(' '.join(child[0] for child in t))
        else:
            # An interior node: recurse into its children
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

# In[8]:

def run_ner(text):
    """
    Run a number of NLTK text-processing functions in sequence.
    Return the set of unique named entities recognized in the given text.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # Once the text is broken up into sentences and words, we use a default
    # 'tagger' to assign a part of speech to each token
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # binary=True labels chunks simply as 'NE' rather than as
    # PERSON, ORGANIZATION, GPE, and so on
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return set(entity_names)

# Store the results so we can inspect them …

# In[9]:

results = {}
for d, text in directories.items():
    results[d] = run_ner(text)

# In[10]:

new_path = os.path.join(WORKDIR, 'analyzed')
os.makedirs(new_path, exist_ok=True)

entities = defaultdict(int)
for k, v in results.items():
    # Each key looks like .../Essay_Contest/<item>/pages, so the essay's
    # name is the parent of the 'pages' directory
    item_name = os.path.basename(os.path.dirname(k))
    print('Essay: {0}'.format(item_name))
    with open(os.path.join(new_path, '{0}-entities.txt'.format(item_name)), 'w') as outfile:
        for named_entity in v:
            entities[named_entity] += 1
            outfile.write(named_entity + '\n')

# In[11]:

for k, v in sorted(entities.items(), key=operator.itemgetter(1), reverse=True):
    print('{0}: \t {1}'.format(k, v))

# Discuss …
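# To make the discussion concrete, the cell below walks a single sentence through the same steps `run_ner` applies to each essay and prints the intermediate output. This is a minimal sketch: the sample sentence is invented for illustration and is not drawn from the essay set.

# In[ ]:

# Hypothetical sample sentence, invented for illustration
sample = "Frederick Douglass delivered a speech in Rochester, New York."

tokens = nltk.word_tokenize(sample)
tagged = nltk.pos_tag(tokens)
print(tagged)   # (word, part-of-speech) pairs

# nltk.ne_chunk is the single-sentence counterpart of ne_chunk_sents
tree = nltk.ne_chunk(tagged, binary=True)
print(extract_entity_names(tree))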
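# The entity tally above can also be expressed with `collections.Counter`, which bundles the counting and the descending-frequency sort into the standard library. A sketch of the equivalent code (here limited to the twenty most frequent entities):

# In[ ]:

from collections import Counter

# Counter.update adds each essay's set of names to the running tally;
# most_common() returns (name, count) pairs in descending order of count
entity_counts = Counter()
for names in results.values():
    entity_counts.update(names)

for name, count in entity_counts.most_common(20):
    print('{0}: \t {1}'.format(name, count))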