import re
import itertools
import string
import csv
# Read the raw text and split it over newlines
with open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt') as f:
    data = f.read().split("\n")
# Find the chapters, skipping the introduction and the appendix
chapters = {}
key = None
for line in data:
    # Stop at the end of the book proper, before the appendix
    if line == 'APPENDIX':
        break
    # Chapter headings are the chapter number followed by the title;
    # each one starts a new dictionary entry
    match = re.match(r"^([0-9]+) [A-Za-z0-9&'. -]+$", line)
    if match is not None:
        key = int(match.group(1))
        chapters[key] = []
    # Inside a chapter, append the lowercased, punctuation-stripped words
    elif key is not None:
        chapters[key].append(line.lower().translate(string.maketrans("", ""), string.punctuation).split())
# Flatten each chapter's per-line word lists into a single word list
for chapter in chapters:
chapters[chapter] = list(itertools.chain(*chapters[chapter]))
# Now look for occurrences of the main characters in the book
char_names = {'yossarian':"Yossarian",
'chaplain':"Chaplain Tappman",
'milo':"Milo Minderbinder",
'cathcart':"Colonel Cathcart",
'korn':"Colonel Korn",
'nately':"Nately",
'orr':"Orr",
'major':"Major Major Major Major",
'dunbar':"Dunbar",
'daneeka':"Doc Daneeka",
'joe':"Hungry Joe",
'clevinger':"Clevinger",
'aarfy':"Aarfy",
'dreedle':"General Dreedle",
'danby':"Major Danby",
'mcwatt':"McWatt",
'scheisskopf':"General Scheisskopf",
'peckem':"General Peckem",
'dobbs':"Dobbs",
'whitcomb':"Corporal Whitcomb",
'black':"Captain Black",
'halfoat':"Chief White Halfoat",
'duckett':"Nurse Duckett",
'coverley':"Major — de Coverley",
'wintergreen':"ex-P.F.C. Wintergreen",
'appleby':"Appleby",
'havermeyer':"Havermeyer",
'snowden':"Snowden"}
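# Keys are the lowercase tokens searched for in the cleaned text; values are the
# display names written to the CSV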
# Loop through characters and chapters, indexing each appearance by its percentile
# position within the chapter, i.e. the 1st percentile of chapter 2 encodes as 2.01
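# e.g. a mention at word 150 of a 1,000-word chapter 2 encodes as
# 2 + 150/1000.0 = 2.15 (the index and length here are illustrative)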
characters = {character: [] for character in char_names}
for character in characters:
for chapter in chapters:
length = len(chapters[chapter])
        # Special handling for Major Major (Major Major Major Major):
        # match the bigram ['major', 'major'], then collapse each run of
        # consecutive hits down to its final index
        if character == 'major':
            b = ['major', 'major']
            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i+len(b)] == b]
            location.append(0)
            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i+1] - 1]
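            # e.g. hits at [5, 6, 7, 20] collapse to [7, 20]; the appended 0 is
            # a sentinel so the comparison can reach the final hit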
location = [(chapter + (float(x)/length)) for x in location]
        # Special handling for Captain Black: match the bigram ['captain', 'black']
elif character == 'black':
b = ['captain','black']
location = [(chapter + (float(i)/length)) for i in range(len(chapters[chapter])) if
chapters[chapter][i:i+len(b)] == b]
else:
location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if
x == character]
characters[character].append(location)
    # Flatten the per-chapter lists, remove duplicates (only relevant when binning locations) and sort
characters[character] = sorted(list(set(list(itertools.chain(*characters[character])))))
# Print a summary of appearance counts, then limit the dictionary to characters appearing 50+ times
for char in sorted(characters):
print char, len(characters[char])
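# The 50-appearance cutoff keeps the output to the major characters (the exact
# threshold is a judgment call; adjust to taste)
characters = {c: locs for c, locs in characters.items() if len(locs) >= 50}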
# Now write a melted (long-format) CSV of characters and their appearance times
with open('catch22.csv', 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
headers = ['Character', 'Chapter']
csvwriter.writerow(headers)
for character in characters:
for location in characters[character]:
this_row = [char_names[character], location]
csvwriter.writerow(this_row)
# Now look for occurrences of the main locations visited in the book
locations = {'pianosa':'Pianosa, Italy',
'rome':'Rome, Italy',
'smyrna':'Smyrna, Turkey',
'corsica':'Corsica, France',
'parma':'Parma, Italy',
'salerno':'Salerno, Italy',
'marrakech':'Marrakech, Morocco',
'malta':'Valletta, Malta',
'cairo':'Cairo, Egypt',
'sicily':'Sicily, Italy',
'istanbul':'Istanbul, Turkey',
'etna':'Mt Etna, Italy',
'vesuvius':'Mt Vesuvius, Italy',
'palermo':'Palermo, Italy',
'catania':'Catania, Italy',
'oran':'Oran, Algeria',
'beirut':'Beirut, Lebanon',
'bengasi':'Bengasi, Libya',
'sardinia':'Sardinia, Italy',
'barcelona':'Barcelona, Spain',
'leghorn':'Livorno, Italy',
'marseilles':'Marseilles, France',
'spezia':'Spezia, Italy',
'majorca':'Majorca, Spain',
'elba':'Elba, Italy',
'ferrara':'Ferrara, Italy',
'bologna':'Bologna, Italy',
'arezzo':'Arezzo, Italy',
'avignon':'Avignon, France'}
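# Keys are the lowercase tokens searched for in the text; values are
# "City, Country" strings for the geocoder to resolve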
# Use OpenStreetMap's Nominatim service to geocode the cities
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout=10)
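# Note: the tuple unpacking below relies on geopy's tuple-compatible Location
# result; newer geopy releases also require a user_agent argument to Nominatim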
loc_geo = {}
for locale in sorted(locations):
    result = geolocator.geocode(locations[locale])
    if result is None:  # failed lookups return None; skip them
        continue
    address, (latitude, longitude) = result
    loc_geo[locale] = (latitude, longitude)
# Loop through locations and chapters, indexing each mention by its percentile
# position within the chapter, i.e. the 1st percentile of chapter 2 encodes as 2.01
loc_times = {locale: [] for locale in locations}
for locale in locations:
for chapter in chapters:
length = len(chapters[chapter])
location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if
x == locale]
loc_times[locale].append(location)
    # Flatten the per-chapter lists, remove duplicates (only relevant when binning locations) and sort
loc_times[locale] = sorted(list(set(list(itertools.chain(*loc_times[locale])))))
# Now write a melted CSV of locations, their mention times, and the geocoded coordinates
with open('catch22geo.csv', 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
headers = ['Location', 'Time', 'Lat', 'Lon']
csvwriter.writerow(headers)
    for locale in sorted(locations):
        if locale not in loc_geo:  # skip anything that failed to geocode
            continue
        for t in loc_times[locale]:
            this_line = [locale, t, loc_geo[locale][0], loc_geo[locale][1]]
            csvwriter.writerow(this_line)
import nltk
from nltk.tag.simplify import simplify_wsj_tag
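# nltk.tag.simplify was removed in NLTK 3.0, so this script requires NLTK 2.x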
# Now look for the words surrounding our main character
yo_words = {'words': [], 'locs': []}
for chapter in chapters:
length = len(chapters[chapter])
location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']
    # Expand to a window of 25 words on either side; this step only collects indexes
    locations = [range(max(0, (i-25)), min(len(chapters[chapter]), (i+26))) for i in location]
# Remove duplicates for overlapping ranges
locations = list(set(list(itertools.chain(*locations))))
# Grab the words and store to dictionary
words = [chapters[chapter][i] for i in locations]
locations = [(chapter + (float(x)/length)) for x in locations]
yo_words['words'].append(words)
yo_words['locs'].append(locations)
# Clean up the broken lists
yo_words['words'] = list(itertools.chain(*yo_words['words']))
yo_words['locs'] = list(itertools.chain(*yo_words['locs']))
yo_words['words'] = nltk.pos_tag(yo_words['words'])
yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]
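# Caveat: pos_tag sees a flat bag of window words rather than full sentences,
# so the tags are only approximate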
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(('said','thats','im','dont','got','get','say','youre'))
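# The extra stop words are spelled to match the cleaned tokens: punctuation was
# stripped earlier, so "that's" and "I'm" appear as 'thats' and 'im'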
# Now write a melted CSV of words, their mention times, and POS tags
with open('catch22pos.csv', 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
headers = ['Word', 'Time', 'POS']
csvwriter.writerow(headers)
for i in range(len(yo_words['locs'])):
if yo_words['words'][i][0] not in stop and yo_words['words'][i][0] not in char_names:
this_line = [yo_words['words'][i][0], yo_words['locs'][i], yo_words['words'][i][1]]
csvwriter.writerow(this_line)