import re
import itertools
import string
import csv
# Open the raw text and split it over newlines.
# Using a context manager guarantees the file handle is closed (the
# original left it open for the life of the process).
# NOTE: the path is machine-specific; adjust for your environment.
with open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt') as book_file:
    data = book_file.read()
data = data.split("\n")
# Find chapters, skipping the intro (everything before the first chapter
# heading) and the appendix.
# Chapter headings look like "1 THE TEXAN": a number, a space, a title.
# Compile once instead of re-matching the pattern string per line.
chapter_re = re.compile(r'^([0-9]+) [A-Za-z0-9-&\'. ]+$')
chapters = {}
key = False  # falsy until the first chapter heading is seen
for line in data:
    # Chapter marker: start a new dictionary entry keyed by chapter number.
    match = chapter_re.match(line)
    if match is not None:
        # Use the captured number rather than the original fixed-width
        # slice line[0:2], which would truncate a 3+-digit chapter number.
        key = int(match.group(1))
        chapters[key] = []
    # Inside a chapter: append the lowercase, punctuation-stripped words.
    # NOTE: two-argument str.translate with string.maketrans is the
    # Python 2 API; this script is Python 2 throughout.
    elif key:
        chapters[key].append(line.lower().translate(string.maketrans("", ""), string.punctuation).split())
    # Stop at the end of the book
    if line == 'APPENDIX':
        break
# Merge each chapter's per-line word lists into one flat word list
for chapter in chapters:
    chapters[chapter] = list(itertools.chain(*chapters[chapter]))
# Main characters to track: maps the lowercase token searched for in the
# text to the character's display name used in the output CSV.
char_names = dict(
    yossarian="Yossarian",
    chaplain="Chaplain Tappman",
    milo="Milo Minderbinder",
    cathcart="Colonel Cathcart",
    korn="Colonel Korn",
    nately="Nately",
    orr="Orr",
    major="Major Major Major Major",
    dunbar="Dunbar",
    daneeka="Doc Daneeka",
    joe="Hungry Joe",
    clevinger="Clevinger",
    aarfy="Aarfy",
    dreedle="General Dreedle",
    danby="Major Danby",
    mcwatt="McWatt",
    scheisskopf="General Scheisskopf",
    peckem="General Peckem",
    dobbs="Dobbs",
    whitcomb="Corporal Whitcomb",
    black="Captain Black",
    halfoat="Chief White Halfoat",
    duckett="Nurse Duckett",
    coverley="Major — de Coverley",
    wintergreen="ex-P.F.C. Wintergreen",
    appleby="Appleby",
    havermeyer="Havermeyer",
    snowden="Snowden",
)
# Loop through characters and chapters, indexing each appearance by its
# fractional position within a chapter: e.g. a word at the 1st percentile
# of chapter 2 is encoded as 2.01.
characters = {character: [] for character in char_names}
for character in characters:
    for chapter in chapters:
        length = len(chapters[chapter])
        # Special handling for Major Major (Major Major Major Major):
        # find every index where the two-word run 'major major' starts...
        if character == 'major':
            b = ['major','major']
            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i+len(b)] == b]
            # ...then collapse consecutive runs to their first index, so the
            # full four-word name counts as one appearance. The appended 0
            # is a sentinel giving the last real index a successor to
            # compare against (every real index passes i != 0 - 1).
            location.append(0)
            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i+1] - 1]
            location = [(chapter + (float(x)/length)) for x in location]
        # Special handling for Captain Black: require the two-word phrase
        # 'captain black' so bare 'black' is not counted as the character.
        elif character == 'black':
            b = ['captain','black']
            location = [(chapter + (float(i)/length)) for i in range(len(chapters[chapter])) if
                        chapters[chapter][i:i+len(b)] == b]
        # All other characters: a single lowercase token match.
        else:
            location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if
                        x == character]
        characters[character].append(location)
    # Flatten the per-chapter lists, remove duplicates (only relevant if
    # binning locations) and sort chronologically.
    characters[character] = sorted(list(set(list(itertools.chain(*characters[character])))))
# Print a per-character summary of appearance counts (all tracked
# characters appear 50+ times). Formatting into a single string keeps
# this line valid — and the output identical — under both Python 2 and
# Python 3 (the original `print char, len(...)` is Python 2 only).
for char in sorted(characters):
    print("%s %d" % (char, len(characters[char])))
# Observed output (this was pasted into the source as a bare line, which
# made the file a syntax error — preserved here as a comment):
# aarfy 130 appleby 55 black 70 cathcart 310 chaplain 446 clevinger 131
# coverley 59 danby 127 daneeka 150 dobbs 82 dreedle 128 duckett 61
# dunbar 169 halfoat 69 havermeyer 52 joe 141 korn 214 major 183
# mcwatt 116 milo 393 nately 205 orr 185 peckem 102 scheisskopf 115
# snowden 52 whitcomb 78 wintergreen 56 yossarian 1347
# Dump a melted CSV: one row per (character, appearance time).
with open('catch22.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['Character', 'Chapter'])
    for character, times in characters.items():
        display_name = char_names[character]
        for location in times:
            csvwriter.writerow([display_name, location])
# Main locations visited in the book: maps the lowercase token searched
# for in the text to a "City, Country" string for geo-coding.
locations = dict(
    pianosa='Pianosa, Italy',
    rome='Rome, Italy',
    smyrna='Smyrna, Turkey',
    corsica='Corsica, France',
    parma='Parma, Italy',
    salerno='Salerno, Italy',
    marrakech='Marrakech, Morocco',
    malta='Valletta, Malta',
    cairo='Cairo, Egypt',
    sicily='Sicily, Italy',
    istanbul='Istanbul, Turkey',
    etna='Mt Etna, Italy',
    vesuvius='Mt Vesuvius, Italy',
    palermo='Palermo, Italy',
    catania='Catania, Italy',
    oran='Oran, Algeria',
    beirut='Beirut, Lebanon',
    bengasi='Bengasi, Libya',
    sardinia='Sardinia, Italy',
    barcelona='Barcelona, Spain',
    leghorn='Livorno, Italy',
    marseilles='Marseilles, France',
    spezia='Spezia, Italy',
    majorca='Majorca, Spain',
    elba='Elba, Italy',
    ferrara='Ferrara, Italy',
    bologna='Bologna, Italy',
    arezzo='Arezzo, Italy',
    avignon='Avignon, France',
)
# Use OpenStreetMap's Nominatim service to geo-code the cities into
# (latitude, longitude) pairs keyed by the lowercase location token.
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout=10)
loc_geo = {}
for locale in sorted(locations):
    # NOTE(review): assumes geocode() always returns a value that unpacks
    # as (address, (latitude, longitude)). A failed lookup returns None and
    # would raise TypeError here; some geopy versions return a 3-tuple
    # point (lat, lon, alt) — TODO confirm against the installed geopy.
    address, (latitude, longitude) = geolocator.geocode(locations[locale])
    loc_geo[locale] = (latitude, longitude)
# Index each location mention by its fractional position within a chapter,
# e.g. a mention at the 1st percentile of chapter 2 is encoded as 2.01.
loc_times = {locale: [] for locale in locations}
for locale in locations:
    for chapter in chapters:
        words = chapters[chapter]
        chapter_len = len(words)
        mentions = [chapter + (float(idx) / chapter_len)
                    for idx, word in enumerate(words) if word == locale]
        loc_times[locale].append(mentions)
    # Flatten the per-chapter lists, remove duplicates (only relevant if
    # binning locations) and sort chronologically.
    loc_times[locale] = sorted(set(itertools.chain(*loc_times[locale])))
# Dump a melted CSV: one row per (location, mention time) with the
# geo-coded latitude/longitude repeated on each row.
with open('catch22geo.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['Location', 'Time', 'Lat', 'Lon'])
    for locale in sorted(locations):
        lat, lon = loc_geo[locale]
        for t in loc_times[locale]:
            csvwriter.writerow([locale, t, lat, lon])
import nltk
from nltk.tag.simplify import simplify_wsj_tag
# Collect the words surrounding every mention of the main character.
yo_words = {'words': [], 'locs': []}
for chapter in chapters:
    length = len(chapters[chapter])
    # Indexes of every 'yossarian' token in this chapter.
    location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']
    # Expand each hit to a window of 25 words either side, clamped to the
    # chapter bounds; this just collects word indexes.
    locations = [range(max(0,(i-25)),min(len(chapters[chapter]),(i+26))) for i in location]
    # Remove duplicate indexes from overlapping windows. The set makes the
    # order arbitrary, but 'words' and 'locs' below are both built from
    # this same list, so they stay pairwise aligned.
    locations = list(set(list(itertools.chain(*locations))))
    # Grab the words and their fractional chapter positions.
    words = [chapters[chapter][i] for i in locations]
    locations = [(chapter + (float(x)/length)) for x in locations]
    yo_words['words'].append(words)
    yo_words['locs'].append(locations)
# Flatten the per-chapter lists
yo_words['words'] = list(itertools.chain(*yo_words['words']))
yo_words['locs'] = list(itertools.chain(*yo_words['locs']))
# Part-of-speech tag every word, then collapse the Treebank tags to NLTK's
# simplified tag set.
# NOTE(review): nltk.tag.simplify (imported above) was removed in NLTK 3;
# this requires NLTK 2.x — confirm the installed version.
yo_words['words'] = nltk.pos_tag(yo_words['words'])
yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]
from nltk.corpus import stopwords
# English stopwords plus a few informal/high-frequency words that would
# otherwise dominate the context-word output.
stop = stopwords.words('english') + [
    'said', 'thats', 'im', 'dont', 'got', 'get', 'say', 'youre',
]
# Dump a melted CSV: one row per (word, mention time, POS tag), skipping
# stopwords and the tracked character-name tokens.
with open('catch22pos.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['Word', 'Time', 'POS'])
    for (word, pos), when in zip(yo_words['words'], yo_words['locs']):
        if word not in stop and word not in char_names:
            csvwriter.writerow([word, when, pos])