"""Scrape, clean and analyse the Star Trek (TOS) transcripts on chakoteya.net.

Pipeline, executed top to bottom as a script:

1. download every episode page into ``tempdir``;
2. split each page into one row per line of text, detecting ``SPEAKER: text``;
3. write per-episode TSV files and apply hand-curated text fixes to them;
4. attribute or erase the rows that still have no speaker;
5. write the final ``tos_transcript_<n>.txt`` files and reload them;
6. disambiguate character names that are reused across episodes;
7. aggregate dialogue per character / per episode, lemmatise it, write CSVs;
8. draw a Bokeh scatter plot and remove ``tempdir``.

NOTE(review): this file was reconstructed from a whitespace-collapsed notebook
export; statement nesting was inferred from syntax. Two spots that could not
be recovered with certainty are flagged inline with ``NOTE(review)``.
"""
import glob
import os
import re
from collections import Counter

import bokeh.plotting as bp
import ftfy
import numpy as np
import pandas as pd
import requests
from bokeh.embed import notebook_div
from bokeh.models import HoverTool
from textblob import TextBlob
from textblob import Word

TEMP_DIR = 'tempdir'
# One explicit encoding for every intermediate file. The original wrote UTF-8
# bytes, re-read them with the platform default and finally parsed them as
# latin-1, so non-ASCII text (e.g. the '®' matched further down) only
# survived on some platforms.
ENCODING = 'utf-8'
# Every stage iterates over the same episode identifiers.
EPISODE_NUMBERS = list(range(2, 80)) + ['16b']
# Marker written into the 'character' column for rows that are dropped later.
ERASE = 'eraseme'

# ---------------------------------------------------------------------------
# 1. Download the raw pages
# ---------------------------------------------------------------------------
# create a directory for them if it doesn't exist
os.makedirs(TEMP_DIR, exist_ok=True)

for episode_number in EPISODE_NUMBERS:
    url = 'http://www.chakoteya.net/startrek/{}.htm'.format(episode_number)
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an error page as if it were a transcript.
    response.raise_for_status()
    html = response.content.decode('latin-1')
    script_path = os.path.join(TEMP_DIR, 'script{}.htm'.format(episode_number))
    with open(script_path, 'wb') as f:
        f.write(html.encode(ENCODING))

# ---------------------------------------------------------------------------
# 2. Split the pages into (episode, speaker, line) rows
# ---------------------------------------------------------------------------
TAG_RE = re.compile(r'<.+?>')
PAREN_SPACE_RE = re.compile(r'\(.+?\) ')
PAREN_RE = re.compile(r'\(.+?\)')
BRACKET_RE = re.compile(r'\[.+?\]')
SPEAKER_PREFIX_RE = re.compile(r"(^[A-Z']{2}.+?: )")
SPEAKER_NAME_RE = re.compile(r"^([A-Z' ]+)")
SPOKEN_TEXT_RE = re.compile(r"^[A-Z']{2}.+?: (.+)")

_episode_num = []
_speaker = []
_line = []
_episode = []

for episode_number in EPISODE_NUMBERS:
    script_path = os.path.join(TEMP_DIR, 'script{}.htm'.format(episode_number))
    with open(script_path, 'r', encoding=ENCODING) as f:
        html = f.read()

    # NOTE(review): the two tag literals below were blank in the extracted
    # source (most likely HTML tags stripped by the export). '<p>' / '</p>'
    # is inferred from the variable name -- confirm against the notebook.
    by_p_tag = [x.replace('</p>', '') for x in html.split('<p>')]

    # fix unicode errors
    by_p_tag = [ftfy.fix_text(x, fix_entities=True) for x in by_p_tag]

    for i, subset in enumerate(by_p_tag):
        # Flatten real newlines first, then let every tag act as a line break.
        flattened = subset.replace('\n', ' ')
        pieces = [x.strip() for x in TAG_RE.sub('\n', flattened).splitlines()]

        if i == 0:
            # The chunk before the first paragraph carries the page title.
            episode = pieces[9].replace('The Star Trek Transcripts - ', '')

        for line in pieces:
            # Drop (stage directions) and [locations] closed on the same line.
            munged_line = PAREN_SPACE_RE.sub('', line)
            munged_line = PAREN_RE.sub('', munged_line)
            munged_line = BRACKET_RE.sub('', munged_line)

            if len(munged_line) > 0 and munged_line != 'GELDER: ':
                prefix = SPEAKER_PREFIX_RE.search(munged_line)
                if prefix:
                    found = prefix.group(1).strip()
                    _speaker.append(SPEAKER_NAME_RE.search(found).group(1).strip())
                    _line.append(SPOKEN_TEXT_RE.search(munged_line).group(1).strip())
                else:
                    _speaker.append('')
                    _line.append(munged_line)
                _episode_num.append(episode_number)
                _episode.append(episode)

df = pd.DataFrame({'episode_num': _episode_num,
                   'episode': _episode,
                   'speaker': _speaker,
                   'line': _line})

# ---------------------------------------------------------------------------
# 3. Write one TSV per episode plus the episode-number/title table
# ---------------------------------------------------------------------------
_ep = []
_title = []

for episode_number in EPISODE_NUMBERS:
    dftemp = df[df.episode_num == episode_number]
    _ep.append(episode_number)
    if episode_number == 12:
        # correcting a formatting difference resulting in the wrong title
        _title.append('Miri')
    else:
        _title.append(dftemp.episode.iloc[0])

    munged_path = os.path.join(TEMP_DIR, 'tos_munged_{}.txt'.format(episode_number))
    # 'w' truncates, which replaces the original remove-then-append dance.
    with open(munged_path, 'w', encoding=ENCODING) as f:
        f.write('character\tline\n')
        for speaker, text in zip(dftemp.speaker, dftemp.line):
            f.write('{}\t{}\n'.format(speaker, text))

dfep = pd.DataFrame({'number': _ep, 'title': _title})
dfep.to_csv('tos_episode_numbers.tsv', sep='\t', index=False)

epnums = pd.read_csv('tos_episode_numbers.tsv', sep='\t')
epnum2name = dict(zip(epnums.number, epnums.title))

# ---------------------------------------------------------------------------
# 4. Hand-curated fixes applied to the raw text of each munged file
# ---------------------------------------------------------------------------
# Speaker-less rows that are deleted outright.
to_remove = [
    "(They struggle for the phaser. Spock",
    "gets between Nancy and Kirk.",
    ")",
    "Starbase Computer Centre]",
    "To be concluded next week...",
]

# Literal substring substitutions, applied in insertion order.
to_replace = {
    "Captain! (everyone rushes over in time to see the door closing in the rock. They fire their phasers at it to no avail.": "Captain!",
    "SURAK I'": "SURAK",
    "\t(One last scream and convulsion, then all her readings plummet to zero. KIRK: My brother's son": "KIRK\tMy brother's son",
    "\t. SPOCK: Free, Captain?": "SPOCK\tFree, Captain?",
    "\t SULU: ": "SULU\t",
    "\tKirk; All decks alert. Captain Christopher": "KIRK\tAll decks alert. Captain Christopher",
    "comes out. KIRK: Nomad. Nomad?": "comes out.\nKIRK\tNomad. Nomad?",
    "a very big stick. AKUTA: Vaal": "a very big stick.\nAKUTA\tVaal",
    "has been bathing. KIRK: Nona. Pardon me": "has been bathing.\nKIRK\tNona. Pardon me",
    "\t) MAN 1: Tyree's woman": "MAN 1\tTyree's woman",
    "\tMIRAMANEE@ For": "MIRAMANEE\tFor",
    "\tSpock Fascinating. Intergalactic travel": "SPOCK\tFascinating. Intergalactic travel",
    "\tM5: ": "M5\t",
    "\tWU: ": "WU\t",
    "'\n\tAll I ask is a tall ship, and a star to steer by\n\t'.": "'All I ask is a tall ship, and a star to steer by'.",
    "\tVina\n": "\n",
    "\t.\n": "\n",
    "the\n\tTholian Sector\n\t,": "the Tholian Sector,",
    "\t??: The captain.": "CREWMAN\tThe captain.",
    "\t.)\n": "\n",
    "\tKIRK; Like a computer,": "KIRK\tLike a computer,",
}

# Speaker-less continuation rows that are glued onto the preceding row.
to_concat = [
    'I was thinking if there was just one',
    'I receive orders to proceed here. No reason given.',
    'Back to duty status, Fisher. I have no sympathy for clumsiness.',
    "Tell Jose he'll get his chili peppers when we get there. Tell him they're prime Mexican reds. I handpicked them myself, but he won't die if he goes a few more days without them. Got it?",
    "How? You've been here five years. Will a couple of days make a difference? Mister Spock.",
    'They may have solved that problem.',
    'The skies are green and glowing',
    'I want landing parties to co-ordinate the colonists',
    'Where my heart is',
    'Where the scented lunar flower is growing',
    'Somewhere beyond the stars',
    'Beyond Antares.',
    'Forever is just a day',
    'Forever is just another journey',
    "Where my love eternal is waiting'",
    'Somewhere beyond the stars',
    'Beyond Antares.',
    'General Trelane, retired. At your service, sirs. My home is your home.',
    "Now, that's a sample of the atmosphere of this planet outside my kindly influence. Now, you will behave yourself hereafter, won't you? Or I shall be very, very angry.",
    "And this, of course, is an array of your battle flags and pennants dating back to the Crusades, to Hannibal's invaders, the Grecian war galleys, and the hordes of Persia. Can't you imagine it, Captain? The thousands of men marching off to their deaths, singing beneath these banners. Doesn't it make your blood run swiftly?",
    'Careful, careful. Shock, radiation burns, internal injuries for certain.',
    'Well, that gives us a little time.',
    'Mister Spock, you and I have some serious thinking to do. When we leave here tomorrow morning, I want to have a plan of action.',
    'Peace.',
    "Mister Vanderberg, I'll need a complete subsurface chart of all the drifts, galleries, tunnels.",
    'Within range of our sensors, there is no life, other than the accountable human residents of this colony beneath the surface. At least, no life as we know it.',
    'Commander Giotto, take your detail. Go directly to the twenty-third level. Start your search there.',
    'What is there to understand, Mister Spock?',
    "Make no mistake. They can be highly dangerous. The Capellans' basic weapon, the kligat. At any distance up to a hundred yards, they can make it almost as effective against a man as a phaser.",
    'Enterprise, acknowledge. Please acknowledge.',
    'Oochy-woochy coochy-coo. Oochy-woochy coochy-coo.',
    'Yes, there is something here. Something terrible. I feel its presence. Fear, anger, hatred. Anger feeds the flame. Oh! Oh! There is evil here. Monstrous, terrible evil. Consuming hunger. Hatred of all that lives. Hatred of women. A hunger that never dies. It is strong, overpowering. An ancient terror. It has a name. Beratis, Kesla, Redjac! Devouring all life, all light. A hunger that will never die! Redjac! Redjac!',
    'Nomad was a thinking machine, the best that could be engineered. It was a prototype.',
    'Yes.',
    'Ready, sir?',
    "They've come to pay their respects to Alvin.",
    "Mister Spock, isn't that too much for our purposes?",
    'Four fleeing fish. A fine haul. Flavius Maximus.',
    "Your next improvement. Notice what we've done to the striker. See how it holds the priming powder more securely? Fewer misfires. When I return, we will give you other improvements. A rifled barrel.",
    "You won't be working with them, you'll be working with us, our bodies. They'll be inside us, and we'll be",
    'Why do the Nazis treat you as enemies?',
    'Decks four and six are living quarters, are they not?',
    'What are you?',
    'Beam down the guards.',
    'Captain Kirk.',
    'Starfleet Command supersedes your order, sir!',
    'She walks in beauty, like the night',
    'Well, then, come. We must hurry to join',
    'Stop! Stop, please!',
    "Deeply they'll swallow from your finest kegs. Then swiftly be gone leaving bitter dregs. Ah, bitter dregs.",
    'With smiling words and tender touch. Man offers little and asks for so much"',
    'He loves in the breathless excitement of night, then leaves with your treasure in cold morning light. Ah, in cold morning light"',
    "What you see before you, Captain, is my latest invention. Quite ordinary in appearance, almost primitive, wouldn't you say?",
    'Present course will bring it across our starboard side, sir.',
    'Locked in, sir.',
    "Our ship's sensors did not reveal your presence here, Mister Flint.",
    'Vians of Minara',
    "demanded that we let Bones die, we didn't permit it.",
]

# Every fix is expected to hit at least once; entries are ticked off as they
# are applied and whatever remains is reported by the assert below.
not_found = set(to_replace) | set(to_remove) | set(to_concat)

for episode_number in EPISODE_NUMBERS:
    filename_in = os.path.join(TEMP_DIR, 'tos_munged_{}.txt'.format(episode_number))
    filename_out = os.path.join(TEMP_DIR, 'tos_munged_2_{}.txt'.format(episode_number))

    with open(filename_in, 'r', encoding=ENCODING) as f:
        txt = f.read()

    # discard() rather than remove(): a fix that matches in a second episode
    # must not raise KeyError once it has already been ticked off.
    for item in to_remove:
        row_text = '\t' + item + '\n'
        if row_text in txt:
            txt = txt.replace(row_text, '')
            not_found.discard(item)

    for k, v in to_replace.items():
        if k in txt:
            txt = txt.replace(k, v)
            not_found.discard(k)

    for item in to_concat:
        row_start = '\n\t' + item
        if row_start in txt:
            txt = txt.replace(row_start, ' ' + item)
            not_found.discard(item)

    with open(filename_out, 'w', encoding=ENCODING) as f:
        f.write(txt)

# ensure all the errors were found and fixed
assert len(not_found) == 0, 'fixes that never matched: {!r}'.format(sorted(not_found))


def _load_episode_frames(path_template):
    """Read one TSV per episode and stack them into a single DataFrame.

    ``path_template`` is formatted with each entry of ``EPISODE_NUMBERS``.
    An ``episode_num`` column holding ``str(episode_number)`` is added to
    every frame and the combined frame gets a fresh 0..n-1 index.

    The original code also called ``fillna(' ')`` here and discarded the
    result. That no-op is intentionally not "fixed": the attribution pass
    below detects unattributed rows by their NaN ``character`` value.
    """
    frames = []
    for episode_number in EPISODE_NUMBERS:
        frame = pd.read_csv(path_template.format(episode_number),
                            sep='\t', encoding=ENCODING)
        frame['episode_num'] = str(episode_number)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


# load munged scripts into one df
df = _load_episode_frames(os.path.join(TEMP_DIR, 'tos_munged_2_{}.txt'))

# ---------------------------------------------------------------------------
# 5. Attribute (or erase) the rows that have no speaker
# ---------------------------------------------------------------------------
SEMICOLON_SPEAKER_RE = re.compile(r'^([A-Z]+); (.+)')
SPACE_SPEAKER_RE = re.compile(r'^([A-Z]+) (.+)')
CAPTAINS_LOG_RE = re.compile(r"^Captain's [Ll]og")
UNCLOSED_PAREN_RE = re.compile(r'\(.+')

# Log entries that are recorded by Spock rather than Kirk.
SPOCK_LOG_MARKERS = (
    "Science Officer Spock reporting for Captain Kirk.",
    "First Officer Spock reporting.",
    "Entry made by Second Officer Spock.",
    "First Officer Spock commanding.",
    "First Officer Spock in temporary command.",
)

# Ordered (substring, character) rules; within each table the first hit wins.
# The three tables are separated only because non-substring rules sit between
# them in the original rule order.
SUBSTRING_RULES_EARLY = (
    ("Star date 1313.1. We're", 'KIRK'),
    ("Ship's log Stardate 2124.5. First Officer Spock", 'SPOCK'),
    ("(From his vantage point", ERASE),
    ("But the Gorn has to move the boulder", ERASE),
    ("Captain 's log Stardate", 'KIRK'),
    ("Chief Engineer Scott recording.", 'SCOTT'),
    ("Lieutenant Sulu recording.", 'SULU'),
    ("First Officer Spock recording.", 'SPOCK'),
    ("Lieutenant Commander Scott recording", 'SCOTT'),
    ("Lieutenant Commander Scott reporting", 'SCOTT'),
)
SUBSTRING_RULES_MIDDLE = (
    ("Later...)", ERASE),
    ("The Andorian pulls a knife", ERASE),
)
SUBSTRING_RULES_LATE = (
    ('blue and pink tie-die', ERASE),
    (')Spock leaves', ERASE),
    ('They kiss properly', ERASE),
    ('Enterprise Medical Log', 'MCCOY'),
    ('Mister Spock reporting.', 'SPOCK'),
    ('Norman is behind a console', ERASE),
    ('Engineer Scott reporting.', 'SCOTT'),
    ('has a whip and net, which Kirk', ERASE),
    ('Medical Log. Stardate 4770.3.', 'MCCOY'),
    ("Captain 's Log, stardate 4731.3", 'KIRK'),
    ('Miramanee takes the headband', ERASE),
    ('then everything around him slows', ERASE),
    ('Marta gently gives the ', ERASE),
    ('First Officer Spock reporting.', 'SPOCK'),
    ("An engineer touches Scott's arm", ERASE),
    ("Spock and Droxine enter", ERASE),
    ("Kirk is being bathed in coloured", ERASE),
)
# Rows of the form '. NAME: text' where the speaker prefix was not split off.
DOTTED_SPEAKER_PREFIXES = (
    ('. SCOTT: ', 'SCOTT'),
    ('. KIRK: ', 'KIRK'),
    ('. MARLENA: ', 'MARLENA'),
)


def _first_substring_match(line, rules):
    """Return the character of the first rule whose substring is in *line*, else None."""
    for needle, character in rules:
        if needle in line:
            return character
    return None


def attribute_line(line, toggle):
    """Decide what to do with one row that has no speaker.

    Args:
        line: text of the row.
        toggle: '' normally; 'title' or 'stardate' when the previous
            speaker-less row was a title / short stardate header, in which
            case this row is erased unconditionally.

    Returns:
        ``(character, new_line, toggle)`` where ``character`` is the speaker
        to assign (``ERASE`` to drop the row, ``None`` to leave it unset),
        ``new_line`` is replacement text or ``None`` to keep the text, and
        ``toggle`` is the state to carry to the next speaker-less row.

    Rules are evaluated in a fixed order and the first match wins.
    """
    if toggle in ('title', 'stardate'):
        return ERASE, None, ''
    if 'The Star Trek Transcripts - ' in line:
        return ERASE, None, 'title'
    if line.startswith('Stardate') and len(line) < 40:
        return ERASE, None, 'stardate'

    # Page furniture: air dates, short bracketed location headers, copyright.
    if ('Original Airdate' in line
            or (len(line) < 60 and line[0] in ('[', '{') and line[-1] in (']', '}'))
            or line.startswith(("CBS Studios", 'Star Trek ®'))
            or line == 'Unaired pilot'
            or line.startswith(('. Copyright', 'Copyright', 'Stardate: '))):
        return ERASE, None, toggle

    # 'NAME; text' and 'NAME text': speaker present but mis-punctuated.
    speaker = SEMICOLON_SPEAKER_RE.search(line) or SPACE_SPEAKER_RE.search(line)
    if speaker:
        return speaker.group(1), speaker.group(2), toggle

    # Log entries default to Kirk unless the text names another officer.
    if (CAPTAINS_LOG_RE.search(line) or line.startswith('Personal')
            or line.startswith('Stardate')):
        if any(marker in line for marker in SPOCK_LOG_MARKERS):
            return 'SPOCK', None, toggle
        if "Lieutenant Commander Scott in temporary command." in line:
            # The original assigned 'SPOCK' here, which contradicts the text
            # being matched and the other Scott rules (apparent copy/paste slip).
            return 'SCOTT', None, toggle
        return 'KIRK', None, toggle

    if line.startswith('SCOTT@ '):
        return 'SCOTT', line[7:], toggle

    character = _first_substring_match(line, SUBSTRING_RULES_EARLY)
    if character is not None:
        return character, None, toggle

    if "[OC}; " in line:
        # Only the text is trimmed; the original left the speaker unset here.
        return None, line[6:], toggle

    if line[0] in ("(", "[", "{", "]", ">"):
        return ERASE, None, toggle

    character = _first_substring_match(line, SUBSTRING_RULES_MIDDLE)
    if character is not None:
        return character, None, toggle

    for prefix, name in DOTTED_SPEAKER_PREFIXES:
        if line.startswith(prefix):
            return name, line[len(prefix):], toggle

    if line == '.':
        return ERASE, None, toggle

    character = _first_substring_match(line, SUBSTRING_RULES_LATE)
    return character, None, toggle


toggle = ''

# go through each row
for idx, row in df.iterrows():
    if not pd.isnull(row.character):
        continue

    character, new_line, toggle = attribute_line(row.line, toggle)

    # Cut a trailing, never-closed stage direction. As in the original, this
    # works on the row's unmodified text and overrides any text chosen above.
    # NOTE(review): whether this step ran for every row or only for
    # speaker-less rows could not be recovered from the collapsed source;
    # it is applied to speaker-less rows only. Confirm against the notebook.
    if '(' in row.line:
        new_line = UNCLOSED_PAREN_RE.sub('', row.line)

    if character is not None:
        df.loc[idx, 'character'] = character
    if new_line is not None:
        df.loc[idx, 'line'] = new_line

df = df[df.character != ERASE]

# ---------------------------------------------------------------------------
# 6. Write the final per-episode transcripts and reload them
# ---------------------------------------------------------------------------
# The original iterated range(1, 80) here only, which produced a header-only
# tos_transcript_1.txt for an episode that is never downloaded or read back.
for episode_number in EPISODE_NUMBERS:
    dftemp = df[df.episode_num == str(episode_number)]
    with open('tos_transcript_{}.txt'.format(episode_number), 'w',
              encoding=ENCODING) as f:
        f.write('character\tline\n')
        for character, text in zip(dftemp.character, dftemp.line):
            f.write('{}\t{}\n'.format(character, text))

df = _load_episode_frames('tos_transcript_{}.txt')

df.head()  # notebook display; no effect when run as a script

# ---------------------------------------------------------------------------
# 7. Disambiguate character names
# ---------------------------------------------------------------------------
# Generic role names: kept distinct per episode by appending the episode.
GENERIC_NAMES = frozenset([
    'ENSIGN', 'ROMULAN', 'ELDER', 'MAN', 'NURSE', 'BOTH', 'BARMAN', 'PILOT',
    'KLINGON', 'OFFICER', 'WOMAN', 'ENGINEER', 'CHIEF', 'VOICE', 'GUARD',
    'ALL', 'SECURITY', 'CREWMAN', 'ADMIRAL',
])

# name -> (episode, name used in that episode, name used in any other episode)
EPISODE_SPECIFIC_NAMES = {
    'FARRELL': ('39', 'Farrell (Bodyguard)', 'John Farrell'),
    'MARTHA': ('13', 'Martha Leighton', 'Martha Landon'),
    'ROBERT': ('40', 'Robert Johnson', 'Robert Tomlinson'),
    'COMMANDER': ('9', 'Romulan Commander (Male)', 'Romulan Commander (Female)'),
    'RUTH': ('4', 'Ruth Bonaventure', 'Ruth (Construct)'),
    'NANCY': ('6', 'Nancy Crater', 'Nancy Hedford'),
    'GREEN': ('77', 'Colonel Green', 'Crewman Green'),
    'SAM': ('26', 'SAM (Janus VI)', 'Sam (Crewman)'),
    'HANSEN': ('9', 'Commander Hansen', 'Lt. Hansen'),
    'ALICE': ('17', 'Alice in Wonderland', 'Alice (Android)'),
    'JONES': ('42', 'Cyrano Jones', 'Dr. Jones'),
    'KARA': ('36', 'Kara (Argelian)', 'Kara (Eymorg)'),
    'SYLVIA': ('56', 'Sylvia (Melkotian)', 'Sylvia (Old One)'),
}


def disambiguate(name, epnum):
    """Map a transcript speaker name to a display name unique across episodes.

    Args:
        name: speaker name as it appears in the transcript (upper case).
        epnum: episode identifier as a string.

    Returns:
        The disambiguated display name; names without a special rule are
        title-cased, with a leading "Mcc" re-capitalised to "McC".
    """
    if name == 'ADMIRAL' and epnum == '42':
        return 'Admiral Fitzpatrick'
    if name in GENERIC_NAMES:
        return "{} (ep.{})".format(name.title(), epnum)
    if name in EPISODE_SPECIFIC_NAMES:
        episode, name_in_episode, name_elsewhere = EPISODE_SPECIFIC_NAMES[name]
        return name_in_episode if epnum == episode else name_elsewhere
    return name.title().replace("Mcc", "McC")


# %%time  (notebook cell magic; not valid in a plain Python file)
print("Before disambiguation, there were {} unique character names.".format(len(df.character.unique())))

df['character'] = [disambiguate(name, str(epnum))
                   for name, epnum in zip(df.character, df.episode_num)]

print("After disambiguation, there are {} unique character names.".format(len(df.character.unique())))

# ---------------------------------------------------------------------------
# 8. Aggregate dialogue per character and per episode
# ---------------------------------------------------------------------------
# %%time  (notebook cell magic; not valid in a plain Python file)
_chars = []
_lines = []
_eps = []
_titles = []

for char in df.character.unique():
    char_rows = df[df.character == char]
    _eps_set = set(char_rows.episode_num)
    _chars.append(char)
    _lines.append(' '.join(char_rows.line))
    _eps.append(len(_eps_set))
    # A single episode is stored bare, several episodes as a set.
    if len(_eps_set) == 1:
        _titles.append(list(_eps_set)[0])
    else:
        _titles.append(_eps_set)

dfc = pd.DataFrame({'character': _chars,
                    'dialogue': _lines,
                    'episodes': _eps,
                    'titles': _titles})

_eps = []
_lines2 = []

for epnum in df.episode_num.unique():
    _eps.append(epnum)
    _lines2.append(' '.join(df[df.episode_num == epnum].line))

dfe = pd.DataFrame({'episode': _eps, 'dialogue': _lines2})

dfs = [dfc, dfe]


def blobit(cell):
    """Return the lemmatised, lower-cased words of *cell* as a list."""
    words = TextBlob(cell.lower()).words
    return [Word(x).lemmatize() for x in words]


# A dedicated loop name, so the transcript frame `df` is not rebound.
for frame in dfs:
    frame['blob'] = frame.dialogue.apply(blobit)

dfc['words'] = [len(x) for x in dfc.blob]
dfc['words_per_episode'] = [int(words / episodes)
                            for words, episodes in zip(dfc.words, dfc.episodes)]

dfc.to_csv('star_trek_tos_characters.csv')
dfe.to_csv('star_trek_tos_episodes.csv')

dfc.head()  # notebook display; no effect when run as a script

# ---------------------------------------------------------------------------
# 9. Plot words-per-episode against number of episodes
# ---------------------------------------------------------------------------
bp.output_file("star_trek_tos_char_dialogue.html",
               title="Star Trek TOS Character Dialogue", mode='cdn')

p = bp.figure(plot_width=600, plot_height=450,
              title="Star Trek TOS dialogue plot",
              tools="hover",
              min_border=1,
              x_axis_label="Words per episode",
              y_axis_label="Number of episodes",
              y_range=[0, 85])

p.scatter(
    x=dfc.words_per_episode,
    y=dfc.episodes,
    radius=15,
    line_color="#333333",
    line_width=2,
    alpha=0.7,
    source=bp.ColumnDataSource({"character": dfc.character})
).select(dict(type=HoverTool)).tooltips = {"": "@character"}

bp.show(p)
bp.save(p)

# ---------------------------------------------------------------------------
# 10. Remove the scratch directory
# ---------------------------------------------------------------------------
# Same file pattern as before, without changing the working directory.
for filename in glob.glob(os.path.join(TEMP_DIR, '*.*')):
    os.remove(filename)
os.rmdir(TEMP_DIR)