import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
pd.options.display.mpl_style = 'default'
import brewer2mpl
bmap = brewer2mpl.get_map('Set2','qualitative',8,reverse=False)
colors = bmap.mpl_colors
font = {'family' : 'Open Sans',
'weight' : 'normal',
'size' : 24}
bigfont = {'family' : 'Open Sans',
'weight' : 'normal',
'size' : 36,
'color' : '#333333'}
mpl.rc('font', **font)
mpl.rc('lines', linewidth=5)
mpl.rcParams['legend.fontsize'] = 20
A modified sentiment estimator script will be used here to estimate the sentiment of each episode from the series Avatar: The Last Airbender.
url = 'http://atla.avatarspirit.net/transcripts.php?num='
pages = range(101,121)+range(201,221)+range(301,322)
urls = [url+str(x) for x in pages]
def web_sentiment_estimator(url):
#RECEIVING AND PARSING WEBPAGE TEXT
import requests
html = requests.get(url).text #gets and stores webpage's hmtl source code in string variable 'html'
from bs4 import BeautifulSoup as bs
soup = bs(html) #converts string variable to unicode
souped_text = soup.get_text() #stores unicode text in variable 'souped_text'
encoded_text = souped_text.encode('utf-8') #converts unicode text to utf-8
#EXCLUDE NON-RELEVANT PAGE SECTIONS
top = 'Transcriber'
bottom = '[End Credits]'
text = encoded_text.split(top)[1].split(bottom)[0]
#TOKENIZE TEXT
import nltk
tokens = nltk.word_tokenize(encoded_text) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
alpha = [word for word in tokens if word.isalpha() == True] #filters tokens to exclude non-alphabetic words
#IMPORTING DATAFRAME FUNCTIONALITY
import pandas as pd
alpha_df = pd.DataFrame(alpha)
alpha_df.columns=['Word']
#IMPORTING STOPWORD CORPUS
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
no_stopwords_df = alpha_df[[x not in stopwords for x in alpha_df['Word']]] # excludes stopwords
df = no_stopwords_df
#ESTABLISHING WORD COUNTS FOR EACH UNIQUE WORD
df['Word Count'] = [1 for x in df['Word']]
df = df.groupby(['Word']).count()
df = df.reset_index()
#DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
df['Frequency-Weight'] = 1
df.ix[df['Word Count'] == 1, 'Frequency-Weight'] = 2
df.ix[df['Word Count'] > 10, 'Frequency-Weight'] = 0.5
#CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
df['Share of Words'] = df['Word Count']/df['Word Count'].sum()
#ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
df['Polarity'] = 0
positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope',
'thank', 'joy', 'relief', 'proud', 'Joy',
'generous', 'sympath', 'love', 'amuse', 'Love',
'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
'joy', 'pleasure', 'affection', 'empath', 'friendl']
negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear',
'rage', 'sorrow', 'grief', 'frustrat', 'disappoint',
'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',
'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']
#PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
pos_words_text = requests.get(pos_bank).text
pos_soup = bs(pos_words_text) #converts string variable to unicode
souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'
neg_words_text = requests.get(neg_bank).text
neg_soup = bs(neg_words_text) #converts string variable to unicode
souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
negative_words = nltk.word_tokenize(encoded_neg_words)
df.ix[df.Word.isin(positive_emotions), 'Polarity'] = 1
df.ix[df.Word.isin(negative_emotions), 'Polarity'] = -1
df.ix[df.Word.isin(positive_words), 'Polarity'] = 1
df.ix[df.Word.isin(negative_words), 'Polarity'] = -1
df['Adjusted Sentiment'] = df['Polarity'] * df['Frequency-Weight']
df['Relative Weight'] = df['Word Count'] * df['Adjusted Sentiment']
df['Episode'] = url[-3:]
x = sum(df['Relative Weight'])
if x > 0:
pos_or_neg = 'Positive'
if x == 0:
pos_or_neg = 'Neutral'
if x < 0:
pos_or_neg = 'Negative'
pos_words = len([weight for weight in df['Relative Weight'] if weight > 0])
neg_words = len([weight for weight in df['Relative Weight'] if weight < 0])
#RETURNING PRINTED REPORT
print 'Aggregate text sentiment for episode',str(url[-3:])+':', pos_or_neg
with open(str(url[-3:]+'.csv'), 'w') as f:
df.to_csv(f, sep=',')
f.close()
The estimator script has been modified to write a dataframe for each episode containing sentiment data to an exclusive csv file.
for url in urls:
web_sentiment_estimator(url)
Aggregate text sentiment for episode 101: Negative Aggregate text sentiment for episode 102: Negative Aggregate text sentiment for episode 103: Negative Aggregate text sentiment for episode 104: Negative Aggregate text sentiment for episode 105: Negative Aggregate text sentiment for episode 106: Negative Aggregate text sentiment for episode 107: Negative Aggregate text sentiment for episode 108: Negative Aggregate text sentiment for episode 109: Negative Aggregate text sentiment for episode 110: Negative Aggregate text sentiment for episode 111: Negative Aggregate text sentiment for episode 112: Negative Aggregate text sentiment for episode 113: Negative Aggregate text sentiment for episode 114: Negative Aggregate text sentiment for episode 115: Negative Aggregate text sentiment for episode 116: Negative Aggregate text sentiment for episode 117: Negative Aggregate text sentiment for episode 118: Negative Aggregate text sentiment for episode 119: Negative Aggregate text sentiment for episode 120: Negative Aggregate text sentiment for episode 201: Negative Aggregate text sentiment for episode 202: Negative Aggregate text sentiment for episode 203: Negative Aggregate text sentiment for episode 204: Negative Aggregate text sentiment for episode 205: Negative Aggregate text sentiment for episode 206: Negative Aggregate text sentiment for episode 207: Negative Aggregate text sentiment for episode 208: Negative Aggregate text sentiment for episode 209: Negative Aggregate text sentiment for episode 210: Negative Aggregate text sentiment for episode 211: Negative Aggregate text sentiment for episode 212: Negative Aggregate text sentiment for episode 213: Negative Aggregate text sentiment for episode 214: Negative Aggregate text sentiment for episode 215: Negative Aggregate text sentiment for episode 216: Negative Aggregate text sentiment for episode 217: Negative Aggregate text sentiment for episode 218: Negative Aggregate text sentiment for episode 219: Negative Aggregate text sentiment for episode 220: Negative Aggregate text sentiment for episode 301: Negative Aggregate text sentiment for episode 302: Negative Aggregate text sentiment for episode 303: Negative Aggregate text sentiment for episode 304: Negative Aggregate text sentiment for episode 305: Negative Aggregate text sentiment for episode 306: Positive Aggregate text sentiment for episode 307: Negative Aggregate text sentiment for episode 308: Negative Aggregate text sentiment for episode 309: Negative Aggregate text sentiment for episode 310: Negative Aggregate text sentiment for episode 311: Negative Aggregate text sentiment for episode 312: Negative Aggregate text sentiment for episode 313: Negative Aggregate text sentiment for episode 314: Negative Aggregate text sentiment for episode 315: Negative Aggregate text sentiment for episode 316: Negative Aggregate text sentiment for episode 317: Negative Aggregate text sentiment for episode 318: Negative Aggregate text sentiment for episode 319: Negative Aggregate text sentiment for episode 320: Negative Aggregate text sentiment for episode 321: Negative
Now that we've stored dataframes for each episode individually, we can import them individually to review episodes one at a time. All we need to do is call the file associated with the episode in question for review, which has been saved by web_sentiment_estimator() as its episode number code (eg. '101.csv').
def lite_sentiment_estimator(url):
#RECEIVING AND PARSING WEBPAGE TEXT
import requests
html = requests.get(url).text #gets and stores webpage's hmtl source code in string variable 'html'
from bs4 import BeautifulSoup as bs
soup = bs(html)
souped_text = soup.get_text() #stores unicode text in variable 'souped_text'
encoded_text = souped_text.encode('utf-8') #converts unicode text to utf-8
#EXCLUDE NON-RELEVANT PAGE SECTIONS
top = 'Transcriber'
bottom = '[End Credits]'
text = encoded_text.split(top)[1].split(bottom)[0]
#TOKENIZE TEXT
import nltk
tokens = nltk.word_tokenize(text) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
alpha = [word for word in tokens if word.isalpha() == True] #filters tokens to exclude non-alphabetic words
#IMPORTING DATAFRAME FUNCTIONALITY
import pandas as pd
alpha_df = pd.DataFrame(alpha)
alpha_df.columns=['Word']
#IMPORTING STOPWORD CORPUS
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
df = alpha_df[[x not in stopwords for x in alpha_df['Word']]] # excludes stopwords
#ESTABLISHING WORD COUNTS FOR EACH UNIQUE WORD
df['Word Count'] = [1 for x in df['Word']]
df = df.groupby(['Word']).count()
df = df.reset_index()
#DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
df['Frequency-Weight'] = 1
df.ix[df['Word Count'] == 1, 'Frequency-Weight'] = 2
df.ix[df['Word Count'] > 10, 'Frequency-Weight'] = 0.5
#CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
df['Share of Words'] = df['Word Count']/df['Word Count'].sum()
#ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
df['Polarity'] = 0
positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope',
'thank', 'joy', 'relief', 'proud', 'Joy',
'generous', 'sympath', 'love', 'amuse', 'Love',
'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
'joy', 'pleasure', 'affection', 'empath', 'friendl']
negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear',
'rage', 'sorrow', 'grief', 'frustrat', 'disappoint',
'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',
'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']
#PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
pos_words_text = requests.get(pos_bank).text
pos_soup = bs(pos_words_text) #converts string variable to unicode
souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'
neg_words_text = requests.get(neg_bank).text
neg_soup = bs(neg_words_text) #converts string variable to unicode
souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
negative_words = nltk.word_tokenize(encoded_neg_words)
df.ix[df.Word.isin(positive_emotions), 'Polarity'] = 1
df.ix[df.Word.isin(negative_emotions), 'Polarity'] = -1
df.ix[df.Word.isin(positive_words), 'Polarity'] = 1
df.ix[df.Word.isin(negative_words), 'Polarity'] = -1
df['Adjusted Sentiment'] = df['Polarity'] * df['Frequency-Weight']
df['Relative Weight'] = df['Word Count'] * df['Adjusted Sentiment']
pos = sum(df[[x > 0 for x in df['Relative Weight']]].sort('Relative Weight', ascending=False)['Relative Weight'])
neg = sum(df[[x < 0 for x in df['Relative Weight']]].sort('Relative Weight', ascending=False)['Relative Weight'])
agg = abs(pos) + abs(neg)
bal = pos + neg
per = bal / agg
print 'Aggregate text sentiment:', str(per)+'% ---', 'Episode',str(url[-3:])
for url in urls:
lite_sentiment_estimator(url)
Aggregate text sentiment: -0.269736842105% --- Episode 101 Aggregate text sentiment: -0.223443223443% --- Episode 102 Aggregate text sentiment: -0.178893178893% --- Episode 103 Aggregate text sentiment: -0.13249651325% --- Episode 104 Aggregate text sentiment: -0.0864946889226% --- Episode 105 Aggregate text sentiment: -0.129814550642% --- Episode 106 Aggregate text sentiment: -0.101449275362% --- Episode 107 Aggregate text sentiment: -0.127035830619% --- Episode 108 Aggregate text sentiment: -0.148514851485% --- Episode 109 Aggregate text sentiment: -0.262905162065% --- Episode 110 Aggregate text sentiment: -0.347222222222% --- Episode 111 Aggregate text sentiment: -0.270704573548% --- Episode 112 Aggregate text sentiment: -0.384885764499% --- Episode 113 Aggregate text sentiment: -0.125313283208% --- Episode 114 Aggregate text sentiment: -0.173216885007% --- Episode 115 Aggregate text sentiment: -0.18041958042% --- Episode 116 Aggregate text sentiment: -0.137142857143% --- Episode 117 Aggregate text sentiment: -0.0945558739255% --- Episode 118 Aggregate text sentiment: -0.109570041609% --- Episode 119 Aggregate text sentiment: -0.321196358908% --- Episode 120 Aggregate text sentiment: -0.29451287794% --- Episode 201 Aggregate text sentiment: -0.205846528624% --- Episode 202 Aggregate text sentiment: -0.19733924612% --- Episode 203 Aggregate text sentiment: -0.374449339207% --- Episode 204 Aggregate text sentiment: -0.194614443084% --- Episode 205 Aggregate text sentiment: -0.115423901941% --- Episode 206 Aggregate text sentiment: -0.266149870801% --- Episode 207 Aggregate text sentiment: -0.166051660517% --- Episode 208 Aggregate text sentiment: -0.127937336815% --- Episode 209 Aggregate text sentiment: -0.162416107383% --- Episode 210 Aggregate text sentiment: -0.202702702703% --- Episode 211 Aggregate text sentiment: -0.150550795594% --- Episode 212 Aggregate text sentiment: -0.295454545455% --- Episode 213 Aggregate text sentiment: -0.08038585209% --- Episode 214 Aggregate text sentiment: -0.0402010050251% --- Episode 215 Aggregate text sentiment: -0.420989143546% --- Episode 216 Aggregate text sentiment: -0.384615384615% --- Episode 217 Aggregate text sentiment: -0.156732891832% --- Episode 218 Aggregate text sentiment: -0.0432946145723% --- Episode 219 Aggregate text sentiment: -0.171945701357% --- Episode 220 Aggregate text sentiment: -0.240506329114% --- Episode 301 Aggregate text sentiment: -0.155015197568% --- Episode 302 Aggregate text sentiment: -0.315573770492% --- Episode 303 Aggregate text sentiment: -0.123882503193% --- Episode 304 Aggregate text sentiment: -0.168704156479% --- Episode 305 Aggregate text sentiment: -0.010101010101% --- Episode 306 Aggregate text sentiment: -0.241379310345% --- Episode 307 Aggregate text sentiment: -0.25037037037% --- Episode 308 Aggregate text sentiment: -0.269121813031% --- Episode 309 Aggregate text sentiment: -0.387045813586% --- Episode 310 Aggregate text sentiment: -0.151515151515% --- Episode 311 Aggregate text sentiment: -0.288732394366% --- Episode 312 Aggregate text sentiment: -0.24% --- Episode 313 Aggregate text sentiment: -0.407894736842% --- Episode 314 Aggregate text sentiment: -0.383512544803% --- Episode 315 Aggregate text sentiment: -0.35453100159% --- Episode 316 Aggregate text sentiment: -0.230046948357% --- Episode 317 Aggregate text sentiment: -0.16290726817% --- Episode 318 Aggregate text sentiment: -0.0648967551622% --- Episode 319 Aggregate text sentiment: -0.395424836601% --- Episode 320 Aggregate text sentiment: -0.207715133531% --- Episode 321
We can now view the episodes in order from most negative to least negative
ref = pd.read_csv('ATLA Sentiment Balance Results.csv')
reader = pd.read_csv('Avatar Numbers.csv')
ep_list = reader[['Series', 'Episode', 'Title', 'Viewership (Mil)']]
df = ref.join(ep_list['Title'], lsuffix='Episode')
df = df.join(ep_list['Viewership (Mil)'], lsuffix='Episode')
df['Viewership (Mil)'] = [float(x) for x in df['Viewership (Mil)']]
df['Episode'] = [str(x) for x in df['Episode']]
df = df[['Episode', 'Title', 'Total Non-Neutral Weight', 'Sentiment Balance', 'Balance Percentage', 'Viewership (Mil)']]
df.columns = ['Episode Number', 'Episode', 'Total Non-Neutral Weight', 'Sentiment Balance', 'Balance Percentage', 'Viewership (Mil)']
df.sort(columns='Balance Percentage', ascending=False).head(20) #Least negative episodes
#df.sort(columns='Balance Percentage').head() #Most negative episodes
Episode Number | Episode | Total Non-Neutral Weight | Sentiment Balance | Balance Percentage | Viewership (Mil) | |
---|---|---|---|---|---|---|
45 | 306 | The Avatar and the Firelord | 396.0 | -4.0 | -0.010 | 3.20 |
34 | 215 | Tales of Ba Sing Se | 398.0 | -16.0 | -0.040 | 3.12 |
38 | 219 | The Guru | 473.5 | -20.5 | -0.043 | 4.40 |
58 | 319 | Sozin's Comet, Part 2: The Old Masters | 339.0 | -22.0 | -0.065 | 5.59 |
33 | 214 | City of Walls and Secrets | 311.0 | -25.0 | -0.080 | 3.27 |
4 | 105 | The King of Omashu | 329.5 | -28.5 | -0.086 | 3.54 |
17 | 118 | The Waterbending Master | 349.0 | -33.0 | -0.095 | 3.50 |
6 | 107 | The Spirit World (Winter Solstice, Part 1) | 310.5 | -31.5 | -0.101 | 3.29 |
18 | 119 | The Siege of the North, Part 1 | 360.5 | -39.5 | -0.110 | 3.42 |
25 | 206 | The Blind Bandit | 489.5 | -56.5 | -0.115 | 3.33 |
43 | 304 | Sokka's Master | 391.5 | -48.5 | -0.124 | 3.22 |
13 | 114 | The Fortuneteller | 399.0 | -50.0 | -0.125 | 2.05 |
7 | 108 | Avatar Roku (Winter Solstice, Part 2) | 307.0 | -39.0 | -0.127 | 3.12 |
28 | 209 | Bitter Work | 383.0 | -49.0 | -0.128 | 3.26 |
5 | 106 | Imprisoned | 350.5 | -45.5 | -0.130 | 3.38 |
3 | 104 | The Warriors of Kyoshi | 358.5 | -47.5 | -0.132 | 3.47 |
16 | 117 | The Northern Air Temple | 350.0 | -48.0 | -0.137 | 1.68 |
8 | 109 | The Waterbending Scroll | 404.0 | -60.0 | -0.149 | 3.15 |
31 | 212 | The Serpent's Pass | 408.5 | -61.5 | -0.151 | 4.10 |
50 | 311 | The Day of Black Sun, Part 2: The Eclipse | 330.0 | -50.0 | -0.152 | 3.77 |
First, to figure out how to decide which episodes to look deeper into:
least_negative = ref.sort(columns='Sentiment Balance', ascending=False)['Episode'].head(10)
For further reference, a quick function will be created and used to return a quick-view list of any episodes specified
def get_title(episode):
print ep_list[ep_list['Episode'].isin([int(episode)])]
for episode in least_negative:
get_title(episode)
Series Episode Title Viewership (Mil) 45 ATLA 306 The Avatar and the Firelord 3.2 Series Episode Title Viewership (Mil) 34 ATLA 215 Tales of Ba Sing Se 3.12 Series Episode Title Viewership (Mil) 38 ATLA 219 The Guru 4.4 Series Episode Title Viewership (Mil) 58 ATLA 319 Sozin's Comet, Part 2: The Old Masters 5.59 Series Episode Title Viewership (Mil) 33 ATLA 214 City of Walls and Secrets 3.27 Series Episode Title Viewership (Mil) 4 ATLA 105 The King of Omashu 3.54 Series Episode Title \ 6 ATLA 107 The Spirit World (Winter Solstice, Part 1) Viewership (Mil) 6 3.29 Series Episode Title Viewership (Mil) 17 ATLA 118 The Waterbending Master 3.5 Series Episode Title Viewership (Mil) 7 ATLA 108 Avatar Roku (Winter Solstice, Part 2) 3.12 Series Episode Title Viewership (Mil) 18 ATLA 119 The Siege of the North, Part 1 3.42
Now that we have the means to select which episodes to dive into, let's start with some of the least negative episodes
get_title(319)
Series Episode Title Viewership (Mil) 58 ATLA 319 Sozin's Comet, Part 2: The Old Masters 5.59
ep = 319
sort = False
ep_viewer = pd.read_csv(str(ep)+'.csv').drop('Unnamed: 0', axis=1).sort(columns='Relative Weight', ascending=sort)
#viewer = viewer.drop('Unnamed: 0', axis=1)
#viewer = viewer.sort(columns='Relative Weight', ascending=False)
ep_viewer.head()
Word | Word Count | Frequency-Weight | Share of Words | Polarity | Adjusted Sentiment | Relative Weight | Episode | |
---|---|---|---|---|---|---|---|---|
689 | like | 8 | 1.0 | 0.001926 | 1 | 1.0 | 8.0 | 319 |
988 | smiles | 7 | 1.0 | 0.001686 | 1 | 1.0 | 7.0 | 319 |
893 | right | 13 | 0.5 | 0.003130 | 1 | 0.5 | 6.5 | 319 |
989 | smiling | 5 | 1.0 | 0.001204 | 1 | 1.0 | 5.0 | 319 |
1108 | top | 5 | 1.0 | 0.001204 | 1 | 1.0 | 5.0 | 319 |
The top 3-4 episodes in each list seem accurate, however as we attempt to estimate beyond the 5th most positive/negative episodes, sentiment begins to behave ambiguously. Perhaps it is fan-bias, but The Desert shouldn't be a relatively neutral episode.
As it stands, the estimator processes dialogue as well as stage directions included in each episode. Perhaps these directions (which include a large number of adjectives and adverbs' are skewing the estimator? Otherwise, the wordbanks need to be refined to better fit the content in the series.
df
Episode Number | Episode | Total Non-Neutral Weight | Sentiment Balance | Balance Percentage | Viewership (Mil) | |
---|---|---|---|---|---|---|
0 | 101 | The Boy in the Iceberg | 304.0 | -82.0 | -0.270 | 3.47 |
1 | 102 | The Avatar Returns | 273.0 | -61.0 | -0.223 | 3.47 |
2 | 103 | The Southern Air Temple | 388.5 | -69.5 | -0.179 | 3.41 |
3 | 104 | The Warriors of Kyoshi | 358.5 | -47.5 | -0.132 | 3.47 |
4 | 105 | The King of Omashu | 329.5 | -28.5 | -0.086 | 3.54 |
5 | 106 | Imprisoned | 350.5 | -45.5 | -0.130 | 3.38 |
6 | 107 | The Spirit World (Winter Solstice, Part 1) | 310.5 | -31.5 | -0.101 | 3.29 |
7 | 108 | Avatar Roku (Winter Solstice, Part 2) | 307.0 | -39.0 | -0.127 | 3.12 |
8 | 109 | The Waterbending Scroll | 404.0 | -60.0 | -0.149 | 3.15 |
9 | 110 | Jet | 416.5 | -109.5 | -0.263 | 3.40 |
10 | 111 | The Great Divide | 432.0 | -150.0 | -0.347 | 3.10 |
11 | 112 | The Storm | 404.5 | -109.5 | -0.271 | 3.25 |
12 | 113 | The Blue Spirit | 284.5 | -109.5 | -0.385 | 3.19 |
13 | 114 | The Fortuneteller | 399.0 | -50.0 | -0.125 | 2.05 |
14 | 115 | Bato of the Water Tribe | 343.5 | -59.5 | -0.173 | 3.37 |
15 | 116 | The Deserter | 357.5 | -64.5 | -0.180 | 3.17 |
16 | 117 | The Northern Air Temple | 350.0 | -48.0 | -0.137 | 1.68 |
17 | 118 | The Waterbending Master | 349.0 | -33.0 | -0.095 | 3.50 |
18 | 119 | The Siege of the North, Part 1 | 360.5 | -39.5 | -0.110 | 3.42 |
19 | 120 | The Siege of the North, Part 2 | 384.5 | -123.5 | -0.321 | 3.42 |
20 | 201 | The Avatar State | 446.5 | -131.5 | -0.295 | 3.38 |
21 | 202 | The Cave of Two Lovers | 410.5 | -84.5 | -0.206 | 3.27 |
22 | 203 | Return to Omashu | 451.0 | -89.0 | -0.197 | 3.20 |
23 | 204 | The Swamp | 340.5 | -127.5 | -0.374 | 3.10 |
24 | 205 | Avatar Day | 408.5 | -79.5 | -0.195 | 3.11 |
25 | 206 | The Blind Bandit | 489.5 | -56.5 | -0.115 | 3.33 |
26 | 207 | Zuko Alone | 387.0 | -103.0 | -0.266 | 3.33 |
27 | 208 | The Chase | 406.5 | -67.5 | -0.166 | 3.33 |
28 | 209 | Bitter Work | 383.0 | -49.0 | -0.128 | 3.26 |
29 | 210 | The Library | 372.5 | -60.5 | -0.162 | 3.18 |
... | ... | ... | ... | ... | ... | ... |
31 | 212 | The Serpent's Pass | 408.5 | -61.5 | -0.151 | 4.10 |
32 | 213 | The Drill | 440.0 | -130.0 | -0.295 | 4.10 |
33 | 214 | City of Walls and Secrets | 311.0 | -25.0 | -0.080 | 3.27 |
34 | 215 | Tales of Ba Sing Se | 398.0 | -16.0 | -0.040 | 3.12 |
35 | 216 | Appa's Lost Days | 414.5 | -174.5 | -0.421 | 3.54 |
36 | 217 | Lake Laogai | 416.0 | -160.0 | -0.385 | 3.27 |
37 | 218 | The Earth King | 453.0 | -71.0 | -0.157 | 3.76 |
38 | 219 | The Guru | 473.5 | -20.5 | -0.043 | 4.40 |
39 | 220 | The Crossroads of Destiny | 442.0 | -76.0 | -0.172 | 4.40 |
40 | 301 | The Awakening | 316.0 | -76.0 | -0.241 | 3.06 |
41 | 302 | The Headband | 329.0 | -51.0 | -0.155 | 3.06 |
42 | 303 | The Painted Lady | 244.0 | -77.0 | -0.316 | 3.22 |
43 | 304 | Sokka's Master | 391.5 | -48.5 | -0.124 | 3.22 |
44 | 305 | The Beach | 409.0 | -69.0 | -0.169 | 3.22 |
45 | 306 | The Avatar and the Firelord | 396.0 | -4.0 | -0.010 | 3.20 |
46 | 307 | The Runaway | 261.0 | -63.0 | -0.241 | 3.22 |
47 | 308 | The Puppetmaster | 337.5 | -84.5 | -0.250 | 3.52 |
48 | 309 | Nightmares and Daydreams | 353.0 | -95.0 | -0.269 | 3.52 |
49 | 310 | The Day of Black Sun, Part 1: The Invasion | 316.5 | -122.5 | -0.387 | 3.77 |
50 | 311 | The Day of Black Sun, Part 2: The Eclipse | 330.0 | -50.0 | -0.152 | 3.77 |
51 | 312 | The Western Air Temple | 426.0 | -123.0 | -0.289 | 3.55 |
52 | 313 | The Firebending Masters | 375.0 | -90.0 | -0.240 | 3.55 |
53 | 314 | The Boiling Rock, Part 1 | 304.0 | -124.0 | -0.408 | 3.97 |
54 | 315 | The Boiling Rock, Part 2 | 279.0 | -107.0 | -0.384 | 3.97 |
55 | 316 | The Southern Raiders | 314.5 | -111.5 | -0.355 | 4.23 |
56 | 317 | The Ember Island Players | 426.0 | -98.0 | -0.230 | 4.53 |
57 | 318 | Sozin's Comet, Part 1: The Phoenix King | 399.0 | -65.0 | -0.163 | 5.59 |
58 | 319 | Sozin's Comet, Part 2: The Old Masters | 339.0 | -22.0 | -0.065 | 5.59 |
59 | 320 | Sozin's Comet, Part 3: Into the Inferno | 306.0 | -121.0 | -0.395 | 5.59 |
60 | 321 | Sozin's Comet, Part 4: Avatar Aang | 337.0 | -70.0 | -0.208 | 5.59 |
61 rows × 6 columns
df[['Episode',
'Balance Percentage',
'Viewership (Mil)']].plot(
kind='bar',
figsize=(50,10),
x='Episode',
title='Avatar: The Last Airbender',
#marker='.',
#markersize=20
)
<matplotlib.axes._subplots.AxesSubplot at 0x186ab3c8>
import requests
from bs4 import BeautifulSoup as bs
import nltk
import re
raw_html = [requests.get(url).text for url in urls]
#EXCLUDE NON-RELEVANT PAGE SECTIONS
top = 'Transcriber'
bottom = '[End Credits]'
slim_html = [x.split(top)[1].split(bottom)[0] for x in raw_html]
#html = str()
#html = slim_html - formatted
soup = [bs(x) for x in slim_html]
souped_text = [x.get_text() for x in soup]
encoded_text = [x.encode('utf-8') for x in souped_text]
#TOKENIZE TEXT
import nltk
tokens = [nltk.word_tokenize(x) for x in encoded_text] #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
sum([len(x) for x in tokens]) #total words
492928
#Stripping non-alphabetic items
consolidated_text = list()
for row in tokens:
consolidated_text += [word for word in row]
alpha = [word for word in consolidated_text if word.isalpha() == True]
len(alpha)
392107
a_df = pd.DataFrame(alpha)
a_df.columns = ['Word']
a_df['Word Count'] = [1 for x in a_df['Word']]
a_df = a_df.groupby('Word').count()
a_df = a_df.reset_index()
#IMPORTING STOPWORD CORPUS
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
a_df = a_df[[x not in stopwords for x in a_df['Word']]]
#DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
a_df['Frequency-Weight'] = 1
a_df.ix[a_df['Word Count'] == 1, 'Frequency-Weight'] = 2
a_df.ix[a_df['Word Count'] > 10, 'Frequency-Weight'] = 0.5
#CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
a_df['Share of Words'] = a_df['Word Count']/a_df['Word Count'].sum()
#ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
a_df['Polarity'] = 0
positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope',
'thank', 'joy', 'relief', 'proud', 'Joy',
'generous', 'sympath', 'love', 'amuse', 'Love',
'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
'joy', 'pleasure', 'affection', 'empath', 'friendl']
negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear',
'rage', 'sorrow', 'grief', 'frustrat', 'disappoint',
'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',
'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']
#PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
pos_words_text = requests.get(pos_bank).text
pos_soup = bs(pos_words_text) #converts string variable to unicode
souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'
neg_words_text = requests.get(neg_bank).text
neg_soup = bs(neg_words_text) #converts string variable to unicode
souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
negative_words = nltk.word_tokenize(encoded_neg_words)
a_df.ix[a_df.Word.isin(positive_emotions), 'Polarity'] = 1
a_df.ix[a_df.Word.isin(negative_emotions), 'Polarity'] = -1
a_df.ix[a_df.Word.isin(positive_words), 'Polarity'] = 1
a_df.ix[a_df.Word.isin(negative_words), 'Polarity'] = -1
a_df['Adjusted Sentiment'] = a_df['Polarity'] * a_df['Frequency-Weight']
a_df['Relative Weight'] = a_df['Word Count'] * a_df['Adjusted Sentiment']
a_df[a_df['Relative Weight'] != 0].sort(columns='Relative Weight', ascending=False)
Word | Word Count | Frequency-Weight | Share of Words | Polarity | Adjusted Sentiment | Relative Weight | |
---|---|---|---|---|---|---|---|
10188 | right | 639 | 0.5 | 0.002847 | 1 | 0.5 | 319.5 |
7803 | like | 615 | 0.5 | 0.002740 | 1 | 0.5 | 307.5 |
12364 | top | 280 | 0.5 | 0.001247 | 1 | 0.5 | 140.0 |
11106 | smiles | 204 | 0.5 | 0.000909 | 1 | 0.5 | 102.0 |
6471 | good | 168 | 0.5 | 0.000748 | 1 | 0.5 | 84.0 |
6534 | great | 157 | 0.5 | 0.000699 | 1 | 0.5 | 78.5 |
11109 | smiling | 156 | 0.5 | 0.000695 | 1 | 0.5 | 78.0 |
9807 | ready | 127 | 0.5 | 0.000566 | 1 | 0.5 | 63.5 |
3961 | clearly | 102 | 0.5 | 0.000454 | 1 | 0.5 | 51.0 |
11105 | smile | 98 | 0.5 | 0.000437 | 1 | 0.5 | 49.0 |
13151 | well | 95 | 0.5 | 0.000423 | 1 | 0.5 | 47.5 |
3141 | better | 91 | 0.5 | 0.000405 | 1 | 0.5 | 45.5 |
5474 | enough | 80 | 0.5 | 0.000356 | 1 | 0.5 | 40.0 |
7933 | love | 79 | 0.5 | 0.000352 | 1 | 0.5 | 39.5 |
13336 | work | 73 | 0.5 | 0.000325 | 1 | 0.5 | 36.5 |
11883 | surprise | 69 | 0.5 | 0.000307 | 1 | 0.5 | 34.5 |
9472 | pretty | 66 | 0.5 | 0.000294 | 1 | 0.5 | 33.0 |
7691 | leading | 65 | 0.5 | 0.000290 | 1 | 0.5 | 32.5 |
8089 | master | 65 | 0.5 | 0.000290 | 1 | 0.5 | 32.5 |
6976 | hope | 63 | 0.5 | 0.000281 | 1 | 0.5 | 31.5 |
7003 | hot | 54 | 0.5 | 0.000241 | 1 | 0.5 | 27.0 |
8542 | nice | 54 | 0.5 | 0.000241 | 1 | 0.5 | 27.0 |
9413 | powerful | 51 | 0.5 | 0.000227 | 1 | 0.5 | 25.5 |
6731 | happy | 51 | 0.5 | 0.000227 | 1 | 0.5 | 25.5 |
6205 | free | 49 | 0.5 | 0.000218 | 1 | 0.5 | 24.5 |
3958 | clear | 47 | 0.5 | 0.000209 | 1 | 0.5 | 23.5 |
3046 | beautiful | 46 | 0.5 | 0.000205 | 1 | 0.5 | 23.0 |
6443 | glow | 46 | 0.5 | 0.000205 | 1 | 0.5 | 23.0 |
3133 | best | 44 | 0.5 | 0.000196 | 1 | 0.5 | 22.0 |
7320 | instantly | 41 | 0.5 | 0.000183 | 1 | 0.5 | 20.5 |
... | ... | ... | ... | ... | ... | ... | ... |
13351 | worry | 55 | 0.5 | 0.000245 | -1 | -0.5 | -27.5 |
4427 | crazy | 55 | 0.5 | 0.000245 | -1 | -0.5 | -27.5 |
11739 | struggles | 58 | 0.5 | 0.000258 | -1 | -0.5 | -29.0 |
2896 | backward | 62 | 0.5 | 0.000276 | -1 | -0.5 | -31.0 |
2899 | bad | 62 | 0.5 | 0.000276 | -1 | -0.5 | -31.0 |
10777 | shocked | 63 | 0.5 | 0.000281 | -1 | -0.5 | -31.5 |
5949 | fist | 64 | 0.5 | 0.000285 | -1 | -0.5 | -32.0 |
13399 | wrong | 64 | 0.5 | 0.000285 | -1 | -0.5 | -32.0 |
7923 | lost | 65 | 0.5 | 0.000290 | -1 | -0.5 | -32.5 |
12041 | tanks | 70 | 0.5 | 0.000312 | -1 | -0.5 | -35.0 |
10776 | shock | 70 | 0.5 | 0.000312 | -1 | -0.5 | -35.0 |
6735 | hard | 72 | 0.5 | 0.000321 | -1 | -0.5 | -36.0 |
8915 | pan | 73 | 0.5 | 0.000325 | -1 | -0.5 | -36.5 |
2581 | angrily | 77 | 0.5 | 0.000343 | -1 | -0.5 | -38.5 |
3734 | cave | 79 | 0.5 | 0.000352 | -1 | -0.5 | -39.5 |
3406 | breaks | 83 | 0.5 | 0.000370 | -1 | -0.5 | -41.5 |
5259 | dust | 85 | 0.5 | 0.000379 | -1 | -0.5 | -42.5 |
8339 | monster | 85 | 0.5 | 0.000379 | -1 | -0.5 | -42.5 |
11273 | sorry | 87 | 0.5 | 0.000388 | -1 | -0.5 | -43.5 |
7974 | lying | 88 | 0.5 | 0.000392 | -1 | -0.5 | -44.0 |
5749 | falling | 89 | 0.5 | 0.000397 | -1 | -0.5 | -44.5 |
2802 | attack | 92 | 0.5 | 0.000410 | -1 | -0.5 | -46.0 |
11115 | smoke | 93 | 0.5 | 0.000414 | -1 | -0.5 | -46.5 |
4627 | dark | 93 | 0.5 | 0.000414 | -1 | -0.5 | -46.5 |
2582 | angry | 98 | 0.5 | 0.000437 | -1 | -0.5 | -49.0 |
4007 | cloud | 98 | 0.5 | 0.000437 | -1 | -0.5 | -49.0 |
5747 | fall | 104 | 0.5 | 0.000463 | -1 | -0.5 | -52.0 |
8462 | na | 182 | 0.5 | 0.000811 | -1 | -0.5 | -91.0 |
5750 | falls | 220 | 0.5 | 0.000980 | -1 | -0.5 | -110.0 |
11064 | slowly | 249 | 0.5 | 0.001109 | -1 | -0.5 | -124.5 |
1893 rows × 7 columns
a_df['Relative Weight'].sum()
-2181.0
pos_20 = a_df.sort(columns='Relative Weight', ascending=False).head(50)
neg_20 = a_df.sort(columns='Relative Weight', ascending=True).head(50)
top_10_negative_words = [
'fall',
'angry',
'cloud', #remember the episode with the fortune teller?
'dark',
'smoke',
'attack',
'lying',
'sorry',
'monster',
'tanks',
'lost',
'wrong',
'bad',
'crazy'
]
top_10_positive_words = [
'like',
'good',
'great',
'smile',
'love',
'surprise',
'pretty',
'master',
'hope',
'hot',
'powerful',
'beautiful',
'happy',
'free'
]