In [1]:

import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt

pd.options.display.mpl_style = 'default'

import brewer2mpl
bmap = brewer2mpl.get_map('Set2','qualitative',8,reverse=False)
colors = bmap.mpl_colors
font = {'family' : 'Open Sans',
        'weight' : 'normal',
        'size'   : 24}
bigfont = {'family' : 'Open Sans',
        'weight' : 'normal',
        'size'   : 36,
        'color'  : '#333333'}

mpl.rc('font', **font)
mpl.rc('lines', linewidth=5)
mpl.rcParams['legend.fontsize'] = 20

Web Sentiment Estimator¶

A modified sentiment estimator script will be used here to estimate the sentiment of each episode from the series Avatar: The Last Airbender.

In [2]:

url = 'http://atla.avatarspirit.net/transcripts.php?num='
pages = range(101,121)+range(201,221)+range(301,322)
urls = [url+str(x) for x in pages]

In [3]:

def web_sentiment_estimator(url):

    #RECEIVING AND PARSING WEBPAGE TEXT
    import requests
    html = requests.get(url).text #gets and stores webpage's hmtl source code in string variable 'html'

    from bs4 import BeautifulSoup as bs
    soup = bs(html) #converts string variable to unicode
    souped_text = soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_text = souped_text.encode('utf-8') #converts unicode text to utf-8

    #EXCLUDE NON-RELEVANT PAGE SECTIONS
    top = 'Transcriber'
    bottom = '[End Credits]'
    text = encoded_text.split(top)[1].split(bottom)[0]
    
    #TOKENIZE TEXT
    import nltk
    tokens = nltk.word_tokenize(encoded_text) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
    alpha = [word for word in tokens if word.isalpha() == True] #filters tokens to exclude non-alphabetic words

    #IMPORTING DATAFRAME FUNCTIONALITY
    import pandas as pd
    alpha_df = pd.DataFrame(alpha)
    alpha_df.columns=['Word']

    #IMPORTING STOPWORD CORPUS
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    no_stopwords_df = alpha_df[[x not in stopwords for x in alpha_df['Word']]] # excludes stopwords
    df = no_stopwords_df

    #ESTABLISHING WORD COUNTS FOR EACH UNIQUE WORD
    df['Word Count'] = [1 for x in df['Word']]
    df = df.groupby(['Word']).count()
    df = df.reset_index()

    #DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
    df['Frequency-Weight'] = 1
    df.ix[df['Word Count'] == 1, 'Frequency-Weight'] = 2
    df.ix[df['Word Count'] > 10, 'Frequency-Weight'] = 0.5

    #CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
    df['Share of Words'] = df['Word Count']/df['Word Count'].sum()

    #ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
    df['Polarity'] = 0

    positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope', 
                     'thank', 'joy', 'relief', 'proud', 'Joy', 
                     'generous', 'sympath', 'love', 'amuse', 'Love',
                     'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
                     'joy', 'pleasure', 'affection', 'empath', 'friendl']
    negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear', 
                     'rage', 'sorrow', 'grief', 'frustrat', 'disappoint', 
                     'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',  
                     'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
                     'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
                     'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']

    
    #PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
    pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
    pos_words_text = requests.get(pos_bank).text
    pos_soup = bs(pos_words_text) #converts string variable to unicode
    souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
    positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'

    neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'    
    neg_words_text = requests.get(neg_bank).text
    neg_soup = bs(neg_words_text) #converts string variable to unicode
    souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
    negative_words = nltk.word_tokenize(encoded_neg_words)

    df.ix[df.Word.isin(positive_emotions), 'Polarity'] = 1
    df.ix[df.Word.isin(negative_emotions), 'Polarity'] = -1
    df.ix[df.Word.isin(positive_words), 'Polarity'] = 1
    df.ix[df.Word.isin(negative_words), 'Polarity'] = -1

    df['Adjusted Sentiment'] = df['Polarity'] * df['Frequency-Weight']
    df['Relative Weight'] = df['Word Count'] * df['Adjusted Sentiment']
    df['Episode'] = url[-3:]
    
    x = sum(df['Relative Weight'])
    if x > 0:
        pos_or_neg = 'Positive'
    if x == 0:
        pos_or_neg = 'Neutral'
    if x < 0:
        pos_or_neg = 'Negative'
        
    pos_words = len([weight for weight in df['Relative Weight'] if weight > 0])
    neg_words = len([weight for weight in df['Relative Weight'] if weight < 0])
        
    #RETURNING PRINTED REPORT
    print 'Aggregate text sentiment for episode',str(url[-3:])+':', pos_or_neg

    with open(str(url[-3:]+'.csv'), 'w') as f:
        df.to_csv(f, sep=',')
        f.close()

The estimator script has been modified to write a dataframe for each episode containing sentiment data to an exclusive csv file.

In [4]:

for url in urls:
    web_sentiment_estimator(url)

Aggregate text sentiment for episode 101: Negative
Aggregate text sentiment for episode 102: Negative
Aggregate text sentiment for episode 103: Negative
Aggregate text sentiment for episode 104: Negative
Aggregate text sentiment for episode 105: Negative
Aggregate text sentiment for episode 106: Negative
Aggregate text sentiment for episode 107: Negative
Aggregate text sentiment for episode 108: Negative
Aggregate text sentiment for episode 109: Negative
Aggregate text sentiment for episode 110: Negative
Aggregate text sentiment for episode 111: Negative
Aggregate text sentiment for episode 112: Negative
Aggregate text sentiment for episode 113: Negative
Aggregate text sentiment for episode 114: Negative
Aggregate text sentiment for episode 115: Negative
Aggregate text sentiment for episode 116: Negative
Aggregate text sentiment for episode 117: Negative
Aggregate text sentiment for episode 118: Negative
Aggregate text sentiment for episode 119: Negative
Aggregate text sentiment for episode 120: Negative
Aggregate text sentiment for episode 201: Negative
Aggregate text sentiment for episode 202: Negative
Aggregate text sentiment for episode 203: Negative
Aggregate text sentiment for episode 204: Negative
Aggregate text sentiment for episode 205: Negative
Aggregate text sentiment for episode 206: Negative
Aggregate text sentiment for episode 207: Negative
Aggregate text sentiment for episode 208: Negative
Aggregate text sentiment for episode 209: Negative
Aggregate text sentiment for episode 210: Negative
Aggregate text sentiment for episode 211: Negative
Aggregate text sentiment for episode 212: Negative
Aggregate text sentiment for episode 213: Negative
Aggregate text sentiment for episode 214: Negative
Aggregate text sentiment for episode 215: Negative
Aggregate text sentiment for episode 216: Negative
Aggregate text sentiment for episode 217: Negative
Aggregate text sentiment for episode 218: Negative
Aggregate text sentiment for episode 219: Negative
Aggregate text sentiment for episode 220: Negative
Aggregate text sentiment for episode 301: Negative
Aggregate text sentiment for episode 302: Negative
Aggregate text sentiment for episode 303: Negative
Aggregate text sentiment for episode 304: Negative
Aggregate text sentiment for episode 305: Negative
Aggregate text sentiment for episode 306: Positive
Aggregate text sentiment for episode 307: Negative
Aggregate text sentiment for episode 308: Negative
Aggregate text sentiment for episode 309: Negative
Aggregate text sentiment for episode 310: Negative
Aggregate text sentiment for episode 311: Negative
Aggregate text sentiment for episode 312: Negative
Aggregate text sentiment for episode 313: Negative
Aggregate text sentiment for episode 314: Negative
Aggregate text sentiment for episode 315: Negative
Aggregate text sentiment for episode 316: Negative
Aggregate text sentiment for episode 317: Negative
Aggregate text sentiment for episode 318: Negative
Aggregate text sentiment for episode 319: Negative
Aggregate text sentiment for episode 320: Negative
Aggregate text sentiment for episode 321: Negative

Now that we've stored dataframes for each episode individually, we can import them individually to review episodes one at a time. All we need to do is call the file associated with the episode in question for review, which has been saved by web_sentiment_estimator() as its episode number code (eg. '101.csv').

In [5]:

def lite_sentiment_estimator(url):
    #RECEIVING AND PARSING WEBPAGE TEXT
    import requests
    html = requests.get(url).text #gets and stores webpage's hmtl source code in string variable 'html'

    from bs4 import BeautifulSoup as bs
    soup = bs(html)
    souped_text = soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_text = souped_text.encode('utf-8') #converts unicode text to utf-8

    #EXCLUDE NON-RELEVANT PAGE SECTIONS
    top = 'Transcriber'
    bottom = '[End Credits]'
    text = encoded_text.split(top)[1].split(bottom)[0]

    #TOKENIZE TEXT
    import nltk
    tokens = nltk.word_tokenize(text) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'
    alpha = [word for word in tokens if word.isalpha() == True] #filters tokens to exclude non-alphabetic words

    #IMPORTING DATAFRAME FUNCTIONALITY
    import pandas as pd
    alpha_df = pd.DataFrame(alpha)
    alpha_df.columns=['Word']

    #IMPORTING STOPWORD CORPUS
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    df = alpha_df[[x not in stopwords for x in alpha_df['Word']]] # excludes stopwords

    #ESTABLISHING WORD COUNTS FOR EACH UNIQUE WORD
    df['Word Count'] = [1 for x in df['Word']]
    df = df.groupby(['Word']).count()
    df = df.reset_index()

    #DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
    df['Frequency-Weight'] = 1
    df.ix[df['Word Count'] == 1, 'Frequency-Weight'] = 2
    df.ix[df['Word Count'] > 10, 'Frequency-Weight'] = 0.5

    #CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
    df['Share of Words'] = df['Word Count']/df['Word Count'].sum()

    #ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
    df['Polarity'] = 0

    positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope', 
                     'thank', 'joy', 'relief', 'proud', 'Joy', 
                     'generous', 'sympath', 'love', 'amuse', 'Love',
                     'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
                     'joy', 'pleasure', 'affection', 'empath', 'friendl']
    negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear', 
                     'rage', 'sorrow', 'grief', 'frustrat', 'disappoint', 
                     'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',  
                     'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
                     'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
                     'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']


    #PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
    pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
    pos_words_text = requests.get(pos_bank).text
    pos_soup = bs(pos_words_text) #converts string variable to unicode
    souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
    positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'

    neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'    
    neg_words_text = requests.get(neg_bank).text
    neg_soup = bs(neg_words_text) #converts string variable to unicode
    souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
    encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
    negative_words = nltk.word_tokenize(encoded_neg_words)

    df.ix[df.Word.isin(positive_emotions), 'Polarity'] = 1
    df.ix[df.Word.isin(negative_emotions), 'Polarity'] = -1
    df.ix[df.Word.isin(positive_words), 'Polarity'] = 1
    df.ix[df.Word.isin(negative_words), 'Polarity'] = -1

    df['Adjusted Sentiment'] = df['Polarity'] * df['Frequency-Weight']
    df['Relative Weight'] = df['Word Count'] * df['Adjusted Sentiment']

    pos = sum(df[[x > 0 for x in df['Relative Weight']]].sort('Relative Weight', ascending=False)['Relative Weight'])
    neg = sum(df[[x < 0 for x in df['Relative Weight']]].sort('Relative Weight', ascending=False)['Relative Weight'])
    agg = abs(pos) + abs(neg)
    bal = pos + neg
    per = bal / agg
   
    print 'Aggregate text sentiment:', str(per)+'% ---', 'Episode',str(url[-3:])

In [6]:

for url in urls:
    lite_sentiment_estimator(url)

Aggregate text sentiment: -0.269736842105% --- Episode 101
Aggregate text sentiment: -0.223443223443% --- Episode 102
Aggregate text sentiment: -0.178893178893% --- Episode 103
Aggregate text sentiment: -0.13249651325% --- Episode 104
Aggregate text sentiment: -0.0864946889226% --- Episode 105
Aggregate text sentiment: -0.129814550642% --- Episode 106
Aggregate text sentiment: -0.101449275362% --- Episode 107
Aggregate text sentiment: -0.127035830619% --- Episode 108
Aggregate text sentiment: -0.148514851485% --- Episode 109
Aggregate text sentiment: -0.262905162065% --- Episode 110
Aggregate text sentiment: -0.347222222222% --- Episode 111
Aggregate text sentiment: -0.270704573548% --- Episode 112
Aggregate text sentiment: -0.384885764499% --- Episode 113
Aggregate text sentiment: -0.125313283208% --- Episode 114
Aggregate text sentiment: -0.173216885007% --- Episode 115
Aggregate text sentiment: -0.18041958042% --- Episode 116
Aggregate text sentiment: -0.137142857143% --- Episode 117
Aggregate text sentiment: -0.0945558739255% --- Episode 118
Aggregate text sentiment: -0.109570041609% --- Episode 119
Aggregate text sentiment: -0.321196358908% --- Episode 120
Aggregate text sentiment: -0.29451287794% --- Episode 201
Aggregate text sentiment: -0.205846528624% --- Episode 202
Aggregate text sentiment: -0.19733924612% --- Episode 203
Aggregate text sentiment: -0.374449339207% --- Episode 204
Aggregate text sentiment: -0.194614443084% --- Episode 205
Aggregate text sentiment: -0.115423901941% --- Episode 206
Aggregate text sentiment: -0.266149870801% --- Episode 207
Aggregate text sentiment: -0.166051660517% --- Episode 208
Aggregate text sentiment: -0.127937336815% --- Episode 209
Aggregate text sentiment: -0.162416107383% --- Episode 210
Aggregate text sentiment: -0.202702702703% --- Episode 211
Aggregate text sentiment: -0.150550795594% --- Episode 212
Aggregate text sentiment: -0.295454545455% --- Episode 213
Aggregate text sentiment: -0.08038585209% --- Episode 214
Aggregate text sentiment: -0.0402010050251% --- Episode 215
Aggregate text sentiment: -0.420989143546% --- Episode 216
Aggregate text sentiment: -0.384615384615% --- Episode 217
Aggregate text sentiment: -0.156732891832% --- Episode 218
Aggregate text sentiment: -0.0432946145723% --- Episode 219
Aggregate text sentiment: -0.171945701357% --- Episode 220
Aggregate text sentiment: -0.240506329114% --- Episode 301
Aggregate text sentiment: -0.155015197568% --- Episode 302
Aggregate text sentiment: -0.315573770492% --- Episode 303
Aggregate text sentiment: -0.123882503193% --- Episode 304
Aggregate text sentiment: -0.168704156479% --- Episode 305
Aggregate text sentiment: -0.010101010101% --- Episode 306
Aggregate text sentiment: -0.241379310345% --- Episode 307
Aggregate text sentiment: -0.25037037037% --- Episode 308
Aggregate text sentiment: -0.269121813031% --- Episode 309
Aggregate text sentiment: -0.387045813586% --- Episode 310
Aggregate text sentiment: -0.151515151515% --- Episode 311
Aggregate text sentiment: -0.288732394366% --- Episode 312
Aggregate text sentiment: -0.24% --- Episode 313
Aggregate text sentiment: -0.407894736842% --- Episode 314
Aggregate text sentiment: -0.383512544803% --- Episode 315
Aggregate text sentiment: -0.35453100159% --- Episode 316
Aggregate text sentiment: -0.230046948357% --- Episode 317
Aggregate text sentiment: -0.16290726817% --- Episode 318
Aggregate text sentiment: -0.0648967551622% --- Episode 319
Aggregate text sentiment: -0.395424836601% --- Episode 320
Aggregate text sentiment: -0.207715133531% --- Episode 321

Organizing dataframes¶

We can now view the episodes in order from most negative to least negative

Bringing in a csv file containing series-level sentiment balance data¶

In [7]:

ref = pd.read_csv('ATLA Sentiment Balance Results.csv')

Inserting episode titles using data from another csv file¶

In [8]:

reader = pd.read_csv('Avatar Numbers.csv')
ep_list = reader[['Series', 'Episode', 'Title', 'Viewership (Mil)']]

In [9]:

df = ref.join(ep_list['Title'], lsuffix='Episode')
df = df.join(ep_list['Viewership (Mil)'], lsuffix='Episode')

Converting Viewership numbers from string to integers¶

In [10]:

df['Viewership (Mil)'] = [float(x) for x in df['Viewership (Mil)']]

Converting Episode data to string data¶

In [11]:

df['Episode'] = [str(x) for x in df['Episode']]

Re-organizing columns¶

In [12]:

df = df[['Episode', 'Title', 'Total Non-Neutral Weight', 'Sentiment Balance', 'Balance Percentage', 'Viewership (Mil)']]
df.columns = ['Episode Number', 'Episode', 'Total Non-Neutral Weight', 'Sentiment Balance', 'Balance Percentage', 'Viewership (Mil)']

Re-sorting data¶

In [78]:

df.sort(columns='Balance Percentage', ascending=False).head(20) #Least negative episodes
#df.sort(columns='Balance Percentage').head() #Most negative episodes

Out[78]:

	Episode Number	Episode	Total Non-Neutral Weight	Sentiment Balance	Balance Percentage	Viewership (Mil)
45	306	The Avatar and the Firelord	396.0	-4.0	-0.010	3.20
34	215	Tales of Ba Sing Se	398.0	-16.0	-0.040	3.12
38	219	The Guru	473.5	-20.5	-0.043	4.40
58	319	Sozin's Comet, Part 2: The Old Masters	339.0	-22.0	-0.065	5.59
33	214	City of Walls and Secrets	311.0	-25.0	-0.080	3.27
4	105	The King of Omashu	329.5	-28.5	-0.086	3.54
17	118	The Waterbending Master	349.0	-33.0	-0.095	3.50
6	107	The Spirit World (Winter Solstice, Part 1)	310.5	-31.5	-0.101	3.29
18	119	The Siege of the North, Part 1	360.5	-39.5	-0.110	3.42
25	206	The Blind Bandit	489.5	-56.5	-0.115	3.33
43	304	Sokka's Master	391.5	-48.5	-0.124	3.22
13	114	The Fortuneteller	399.0	-50.0	-0.125	2.05
7	108	Avatar Roku (Winter Solstice, Part 2)	307.0	-39.0	-0.127	3.12
28	209	Bitter Work	383.0	-49.0	-0.128	3.26
5	106	Imprisoned	350.5	-45.5	-0.130	3.38
3	104	The Warriors of Kyoshi	358.5	-47.5	-0.132	3.47
16	117	The Northern Air Temple	350.0	-48.0	-0.137	1.68
8	109	The Waterbending Scroll	404.0	-60.0	-0.149	3.15
31	212	The Serpent's Pass	408.5	-61.5	-0.151	4.10
50	311	The Day of Black Sun, Part 2: The Eclipse	330.0	-50.0	-0.152	3.77

Diving into episode-level sentiment data¶

First, to figure out how to decide which episodes to look deeper into:

In [14]:

least_negative = ref.sort(columns='Sentiment Balance', ascending=False)['Episode'].head(10)

For further reference, a quick function will be created and used to return a quick-view list of any episodes specified

In [15]:

def get_title(episode):
    print ep_list[ep_list['Episode'].isin([int(episode)])]

In [16]:

for episode in least_negative:
    get_title(episode)

   Series  Episode                        Title  Viewership (Mil)
45   ATLA      306  The Avatar and the Firelord               3.2
   Series  Episode                Title  Viewership (Mil)
34   ATLA      215  Tales of Ba Sing Se              3.12
   Series  Episode     Title  Viewership (Mil)
38   ATLA      219  The Guru               4.4
   Series  Episode                                   Title  Viewership (Mil)
58   ATLA      319  Sozin's Comet, Part 2: The Old Masters              5.59
   Series  Episode                      Title  Viewership (Mil)
33   ATLA      214  City of Walls and Secrets              3.27
  Series  Episode               Title  Viewership (Mil)
4   ATLA      105  The King of Omashu              3.54
  Series  Episode                                       Title  \
6   ATLA      107  The Spirit World (Winter Solstice, Part 1)   

   Viewership (Mil)  
6              3.29  
   Series  Episode                    Title  Viewership (Mil)
17   ATLA      118  The Waterbending Master               3.5
  Series  Episode                                  Title  Viewership (Mil)
7   ATLA      108  Avatar Roku (Winter Solstice, Part 2)              3.12
   Series  Episode                           Title  Viewership (Mil)
18   ATLA      119  The Siege of the North, Part 1              3.42

Now that we have the means to select which episodes to dive into, let's start with some of the least negative episodes

In [17]:

get_title(319)

   Series  Episode                                   Title  Viewership (Mil)
58   ATLA      319  Sozin's Comet, Part 2: The Old Masters              5.59

Define episode, sort order for ep_viewer:¶

In [18]:

ep = 319
sort = False

In [19]:

ep_viewer = pd.read_csv(str(ep)+'.csv').drop('Unnamed: 0', axis=1).sort(columns='Relative Weight', ascending=sort)
    #viewer = viewer.drop('Unnamed: 0', axis=1) 
    #viewer = viewer.sort(columns='Relative Weight', ascending=False)

In [20]:

ep_viewer.head()

Out[20]:

	Word	Word Count	Frequency-Weight	Share of Words	Polarity	Adjusted Sentiment	Relative Weight	Episode
689	like	8	1.0	0.001926	1	1.0	8.0	319
988	smiles	7	1.0	0.001686	1	1.0	7.0	319
893	right	13	0.5	0.003130	1	0.5	6.5	319
989	smiling	5	1.0	0.001204	1	1.0	5.0	319
1108	top	5	1.0	0.001204	1	1.0	5.0	319

Review¶

The top 3-4 episodes in each list seem accurate, however as we attempt to estimate beyond the 5th most positive/negative episodes, sentiment begins to behave ambiguously. Perhaps it is fan-bias, but The Desert shouldn't be a relatively neutral episode.

As it stands, the estimator processes dialogue as well as stage directions included in each episode. Perhaps these directions (which include a large number of adjectives and adverbs' are skewing the estimator? Otherwise, the wordbanks need to be refined to better fit the content in the series.

In [21]:

df

Out[21]:

	Episode Number	Episode	Total Non-Neutral Weight	Sentiment Balance	Balance Percentage	Viewership (Mil)
0	101	The Boy in the Iceberg	304.0	-82.0	-0.270	3.47
1	102	The Avatar Returns	273.0	-61.0	-0.223	3.47
2	103	The Southern Air Temple	388.5	-69.5	-0.179	3.41
3	104	The Warriors of Kyoshi	358.5	-47.5	-0.132	3.47
4	105	The King of Omashu	329.5	-28.5	-0.086	3.54
5	106	Imprisoned	350.5	-45.5	-0.130	3.38
6	107	The Spirit World (Winter Solstice, Part 1)	310.5	-31.5	-0.101	3.29
7	108	Avatar Roku (Winter Solstice, Part 2)	307.0	-39.0	-0.127	3.12
8	109	The Waterbending Scroll	404.0	-60.0	-0.149	3.15
9	110	Jet	416.5	-109.5	-0.263	3.40
10	111	The Great Divide	432.0	-150.0	-0.347	3.10
11	112	The Storm	404.5	-109.5	-0.271	3.25
12	113	The Blue Spirit	284.5	-109.5	-0.385	3.19
13	114	The Fortuneteller	399.0	-50.0	-0.125	2.05
14	115	Bato of the Water Tribe	343.5	-59.5	-0.173	3.37
15	116	The Deserter	357.5	-64.5	-0.180	3.17
16	117	The Northern Air Temple	350.0	-48.0	-0.137	1.68
17	118	The Waterbending Master	349.0	-33.0	-0.095	3.50
18	119	The Siege of the North, Part 1	360.5	-39.5	-0.110	3.42
19	120	The Siege of the North, Part 2	384.5	-123.5	-0.321	3.42
20	201	The Avatar State	446.5	-131.5	-0.295	3.38
21	202	The Cave of Two Lovers	410.5	-84.5	-0.206	3.27
22	203	Return to Omashu	451.0	-89.0	-0.197	3.20
23	204	The Swamp	340.5	-127.5	-0.374	3.10
24	205	Avatar Day	408.5	-79.5	-0.195	3.11
25	206	The Blind Bandit	489.5	-56.5	-0.115	3.33
26	207	Zuko Alone	387.0	-103.0	-0.266	3.33
27	208	The Chase	406.5	-67.5	-0.166	3.33
28	209	Bitter Work	383.0	-49.0	-0.128	3.26
29	210	The Library	372.5	-60.5	-0.162	3.18
...	...	...	...	...	...	...
31	212	The Serpent's Pass	408.5	-61.5	-0.151	4.10
32	213	The Drill	440.0	-130.0	-0.295	4.10
33	214	City of Walls and Secrets	311.0	-25.0	-0.080	3.27
34	215	Tales of Ba Sing Se	398.0	-16.0	-0.040	3.12
35	216	Appa's Lost Days	414.5	-174.5	-0.421	3.54
36	217	Lake Laogai	416.0	-160.0	-0.385	3.27
37	218	The Earth King	453.0	-71.0	-0.157	3.76
38	219	The Guru	473.5	-20.5	-0.043	4.40
39	220	The Crossroads of Destiny	442.0	-76.0	-0.172	4.40
40	301	The Awakening	316.0	-76.0	-0.241	3.06
41	302	The Headband	329.0	-51.0	-0.155	3.06
42	303	The Painted Lady	244.0	-77.0	-0.316	3.22
43	304	Sokka's Master	391.5	-48.5	-0.124	3.22
44	305	The Beach	409.0	-69.0	-0.169	3.22
45	306	The Avatar and the Firelord	396.0	-4.0	-0.010	3.20
46	307	The Runaway	261.0	-63.0	-0.241	3.22
47	308	The Puppetmaster	337.5	-84.5	-0.250	3.52
48	309	Nightmares and Daydreams	353.0	-95.0	-0.269	3.52
49	310	The Day of Black Sun, Part 1: The Invasion	316.5	-122.5	-0.387	3.77
50	311	The Day of Black Sun, Part 2: The Eclipse	330.0	-50.0	-0.152	3.77
51	312	The Western Air Temple	426.0	-123.0	-0.289	3.55
52	313	The Firebending Masters	375.0	-90.0	-0.240	3.55
53	314	The Boiling Rock, Part 1	304.0	-124.0	-0.408	3.97
54	315	The Boiling Rock, Part 2	279.0	-107.0	-0.384	3.97
55	316	The Southern Raiders	314.5	-111.5	-0.355	4.23
56	317	The Ember Island Players	426.0	-98.0	-0.230	4.53
57	318	Sozin's Comet, Part 1: The Phoenix King	399.0	-65.0	-0.163	5.59
58	319	Sozin's Comet, Part 2: The Old Masters	339.0	-22.0	-0.065	5.59
59	320	Sozin's Comet, Part 3: Into the Inferno	306.0	-121.0	-0.395	5.59
60	321	Sozin's Comet, Part 4: Avatar Aang	337.0	-70.0	-0.208	5.59

61 rows × 6 columns

Plotting Viewership data and Balance Percentage as the Primary Sentiment Indicator¶

In [22]:

df[['Episode',
    'Balance Percentage',
    'Viewership (Mil)']].plot(
kind='bar',
figsize=(50,10),
x='Episode',
title='Avatar: The Last Airbender',
#marker='.',
#markersize=20
)

Out[22]:

<matplotlib.axes._subplots.AxesSubplot at 0x186ab3c8>

Aggregating all tokens for a series-wide analysis¶

In [23]:

import requests
from bs4 import BeautifulSoup as bs
import nltk
import re

In [24]:

raw_html = [requests.get(url).text for url in urls]

In [25]:

#EXCLUDE NON-RELEVANT PAGE SECTIONS
top = 'Transcriber'
bottom = '[End Credits]'
slim_html = [x.split(top)[1].split(bottom)[0] for x in raw_html]
#html = str()

In [28]:

#html = slim_html - formatted

In [40]:

soup = [bs(x) for x in slim_html]
souped_text = [x.get_text() for x in soup]
encoded_text = [x.encode('utf-8') for x in souped_text]

In [47]:

#TOKENIZE TEXT
import nltk
tokens = [nltk.word_tokenize(x) for x in encoded_text] #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'

In [48]:

sum([len(x) for x in tokens]) #total words

Out[48]:

In [49]:

#Stripping non-alphabetic items
consolidated_text = list()

for row in tokens:
    consolidated_text += [word for word in row]
    
alpha = [word for word in consolidated_text if word.isalpha() == True]

In [50]:

len(alpha)

Out[50]:

Calculating word counts¶

In [51]:

a_df = pd.DataFrame(alpha)
a_df.columns = ['Word']
a_df['Word Count'] = [1 for x in a_df['Word']]
a_df = a_df.groupby('Word').count()
a_df = a_df.reset_index()

Removing stopwords¶

In [52]:

#IMPORTING STOPWORD CORPUS
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
a_df = a_df[[x not in stopwords for x in a_df['Word']]]

Calculating frequency, percentages¶

In [53]:

#DETERMINING FREQUENCY-WEIGHTS FOR EACH WORD
a_df['Frequency-Weight'] = 1
a_df.ix[a_df['Word Count'] == 1, 'Frequency-Weight'] = 2
a_df.ix[a_df['Word Count'] > 10, 'Frequency-Weight'] = 0.5

#CALCULATING SHARE OF TEXT FOR EACH UNIQUE WORD
a_df['Share of Words'] = a_df['Word Count']/a_df['Word Count'].sum()

Assigning calculated sentiment¶

In [54]:

#ASSIGNING INITAL POLARITY WITH PRE-DEFINED POS/NEG WORDBANKS
a_df['Polarity'] = 0

positive_emotions = ['curious', 'attract', 'surprise', 'hope', 'Hope', 
                 'thank', 'joy', 'relief', 'proud', 'Joy', 
                 'generous', 'sympath', 'love', 'amuse', 'Love',
                 'delight', 'elat', 'excit', 'happy', 'happi', 'Happi',
                 'joy', 'pleasure', 'affection', 'empath', 'friendl']
negative_emotions = ['alarm', 'disgust', 'indifferen', 'fear', 'Fear', 
                 'rage', 'sorrow', 'grief', 'frustrat', 'disappoint', 
                 'embarrass', 'shame', 'guilt', 'remorse', 'greed', 'Greed',  
                 'miser', 'jealous', 'cruel', 'hate', 'anger', 'annoyed', 'Anger', 'Hate',
                 'disgust', 'irrit', 'anxious', 'anxiety', 'helpless',
                 'worry', 'doubt', 'shame', 'bored', 'despar', 'hurt']

In [55]:

#PROCESSING PRE-DEFINED POSITIVE AND NEGATIVE WORDBANKS
pos_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/positive_wordbank.txt'
pos_words_text = requests.get(pos_bank).text
pos_soup = bs(pos_words_text) #converts string variable to unicode
souped_pos_words = pos_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_pos_words = souped_pos_words.encode('utf-8') #converts unicode text to utf-8
positive_words = nltk.word_tokenize(encoded_pos_words) #tokenizes unicode text, separating each word and turning it into an item in list 'tokens'

neg_bank = 'https://raw.githubusercontent.com/c-trl/nlp-with-xanga-entries/master/negative_wordbank.txt'    
neg_words_text = requests.get(neg_bank).text
neg_soup = bs(neg_words_text) #converts string variable to unicode
souped_neg_words = neg_soup.get_text() #stores unicode text in variable 'souped_text'
encoded_neg_words = souped_neg_words.encode('utf-8') #converts unicode text to utf-8
negative_words = nltk.word_tokenize(encoded_neg_words)

In [56]:

a_df.ix[a_df.Word.isin(positive_emotions), 'Polarity'] = 1
a_df.ix[a_df.Word.isin(negative_emotions), 'Polarity'] = -1
a_df.ix[a_df.Word.isin(positive_words), 'Polarity'] = 1
a_df.ix[a_df.Word.isin(negative_words), 'Polarity'] = -1

In [57]:

a_df['Adjusted Sentiment'] = a_df['Polarity'] * a_df['Frequency-Weight']
a_df['Relative Weight'] = a_df['Word Count'] * a_df['Adjusted Sentiment']

In [75]:

a_df[a_df['Relative Weight'] != 0].sort(columns='Relative Weight', ascending=False)

Out[75]:

	Word	Word Count	Frequency-Weight	Share of Words	Polarity	Adjusted Sentiment	Relative Weight
10188	right	639	0.5	0.002847	1	0.5	319.5
7803	like	615	0.5	0.002740	1	0.5	307.5
12364	top	280	0.5	0.001247	1	0.5	140.0
11106	smiles	204	0.5	0.000909	1	0.5	102.0
6471	good	168	0.5	0.000748	1	0.5	84.0
6534	great	157	0.5	0.000699	1	0.5	78.5
11109	smiling	156	0.5	0.000695	1	0.5	78.0
9807	ready	127	0.5	0.000566	1	0.5	63.5
3961	clearly	102	0.5	0.000454	1	0.5	51.0
11105	smile	98	0.5	0.000437	1	0.5	49.0
13151	well	95	0.5	0.000423	1	0.5	47.5
3141	better	91	0.5	0.000405	1	0.5	45.5
5474	enough	80	0.5	0.000356	1	0.5	40.0
7933	love	79	0.5	0.000352	1	0.5	39.5
13336	work	73	0.5	0.000325	1	0.5	36.5
11883	surprise	69	0.5	0.000307	1	0.5	34.5
9472	pretty	66	0.5	0.000294	1	0.5	33.0
7691	leading	65	0.5	0.000290	1	0.5	32.5
8089	master	65	0.5	0.000290	1	0.5	32.5
6976	hope	63	0.5	0.000281	1	0.5	31.5
7003	hot	54	0.5	0.000241	1	0.5	27.0
8542	nice	54	0.5	0.000241	1	0.5	27.0
9413	powerful	51	0.5	0.000227	1	0.5	25.5
6731	happy	51	0.5	0.000227	1	0.5	25.5
6205	free	49	0.5	0.000218	1	0.5	24.5
3958	clear	47	0.5	0.000209	1	0.5	23.5
3046	beautiful	46	0.5	0.000205	1	0.5	23.0
6443	glow	46	0.5	0.000205	1	0.5	23.0
3133	best	44	0.5	0.000196	1	0.5	22.0
7320	instantly	41	0.5	0.000183	1	0.5	20.5
...	...	...	...	...	...	...	...
13351	worry	55	0.5	0.000245	-1	-0.5	-27.5
4427	crazy	55	0.5	0.000245	-1	-0.5	-27.5
11739	struggles	58	0.5	0.000258	-1	-0.5	-29.0
2896	backward	62	0.5	0.000276	-1	-0.5	-31.0
2899	bad	62	0.5	0.000276	-1	-0.5	-31.0
10777	shocked	63	0.5	0.000281	-1	-0.5	-31.5
5949	fist	64	0.5	0.000285	-1	-0.5	-32.0
13399	wrong	64	0.5	0.000285	-1	-0.5	-32.0
7923	lost	65	0.5	0.000290	-1	-0.5	-32.5
12041	tanks	70	0.5	0.000312	-1	-0.5	-35.0
10776	shock	70	0.5	0.000312	-1	-0.5	-35.0
6735	hard	72	0.5	0.000321	-1	-0.5	-36.0
8915	pan	73	0.5	0.000325	-1	-0.5	-36.5
2581	angrily	77	0.5	0.000343	-1	-0.5	-38.5
3734	cave	79	0.5	0.000352	-1	-0.5	-39.5
3406	breaks	83	0.5	0.000370	-1	-0.5	-41.5
5259	dust	85	0.5	0.000379	-1	-0.5	-42.5
8339	monster	85	0.5	0.000379	-1	-0.5	-42.5
11273	sorry	87	0.5	0.000388	-1	-0.5	-43.5
7974	lying	88	0.5	0.000392	-1	-0.5	-44.0
5749	falling	89	0.5	0.000397	-1	-0.5	-44.5
2802	attack	92	0.5	0.000410	-1	-0.5	-46.0
11115	smoke	93	0.5	0.000414	-1	-0.5	-46.5
4627	dark	93	0.5	0.000414	-1	-0.5	-46.5
2582	angry	98	0.5	0.000437	-1	-0.5	-49.0
4007	cloud	98	0.5	0.000437	-1	-0.5	-49.0
5747	fall	104	0.5	0.000463	-1	-0.5	-52.0
8462	na	182	0.5	0.000811	-1	-0.5	-91.0
5750	falls	220	0.5	0.000980	-1	-0.5	-110.0
11064	slowly	249	0.5	0.001109	-1	-0.5	-124.5

1893 rows × 7 columns

In [59]:

a_df['Relative Weight'].sum()

Out[59]:

-2181.0

In [60]:

pos_20 = a_df.sort(columns='Relative Weight', ascending=False).head(50)

In [61]:

neg_20 = a_df.sort(columns='Relative Weight', ascending=True).head(50)

In [72]:

top_10_negative_words = [
'fall',
'angry',
'cloud', #remember the episode with the fortune teller?
'dark',
'smoke',
'attack',
'lying',
'sorry',
'monster',
'tanks',
'lost',
'wrong',
'bad',
'crazy'
]

In [71]:

top_10_positive_words = [
'like',
'good',
'great',
'smile',
'love',
'surprise',
'pretty',
'master',
'hope',
'hot',
'powerful',
'beautiful',
'happy',
'free'
]

In [ ]: