# Import libraries that we are going to use in the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Load our dataset, skipping badly formatted lines
# The dataset has been cut into three parts to be uploaded on GitHub
data_1 = pd.read_csv('data/data_1.csv', error_bad_lines=False)
data_2 = pd.read_csv('data/data_2.csv', error_bad_lines=False)
data_3 = pd.read_csv('data/data_3.csv', error_bad_lines=False)
data = pd.concat([data_1, data_2, data_3], axis=0)

# Reindex the data frame; drop=True keeps reset_index from adding the old index as a column
data.reset_index(drop=True, inplace=True)
# Set max_colwidth to 140 in order to fully see the tweet
pd.set_option('max_colwidth', 140)
# Display the first 10 rows
data.head(10)

plt.close()
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data.Sentiment.values, edgecolor='gray')
# Set plot title
ax.set_title("Histogram of Sentiments")
# Set x-axis name
ax.set_xlabel("Sentiment")
# Set y-axis name
ax.set_ylabel("Frequency")
# Select the first patch (a rectangle, object of class matplotlib.patches.Patch)
# corresponding to negative sentiment and color it
patches[0].set_facecolor("#5d4037")
patches[0].set_label("negative")
# Same for the positive sentiment but in another color
patches[-1].set_facecolor("#ff9100")
patches[-1].set_label("positive")
# Add a legend to the plot
plt.legend()

data.Sentiment.value_counts()

# Show the number of duplicated tweets, if any exist
len(data[data.duplicated('SentimentText')])

# Display the number of retweets (RT)
len(data.SentimentText[data.SentimentText.str.extract('(RT)').notnull()])

# Load all of the resources
emoticons = pd.read_csv('data/smileys.csv')
positive_emoticons = emoticons[emoticons.Sentiment == 1]
negative_emoticons = emoticons[emoticons.Sentiment == 0]
emoticons.head(5)

acronyms = pd.read_csv('data/acronyms.csv')
acronyms.tail(5)

stops = pd.read_csv('data/stopwords.csv')
stops.columns = ['Word']
stops.head(5)

positive_words = pd.read_csv('data/positive-words.csv', sep='\t')
positive_words.columns = ['Word', 'Sentiment']
negative_words = pd.read_csv('data/negative-words.csv', sep='\t')
negative_words.columns = ['Word', 'Sentiment']
positive_words.head(5)
negative_words.head(5)

negation_words = pd.read_csv('data/negation.csv')
negation_words.head(5)

import re

# Build a regex that matches any of the given smileys when surrounded by whitespace
def make_emoticon_pattern(emoticons):
    pattern = "|".join(map(re.escape, emoticons.Smiley))
    pattern = "(?<=\s)(" + pattern + ")(?=\s)"
    return pattern

# Find all matches of a pattern in every tweet, or replace them by a tag
def find_with_pattern(pattern, replace=False, tag=None):
    if replace and tag is None:
        raise Exception("Parameter error", "If replace=True you should add the tag by which the pattern will be replaced")
    regex = re.compile(pattern)
    if replace:
        return data.SentimentText.apply(lambda tweet: regex.sub(tag, " " + tweet + " "))
    return data.SentimentText.apply(lambda tweet: regex.findall(" " + tweet + " "))

pos_emoticons_found = find_with_pattern(make_emoticon_pattern(positive_emoticons))
neg_emoticons_found = find_with_pattern(make_emoticon_pattern(negative_emoticons))

nb_pos_emoticons = len(pos_emoticons_found[pos_emoticons_found.map(lambda emoticons: len(emoticons) > 0)])
nb_neg_emoticons = len(neg_emoticons_found[neg_emoticons_found.map(lambda emoticons: len(emoticons) > 0)])
print "Number of positive emoticons: " + str(nb_pos_emoticons) + " Number of negative emoticons: " + str(nb_neg_emoticons)

# Replace emoticons by the ||pos|| and ||neg|| tags
data.SentimentText = find_with_pattern(make_emoticon_pattern(positive_emoticons), True, '||pos||')
data.SentimentText = find_with_pattern(make_emoticon_pattern(negative_emoticons), True, '||neg||')
data.head(10)
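# Quick illustration (not part of the original notebook): the pattern built by
# make_emoticon_pattern only matches smileys surrounded by whitespace, which is why
# find_with_pattern pads every tweet with a leading and trailing space before matching.
# Assuming ':)' is listed as a positive smiley in data/smileys.csv, this should print [':)']:
print re.findall(make_emoticon_pattern(positive_emoticons), " good morning :) ")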
pattern_url = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

url_found = find_with_pattern(pattern_url)
print "Number of URLs: " + str(len(url_found[url_found.map(lambda urls: len(urls) > 0)]))

data[50:60]
# Replace URLs by the ||url|| tag
data.SentimentText = find_with_pattern(pattern_url, True, '||url||')
data[50:60]
data[1578592:1578602]

# Strip unicode characters, keeping only the ascii part of a tweet
def remove_unicode(string):
    try:
        string = string.decode('unicode_escape').encode('ascii', 'ignore')
    except UnicodeDecodeError:
        pass
    return string

data.SentimentText = data.SentimentText.apply(lambda tweet: remove_unicode(tweet))
data[1578592:1578602]
data.SentimentText[599982]

import HTMLParser
html_parser = HTMLParser.HTMLParser()
# Unescape HTML entities (e.g. &amp;, &quot;) left in the tweets
data.SentimentText = data.SentimentText.apply(lambda tweet: html_parser.unescape(tweet))
data.SentimentText[599982]
data[1578592:1578602]

# Lowercase every tweet
data.SentimentText = data.SentimentText.str.lower()
data.head(10)

pattern_usernames = "@\w{1,}"
usernames_found = find_with_pattern(pattern_usernames)
len(data.SentimentText[usernames_found.apply(lambda usernames: len(usernames) > 0)])
data[45:55]
# Replace usernames by the ||target|| tag
data.SentimentText = find_with_pattern(pattern_usernames, True, '||target||')
data[45:55]

import string
from collections import Counter

# Create a dictionary of acronyms which will be used to get translations
acronym_dictionary = dict(zip(acronyms.Acronym, acronyms.Translation))

# Used to get rid of the punctuation in tweets (does not include | since we use it for
# our tags, nor ' so that don't, can't, etc. are preserved)
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{}~'

# Frequency table for acronyms
acronyms_counter = Counter()

# Replace every acronym found in a tweet by its translation, update the acronym counter
# and return the tweet as a list of words
def acronym_to_translation(tweet, acronyms_counter):
    table = string.maketrans(punctuation, " " * len(punctuation))
    tweet = str(tweet).translate(table)
    words = tweet.split()
    new_words = []
    for word in words:
        if word in acronym_dictionary:
            acronyms_counter[word] += 1
            new_words.extend(acronym_dictionary[word].split())
        else:
            new_words.append(word)
    return new_words

data.SentimentText = data.SentimentText.apply(lambda tweet: acronym_to_translation(tweet, acronyms_counter))

# Get and display the top 20 acronyms
top20acronyms = acronyms_counter.most_common(20)
top20acronyms

# Just to better visualize the top 20 acronyms
for i, (acronym, value) in enumerate(top20acronyms):
    print str(i + 1) + ") " + acronym + " => " + acronym_dictionary[acronym] + " : " + str(value)

# With a bar plot
plt.close()
top20acronym_keys = [x[0] for x in top20acronyms]
top20acronym_values = [x[1] for x in top20acronyms]
indexes = np.arange(len(top20acronym_keys))
width = 0.7
plt.bar(indexes, top20acronym_values, width)
plt.xticks(indexes + width * 0.5, top20acronym_keys, rotation="vertical")
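# Illustrative sketch (not part of the original notebook): assuming acronyms.csv maps
# "gr8" to "great", the helper strips punctuation and expands the acronym in place:
#     acronym_to_translation("gr8, thanks!", Counter())
#     # => ['great', 'thanks']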
print data.SentimentText[29]

# Transform the dataframe into a dictionary
negation_dictionary = dict(zip(negation_words.Negation, negation_words.Tag))

# Find negations in a tweet and replace them by their tag
def replace_negation(tweet):
    return [negation_dictionary[word] if word in negation_dictionary else word for word in tweet]

# Apply the function on every tweet
data.SentimentText = data.SentimentText.apply(lambda tweet: replace_negation(tweet))
print data.SentimentText[29]

data[1578604:]

# Matches a character followed by its consecutive repetitions (e.g. the "ooooo" in "looooove")
pattern = re.compile(r'(.)\1*')

# Reduce any sequence of more than two identical characters to two characters
def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

data.SentimentText = data.SentimentText.apply(lambda tweet: reduce_sequence_tweet(tweet))
data[1578604:]

def make_training_test_sets(data):
    # Before making the training and test sets, shuffle the data set to avoid keeping any order
    data_shuffled = data.iloc[np.random.permutation(len(data))]
    data_shuffled = data_shuffled.reset_index(drop=True)
    # Join the words back into one space-separated string for each tweet
    data_shuffled.SentimentText = data_shuffled.SentimentText.apply(lambda tweet: " ".join(tweet))
    # Separate positive and negative tweets
    positive_tweets = data_shuffled[data_shuffled.Sentiment == 1]
    negative_tweets = data_shuffled[data_shuffled.Sentiment == 0]
    # Cutoff: 3/4 of each sentiment for training and 1/4 of each sentiment for testing
    positive_tweets_cutoff = int(len(positive_tweets) * (3. / 4.))
    negative_tweets_cutoff = int(len(negative_tweets) * (3. / 4.))
    # Make the training and test sets
    training_tweets = pd.concat([positive_tweets[:positive_tweets_cutoff], negative_tweets[:negative_tweets_cutoff]])
    test_tweets = pd.concat([positive_tweets[positive_tweets_cutoff:], negative_tweets[negative_tweets_cutoff:]])
    # Shuffle the training and test sets to break the ordering of tweets by sentiment
    training_tweets = training_tweets.iloc[np.random.permutation(len(training_tweets))].reset_index(drop=True)
    test_tweets = test_tweets.iloc[np.random.permutation(len(test_tweets))].reset_index(drop=True)
    return training_tweets, test_tweets

training_tweets, test_tweets = make_training_test_sets(data)

print "size of training set: " + str(len(training_tweets))
print "size of test set: " + str(len(test_tweets))

from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
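# Compatibility note (not part of the original notebook): sklearn.cross_validation has
# since been removed from scikit-learn. On newer releases the rough equivalent would be:
#     from sklearn.model_selection import KFold
#     k_fold = KFold(n_splits=10)
#     for training_indices, validation_indices in k_fold.split(training_tweets):
#         ...
# The classify function below keeps the older cross_validation API used in this notebook.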
def classify(training_tweets, test_tweets, ngram=(1, 1)):
    # F1 scores for each fold
    scores = []
    # Provides train/validation indices to split the data into train and validation sets
    k_fold = KFold(n=len(training_tweets), n_folds=10)
    # Used to convert a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)
    # Confusion matrix accumulated over the folds (TP/FP/FN/TN)
    confusion = np.array([[0, 0], [0, 0]])
    for training_indices, validation_indices in k_fold:
        training_features = count_vectorizer.fit_transform(training_tweets.iloc[training_indices]['SentimentText'].values)
        training_labels = training_tweets.iloc[training_indices]['Sentiment'].values
        validation_features = count_vectorizer.transform(training_tweets.iloc[validation_indices]['SentimentText'].values)
        validation_labels = training_tweets.iloc[validation_indices]['Sentiment'].values
        classifier = MultinomialNB()
        classifier.fit(training_features, training_labels)
        validation_predictions = classifier.predict(validation_features)
        confusion += confusion_matrix(validation_labels, validation_predictions)
        score = f1_score(validation_labels, validation_predictions)
        scores.append(score)
    return (sum(scores) / len(scores)), confusion

score, confusion = classify(training_tweets, test_tweets)

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# confusion_matrix orders the classes ascending: row/column 0 is sentiment 0 (negative)
labels = ['Negative', 'Positive']

def plot_confusion_matrix(cm, labels, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

print 'Confusion matrix without normalization'
plt.figure()
plot_confusion_matrix(confusion, labels)

confusion_normalized = confusion.astype(float) / confusion.sum(axis=1)[:, np.newaxis]
print 'Confusion matrix normalized'
plt.figure()
plot_confusion_matrix(confusion_normalized, labels, title='Confusion matrix normalized')

# We build a word frequency table to see which words are the most used
word_frequency_table = Counter()

def count_word(tweet):
    for word in tweet:
        word_frequency_table[word] += 1
    return tweet

data.SentimentText.map(lambda tweet: count_word(tweet))
word_frequency_table.most_common()[:20]

# List of tags
tags = ['||target||', '||url||', '||pos||', '||neg||', '||not||']
# List of tuples representing tags with their corresponding count
tag_counter = [(w, c) for w, c in word_frequency_table.iteritems() if w in tags]
print tag_counter

plt.close()
tag_counter_keys = [x[0] for x in tag_counter]
tag_counter_values = [x[1] for x in tag_counter]
indexes = np.arange(len(tag_counter_keys))
width = 0.7
plt.bar(indexes, tag_counter_values, width)
plt.xticks(indexes + width * 0.5, tag_counter_keys, rotation="vertical")

# Transform the dataframe into a dictionary
stopword_dictionary = dict.fromkeys(stops.Word, None)

# Remove stopwords from a tweet
def remove_stopwords(tweet):
    tweet = [stopword_dictionary[word] if word in stopword_dictionary else word for word in tweet]
    return [word for word in tweet if word]

data.SentimentText = data.SentimentText.apply(lambda tweet: remove_stopwords(tweet))
print data.SentimentText.head(20)

# Most common words after deleting stop words
word_frequency_table = Counter()
data.SentimentText.map(lambda tweet: count_word(tweet))
print word_frequency_table.most_common()[:20]

training_tweets_nosw, test_tweets_nosw = make_training_test_sets(data)

score, confusion = classify(training_tweets_nosw, test_tweets_nosw)

print 'Total tweets classified: ' + str(len(training_tweets_nosw))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)
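# Compatibility note (not part of the original notebook): PorterStemmer.stem_word comes
# from an older NLTK release; on recent NLTK versions the equivalent call is simply
#     pstemmer.stem(word)
# The stemming step below keeps the original stem_word call.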
import nltk

pstemmer = nltk.PorterStemmer()

# Stem every word of a tweet, leaving our special tags untouched
def stemming_words(tweet):
    return [pstemmer.stem_word(word) if word not in tags else word for word in tweet]

data.SentimentText = data.SentimentText.apply(lambda tweet: stemming_words(tweet))
print data.SentimentText.head(20)

training_tweets_stems, test_tweets_stems = make_training_test_sets(data)

score, confusion = classify(training_tweets_stems, test_tweets_stems)

print 'Total tweets classified: ' + str(len(training_tweets_stems))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# Same classifier with bigrams only
score, confusion = classify(training_tweets, test_tweets, (2, 2))

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# Same classifier with unigrams and bigrams
score, confusion = classify(training_tweets, test_tweets, (1, 2))

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)
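# The classify helper above only cross-validates inside the training set and never uses
# the held-out test_tweets it receives. A minimal sketch (not part of the original
# notebook) of fitting on the full training set and scoring the held-out test set,
# reusing the same CountVectorizer / MultinomialNB setup with unigrams and bigrams:
final_vectorizer = CountVectorizer(ngram_range=(1, 2))
final_features = final_vectorizer.fit_transform(training_tweets.SentimentText.values)
final_classifier = MultinomialNB()
final_classifier.fit(final_features, training_tweets.Sentiment.values)
test_features = final_vectorizer.transform(test_tweets.SentimentText.values)
test_predictions = final_classifier.predict(test_features)
print 'Test set F1 score: ' + str(f1_score(test_tweets.Sentiment.values, test_predictions))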