# Import libraries that we are going to use in the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Load our dataset, skipping badly formatted lines
# The dataset has been cut into three parts to be uploaded on GitHub
data_1 = pd.read_csv('data/data_1.csv', error_bad_lines=False)
data_2 = pd.read_csv('data/data_2.csv', error_bad_lines=False)
data_3 = pd.read_csv('data/data_3.csv', error_bad_lines=False)
data = pd.concat([data_1, data_2, data_3], axis=0)

# Reindex the data frame; drop=True keeps reset_index from adding the old index as a column
data.reset_index(drop=True, inplace=True)
# Set max_colwidth to 140 in order to fully see the tweet
pd.set_option('max_colwidth', 140)
# Display the first 10 rows
data.head(10)

plt.close()
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data.Sentiment.values, edgecolor='gray')
# Set plot title
ax.set_title("Histogram of Sentiments")
# Set x-axis name
ax.set_xlabel("Sentiment")
# Set y-axis name
ax.set_ylabel("Frequency")
# Select the first patch (a rectangle, object of class matplotlib.patches.Patch)
# corresponding to negative sentiment and color it
patches[0].set_facecolor("#5d4037")
patches[0].set_label("negative")
# Same for the positive sentiment but in another color
patches[-1].set_facecolor("#ff9100")
patches[-1].set_label("positive")
# Add a legend to the plot
plt.legend()

data.Sentiment.value_counts()

# Show the number of duplicated tweets, if any exist
len(data[data.duplicated('SentimentText')])

# Display the number of retweets (RT)
len(data.SentimentText[data.SentimentText.str.extract('(RT)').notnull()])

# Load all of the resources
emoticons = pd.read_csv('data/smileys.csv')
positive_emoticons = emoticons[emoticons.Sentiment == 1]
negative_emoticons = emoticons[emoticons.Sentiment == 0]
emoticons.head(5)

acronyms = pd.read_csv('data/acronyms.csv')
acronyms.tail(5)

stops = pd.read_csv('data/stopwords.csv')
stops.columns = ['Word']
stops.head(5)

positive_words = pd.read_csv('data/positive-words.csv', sep='\t')
positive_words.columns = ['Word', 'Sentiment']
negative_words = pd.read_csv('data/negative-words.csv', sep='\t')
negative_words.columns = ['Word', 'Sentiment']
positive_words.head(5)
negative_words.head(5)

negation_words = pd.read_csv('data/negation.csv')
negation_words.head(5)

import re

# Build a regex that matches any of the given smileys when surrounded by whitespace
def make_emoticon_pattern(emoticons):
    pattern = "|".join(map(re.escape, emoticons.Smiley))
    pattern = "(?<=\s)(" + pattern + ")(?=\s)"
    return pattern

# Find all matches of a pattern in every tweet, or replace them by a tag
def find_with_pattern(pattern, replace=False, tag=None):
    if replace and tag is None:
        raise Exception("Parameter error", "If replace=True you should add the tag by which the pattern will be replaced")
    regex = re.compile(pattern)
    if replace:
        return data.SentimentText.apply(lambda tweet: regex.sub(tag, " " + tweet + " "))
    return data.SentimentText.apply(lambda tweet: regex.findall(" " + tweet + " "))

pos_emoticons_found = find_with_pattern(make_emoticon_pattern(positive_emoticons))
neg_emoticons_found = find_with_pattern(make_emoticon_pattern(negative_emoticons))

nb_pos_emoticons = len(pos_emoticons_found[pos_emoticons_found.map(lambda emoticons: len(emoticons) > 0)])
nb_neg_emoticons = len(neg_emoticons_found[neg_emoticons_found.map(lambda emoticons: len(emoticons) > 0)])
print "Number of positive emoticons: " + str(nb_pos_emoticons) + " Number of negative emoticons: " + str(nb_neg_emoticons)

# Replace emoticons by the ||pos|| and ||neg|| tags
data.SentimentText = find_with_pattern(make_emoticon_pattern(positive_emoticons), True, '||pos||')
data.SentimentText = find_with_pattern(make_emoticon_pattern(negative_emoticons), True, '||neg||')
data.head(10)
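# Quick illustration (not part of the original notebook): the pattern built by
# make_emoticon_pattern only matches smileys surrounded by whitespace, which is why
# find_with_pattern pads every tweet with a leading and trailing space before matching.
# Assuming ':)' is listed as a positive smiley in data/smileys.csv, this should print [':)']:
print re.findall(make_emoticon_pattern(positive_emoticons), " good morning :) ")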
pattern_url = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

url_found = find_with_pattern(pattern_url)
print "Number of URLs: " + str(len(url_found[url_found.map(lambda urls: len(urls) > 0)]))

data[50:60]
# Replace URLs by the ||url|| tag
data.SentimentText = find_with_pattern(pattern_url, True, '||url||')
data[50:60]
data[1578592:1578602]

# Strip unicode characters, keeping only the ascii part of a tweet
def remove_unicode(string):
    try:
        string = string.decode('unicode_escape').encode('ascii', 'ignore')
    except UnicodeDecodeError:
        pass
    return string

data.SentimentText = data.SentimentText.apply(lambda tweet: remove_unicode(tweet))
data[1578592:1578602]
data.SentimentText[599982]

import HTMLParser
html_parser = HTMLParser.HTMLParser()
# Unescape HTML entities (e.g. &amp;, &quot;) left in the tweets
data.SentimentText = data.SentimentText.apply(lambda tweet: html_parser.unescape(tweet))
data.SentimentText[599982]
data[1578592:1578602]

# Lowercase every tweet
data.SentimentText = data.SentimentText.str.lower()
data.head(10)

pattern_usernames = "@\w{1,}"
usernames_found = find_with_pattern(pattern_usernames)
len(data.SentimentText[usernames_found.apply(lambda usernames: len(usernames) > 0)])
data[45:55]
# Replace usernames by the ||target|| tag
data.SentimentText = find_with_pattern(pattern_usernames, True, '||target||')
data[45:55]

import string
from collections import Counter

# Create a dictionary of acronyms which will be used to get translations
acronym_dictionary = dict(zip(acronyms.Acronym, acronyms.Translation))

# Used to get rid of the punctuation in tweets (does not include | since we use it for
# our tags, nor ' so that don't, can't, etc. are preserved)
punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{}~'

# Frequency table for acronyms
acronyms_counter = Counter()

# Replace every acronym found in a tweet by its translation, update the acronym counter
# and return the tweet as a list of words
def acronym_to_translation(tweet, acronyms_counter):
    table = string.maketrans(punctuation, " " * len(punctuation))
    tweet = str(tweet).translate(table)
    words = tweet.split()
    new_words = []
    for word in words:
        if word in acronym_dictionary:
            acronyms_counter[word] += 1
            new_words.extend(acronym_dictionary[word].split())
        else:
            new_words.append(word)
    return new_words

data.SentimentText = data.SentimentText.apply(lambda tweet: acronym_to_translation(tweet, acronyms_counter))

# Get and display the top 20 acronyms
top20acronyms = acronyms_counter.most_common(20)
top20acronyms

# Just to better visualize the top 20 acronyms
for i, (acronym, value) in enumerate(top20acronyms):
    print str(i + 1) + ") " + acronym + " => " + acronym_dictionary[acronym] + " : " + str(value)

# With a bar plot
plt.close()
top20acronym_keys = [x[0] for x in top20acronyms]
top20acronym_values = [x[1] for x in top20acronyms]
indexes = np.arange(len(top20acronym_keys))
width = 0.7
plt.bar(indexes, top20acronym_values, width)
plt.xticks(indexes + width * 0.5, top20acronym_keys, rotation="vertical")
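# Illustrative sketch (not part of the original notebook): assuming acronyms.csv maps
# "gr8" to "great", the helper strips punctuation and expands the acronym in place:
#     acronym_to_translation("gr8, thanks!", Counter())
#     # => ['great', 'thanks']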
print data.SentimentText[29]

# Transform the dataframe into a dictionary
negation_dictionary = dict(zip(negation_words.Negation, negation_words.Tag))

# Find negations in a tweet and replace them by their tag
def replace_negation(tweet):
    return [negation_dictionary[word] if word in negation_dictionary else word for word in tweet]

# Apply the function on every tweet
data.SentimentText = data.SentimentText.apply(lambda tweet: replace_negation(tweet))
print data.SentimentText[29]

data[1578604:]

# Matches a character followed by its consecutive repetitions (e.g. the "ooooo" in "looooove")
pattern = re.compile(r'(.)\1*')

# Reduce any sequence of more than two identical characters to two characters
def reduce_sequence_word(word):
    return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)])

def reduce_sequence_tweet(tweet):
    return [reduce_sequence_word(word) for word in tweet]

data.SentimentText = data.SentimentText.apply(lambda tweet: reduce_sequence_tweet(tweet))
data[1578604:]

def make_training_test_sets(data):
    # Before making the training and test sets, shuffle the data set to avoid keeping any order
    data_shuffled = data.iloc[np.random.permutation(len(data))]
    data_shuffled = data_shuffled.reset_index(drop=True)
    # Join the words back into one space-separated string for each tweet
    data_shuffled.SentimentText = data_shuffled.SentimentText.apply(lambda tweet: " ".join(tweet))
    # Separate positive and negative tweets
    positive_tweets = data_shuffled[data_shuffled.Sentiment == 1]
    negative_tweets = data_shuffled[data_shuffled.Sentiment == 0]
    # Cutoff: 3/4 of each sentiment for training and 1/4 of each sentiment for testing
    positive_tweets_cutoff = int(len(positive_tweets) * (3. / 4.))
    negative_tweets_cutoff = int(len(negative_tweets) * (3. / 4.))
    # Make the training and test sets
    training_tweets = pd.concat([positive_tweets[:positive_tweets_cutoff], negative_tweets[:negative_tweets_cutoff]])
    test_tweets = pd.concat([positive_tweets[positive_tweets_cutoff:], negative_tweets[negative_tweets_cutoff:]])
    # Shuffle the training and test sets to break the ordering of tweets by sentiment
    training_tweets = training_tweets.iloc[np.random.permutation(len(training_tweets))].reset_index(drop=True)
    test_tweets = test_tweets.iloc[np.random.permutation(len(test_tweets))].reset_index(drop=True)
    return training_tweets, test_tweets

training_tweets, test_tweets = make_training_test_sets(data)

print "size of training set: " + str(len(training_tweets))
print "size of test set: " + str(len(test_tweets))

from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
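# Compatibility note (not part of the original notebook): sklearn.cross_validation has
# since been removed from scikit-learn. On newer releases the rough equivalent would be:
#     from sklearn.model_selection import KFold
#     k_fold = KFold(n_splits=10)
#     for training_indices, validation_indices in k_fold.split(training_tweets):
#         ...
# The classify function below keeps the older cross_validation API used in this notebook.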
def classify(training_tweets, test_tweets, ngram=(1, 1)):
    # F1 scores for each fold
    scores = []
    # Provides train/validation indices to split the data into train and validation sets
    k_fold = KFold(n=len(training_tweets), n_folds=10)
    # Used to convert a collection of text documents to a matrix of token counts => bag of words
    count_vectorizer = CountVectorizer(ngram_range=ngram)
    # Confusion matrix accumulated over the folds (TP/FP/FN/TN)
    confusion = np.array([[0, 0], [0, 0]])
    for training_indices, validation_indices in k_fold:
        training_features = count_vectorizer.fit_transform(training_tweets.iloc[training_indices]['SentimentText'].values)
        training_labels = training_tweets.iloc[training_indices]['Sentiment'].values
        validation_features = count_vectorizer.transform(training_tweets.iloc[validation_indices]['SentimentText'].values)
        validation_labels = training_tweets.iloc[validation_indices]['Sentiment'].values
        classifier = MultinomialNB()
        classifier.fit(training_features, training_labels)
        validation_predictions = classifier.predict(validation_features)
        confusion += confusion_matrix(validation_labels, validation_predictions)
        score = f1_score(validation_labels, validation_predictions)
        scores.append(score)
    return (sum(scores) / len(scores)), confusion

score, confusion = classify(training_tweets, test_tweets)

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# confusion_matrix orders the classes ascending: row/column 0 is sentiment 0 (negative)
labels = ['Negative', 'Positive']

def plot_confusion_matrix(cm, labels, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

print 'Confusion matrix without normalization'
plt.figure()
plot_confusion_matrix(confusion, labels)

confusion_normalized = confusion.astype(float) / confusion.sum(axis=1)[:, np.newaxis]
print 'Confusion matrix normalized'
plt.figure()
plot_confusion_matrix(confusion_normalized, labels, title='Confusion matrix normalized')

# We build a word frequency table to see which words are the most used
word_frequency_table = Counter()

def count_word(tweet):
    for word in tweet:
        word_frequency_table[word] += 1
    return tweet

data.SentimentText.map(lambda tweet: count_word(tweet))
word_frequency_table.most_common()[:20]

# List of tags
tags = ['||target||', '||url||', '||pos||', '||neg||', '||not||']
# List of tuples representing tags with their corresponding count
tag_counter = [(w, c) for w, c in word_frequency_table.iteritems() if w in tags]
print tag_counter

plt.close()
tag_counter_keys = [x[0] for x in tag_counter]
tag_counter_values = [x[1] for x in tag_counter]
indexes = np.arange(len(tag_counter_keys))
width = 0.7
plt.bar(indexes, tag_counter_values, width)
plt.xticks(indexes + width * 0.5, tag_counter_keys, rotation="vertical")

# Transform the dataframe into a dictionary
stopword_dictionary = dict.fromkeys(stops.Word, None)

# Remove stopwords from a tweet
def remove_stopwords(tweet):
    tweet = [stopword_dictionary[word] if word in stopword_dictionary else word for word in tweet]
    return [word for word in tweet if word]

data.SentimentText = data.SentimentText.apply(lambda tweet: remove_stopwords(tweet))
print data.SentimentText.head(20)

# Most common words after deleting stop words
word_frequency_table = Counter()
data.SentimentText.map(lambda tweet: count_word(tweet))
print word_frequency_table.most_common()[:20]

training_tweets_nosw, test_tweets_nosw = make_training_test_sets(data)

score, confusion = classify(training_tweets_nosw, test_tweets_nosw)

print 'Total tweets classified: ' + str(len(training_tweets_nosw))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)
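# Compatibility note (not part of the original notebook): PorterStemmer.stem_word comes
# from an older NLTK release; on recent NLTK versions the equivalent call is simply
#     pstemmer.stem(word)
# The stemming step below keeps the original stem_word call.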
import nltk

pstemmer = nltk.PorterStemmer()

# Stem every word of a tweet, leaving our special tags untouched
def stemming_words(tweet):
    return [pstemmer.stem_word(word) if word not in tags else word for word in tweet]

data.SentimentText = data.SentimentText.apply(lambda tweet: stemming_words(tweet))
print data.SentimentText.head(20)

training_tweets_stems, test_tweets_stems = make_training_test_sets(data)

score, confusion = classify(training_tweets_stems, test_tweets_stems)

print 'Total tweets classified: ' + str(len(training_tweets_stems))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# Same classifier with bigrams only
score, confusion = classify(training_tweets, test_tweets, (2, 2))

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)

# Same classifier with unigrams and bigrams
score, confusion = classify(training_tweets, test_tweets, (1, 2))

print 'Total tweets classified: ' + str(len(training_tweets))
print 'Score: ' + str(score)
print 'Confusion matrix:'
print(confusion)
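# The classify helper above only cross-validates inside the training set and never uses
# the held-out test_tweets it receives. A minimal sketch (not part of the original
# notebook) of fitting on the full training set and scoring the held-out test set,
# reusing the same CountVectorizer / MultinomialNB setup with unigrams and bigrams:
final_vectorizer = CountVectorizer(ngram_range=(1, 2))
final_features = final_vectorizer.fit_transform(training_tweets.SentimentText.values)
final_classifier = MultinomialNB()
final_classifier.fit(final_features, training_tweets.Sentiment.values)
test_features = final_vectorizer.transform(test_tweets.SentimentText.values)
test_predictions = final_classifier.predict(test_features)
print 'Test set F1 score: ' + str(f1_score(test_tweets.Sentiment.values, test_predictions))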