import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

# read yelp.csv into a DataFrame
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/yelp.csv'
yelp = pd.read_csv(url)

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(yelp_best_worst.text, yelp_best_worst.stars, random_state=1)

# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# rows are documents, columns are terms (aka "tokens" or "features")
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# show vectorizer options
vect

# don't convert to lowercase
vect = CountVectorizer(lowercase=False)
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# allow tokens of one character
vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# use default options for CountVectorizer
vect = CountVectorizer()

# create document-term matrices
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)

# calculate accuracy
print metrics.accuracy_score(y_test, y_pred_class)

# calculate null accuracy (the accuracy of always predicting the majority class)
y_test_binary = np.where(y_test==5, 1, 0)
y_test_binary.mean()

# define a function that accepts a vectorizer and prints the number of features and the accuracy
def tokenize_test(vect):
    train_dtm = vect.fit_transform(X_train)
    print 'Features: ', train_dtm.shape[1]
    test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(train_dtm, y_train)
    y_pred_class = nb.predict(test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

# show vectorizer options
vect

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# set of stop words
print vect.get_stop_words()

# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

# all 100 features
print vect.get_feature_names()

# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
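# Accuracy alone can hide which class the model tends to get wrong. As a
# sketch (not part of the original walkthrough), tokenize_test_confusion is a
# hypothetical variant of tokenize_test that also prints the confusion
# matrix, reusing the existing X_train/X_test/y_train/y_test split and
# sklearn's metrics module.
def tokenize_test_confusion(vect):
    train_dtm = vect.fit_transform(X_train)
    test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(train_dtm, y_train)
    y_pred_class = nb.predict(test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)
    # rows are actual classes (1 and 5 stars), columns are predicted classes
    print metrics.confusion_matrix(y_test, y_pred_class)

# example: inspect the errors made with the default vectorizer options
tokenize_test_confusion(CountVectorizer())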
# print the first review
print yelp_best_worst.text[0]

# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])

# list the words
review.words

# list the sentences
review.sentences

# some string methods are available
review.lower()

# initialize stemmer
stemmer = SnowballStemmer('english')

# stem each word
print [stemmer.stem(word) for word in review.words]

# assume every word is a noun
print [word.lemmatize() for word in review.words]

# assume every word is a verb
print [word.lemmatize(pos='v') for word in review.words]

# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

# use split_into_lemmas as the feature extraction function
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

# last 50 features
print vect.get_feature_names()[-50:]
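# For comparison (a sketch, not from the original walkthrough): the same
# analyzer pattern works with the Snowball stemmer initialized earlier.
# split_into_stems is a hypothetical helper that mirrors split_into_lemmas,
# replacing lemmatization with stemming.
def split_into_stems(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [stemmer.stem(word) for word in words]

# use split_into_stems as the feature extraction function
vect = CountVectorizer(analyzer=split_into_stems)
tokenize_test(vect)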
# example documents
train_simple = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

# CountVectorizer
vect = CountVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())

# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())

# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape

def summarize():
    # choose a random review that is at least 300 characters
    review_length = 0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = unicode(yelp.text[review_id], 'utf-8')
        review_length = len(review_text)
    # create a dictionary of words and their TF-IDF scores
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]
    # print words with the top 5 TF-IDF scores
    print 'TOP SCORING WORDS:'
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print word
    # print 5 random words
    print '\n' + 'RANDOM WORDS:'
    random_words = np.random.choice(word_scores.keys(), size=5, replace=False)
    for word in random_words:
        print word
    # print the review
    print '\n' + review_text

summarize()

# print the first review (the TextBlob saved earlier) again
print review

# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity

# understanding the apply method: create a column with the length of each review
yelp['length'] = yelp.text.apply(len)

# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    return TextBlob(text.decode('utf-8')).sentiment.polarity

# create a new DataFrame column for sentiment
yelp['sentiment'] = yelp.text.apply(detect_sentiment)

# boxplot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')

# reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()

# reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()

# widen the column display
pd.set_option('max_colwidth', 500)

# negative sentiment in a 5-star review
yelp[(yelp.stars == 5) & (yelp.sentiment < -0.3)].head()

# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head()

# reset the column display width
pd.reset_option('max_colwidth')

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
# (train_test_split converts the DataFrame to a NumPy array, hence the positional indexing below)
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use CountVectorizer with the text column only
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train[:, 0])
test_dtm = vect.transform(X_test[:, 0])
print train_dtm.shape
print test_dtm.shape

# shape of the other four feature columns
X_train[:, 1:].shape

# cast the other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train[:, 1:].astype(float))
extra.shape

# combine sparse matrices
train_dtm_extra = sp.sparse.hstack((train_dtm, extra))
train_dtm_extra.shape

# repeat for the testing set
extra = sp.sparse.csr_matrix(X_test[:, 1:].astype(float))
test_dtm_extra = sp.sparse.hstack((test_dtm, extra))
test_dtm_extra.shape

# use logistic regression with the text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm, y_train)
y_pred_class = logreg.predict(test_dtm)
print metrics.accuracy_score(y_test, y_pred_class)

# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm_extra, y_train)
y_pred_class = logreg.predict(test_dtm_extra)
print metrics.accuracy_score(y_test, y_pred_class)

# spelling correction
TextBlob('15 minuets late').correct()

# spellcheck
Word('parot').spellcheck()

# definitions
Word('bank').define('v')

# language identification
TextBlob('Hola amigos').detect_language()
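# As a closing sketch (an assumption, not from the original walkthrough):
# TextBlob's correct() could serve as a spelling-normalization step before
# vectorization. correct_text is a hypothetical helper; note that correct()
# is slow, so this is impractical for a large corpus.
def correct_text(text):
    return unicode(TextBlob(text.decode('utf-8')).correct())

# example: correct a single review before tokenizing it
print correct_text(yelp_best_worst.text[0])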