import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

# read yelp.csv into a DataFrame
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/yelp.csv'
yelp = pd.read_csv(url)

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(yelp_best_worst.text, yelp_best_worst.stars, random_state=1)

# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# rows are documents, columns are terms (aka "tokens" or "features")
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# show vectorizer options
vect

# don't convert to lowercase
vect = CountVectorizer(lowercase=False)
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# allow tokens of one character
vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
train_dtm = vect.fit_transform(X_train)
train_dtm.shape

# last 50 features
print vect.get_feature_names()[-50:]

# use default options for CountVectorizer
vect = CountVectorizer()

# create document-term matrices
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)

# calculate accuracy
print metrics.accuracy_score(y_test, y_pred_class)

# calculate null accuracy (the accuracy of always predicting the majority class)
y_test_binary = np.where(y_test==5, 1, 0)
y_test_binary.mean()

# define a function that accepts a vectorizer and prints the number of features and the accuracy
def tokenize_test(vect):
    train_dtm = vect.fit_transform(X_train)
    print 'Features: ', train_dtm.shape[1]
    test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(train_dtm, y_train)
    y_pred_class = nb.predict(test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

# show vectorizer options
vect

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# set of stop words
print vect.get_stop_words()

# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

# all 100 features
print vect.get_feature_names()

# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
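# Accuracy alone can hide which class the model tends to get wrong. As a
# sketch (not part of the original walkthrough), tokenize_test_confusion is a
# hypothetical variant of tokenize_test that also prints the confusion
# matrix, reusing the existing X_train/X_test/y_train/y_test split and
# sklearn's metrics module.
def tokenize_test_confusion(vect):
    train_dtm = vect.fit_transform(X_train)
    test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(train_dtm, y_train)
    y_pred_class = nb.predict(test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)
    # rows are actual classes (1 and 5 stars), columns are predicted classes
    print metrics.confusion_matrix(y_test, y_pred_class)

# example: inspect the errors made with the default vectorizer options
tokenize_test_confusion(CountVectorizer())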
# print the first review
print yelp_best_worst.text[0]

# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])

# list the words
review.words

# list the sentences
review.sentences

# some string methods are available
review.lower()

# initialize stemmer
stemmer = SnowballStemmer('english')

# stem each word
print [stemmer.stem(word) for word in review.words]

# assume every word is a noun
print [word.lemmatize() for word in review.words]

# assume every word is a verb
print [word.lemmatize(pos='v') for word in review.words]

# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

# use split_into_lemmas as the feature extraction function
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

# last 50 features
print vect.get_feature_names()[-50:]
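# For comparison (a sketch, not from the original walkthrough): the same
# analyzer pattern works with the Snowball stemmer initialized earlier.
# split_into_stems is a hypothetical helper that mirrors split_into_lemmas,
# replacing lemmatization with stemming.
def split_into_stems(text):
    text = unicode(text, 'utf-8').lower()
    words = TextBlob(text).words
    return [stemmer.stem(word) for word in words]

# use split_into_stems as the feature extraction function
vect = CountVectorizer(analyzer=split_into_stems)
tokenize_test(vect)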
# example documents
train_simple = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

# CountVectorizer
vect = CountVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())

# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(train_simple).toarray(), columns=vect.get_feature_names())

# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape

def summarize():
    # choose a random review that is at least 300 characters
    review_length = 0
    while review_length < 300:
        review_id = np.random.randint(0, len(yelp))
        review_text = unicode(yelp.text[review_id], 'utf-8')
        review_length = len(review_text)
    # create a dictionary of words and their TF-IDF scores
    word_scores = {}
    for word in TextBlob(review_text).words:
        word = word.lower()
        if word in features:
            word_scores[word] = dtm[review_id, features.index(word)]
    # print words with the top 5 TF-IDF scores
    print 'TOP SCORING WORDS:'
    top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
    for word, score in top_scores:
        print word
    # print 5 random words
    print '\n' + 'RANDOM WORDS:'
    random_words = np.random.choice(word_scores.keys(), size=5, replace=False)
    for word in random_words:
        print word
    # print the review
    print '\n' + review_text

summarize()

# print the first review (the TextBlob saved earlier) again
print review

# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity

# understanding the apply method: create a column with the length of each review
yelp['length'] = yelp.text.apply(len)

# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    return TextBlob(text.decode('utf-8')).sentiment.polarity

# create a new DataFrame column for sentiment
yelp['sentiment'] = yelp.text.apply(detect_sentiment)

# boxplot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')

# reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()

# reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()

# widen the column display
pd.set_option('max_colwidth', 500)

# negative sentiment in a 5-star review
yelp[(yelp.stars == 5) & (yelp.sentiment < -0.3)].head()

# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head()

# reset the column display width
pd.reset_option('max_colwidth')

# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# split the new DataFrame into training and testing sets
# (train_test_split converts the DataFrame to a NumPy array, hence the positional indexing below)
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use CountVectorizer with the text column only
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train[:, 0])
test_dtm = vect.transform(X_test[:, 0])
print train_dtm.shape
print test_dtm.shape

# shape of the other four feature columns
X_train[:, 1:].shape

# cast the other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train[:, 1:].astype(float))
extra.shape

# combine sparse matrices
train_dtm_extra = sp.sparse.hstack((train_dtm, extra))
train_dtm_extra.shape

# repeat for the testing set
extra = sp.sparse.csr_matrix(X_test[:, 1:].astype(float))
test_dtm_extra = sp.sparse.hstack((test_dtm, extra))
test_dtm_extra.shape

# use logistic regression with the text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm, y_train)
y_pred_class = logreg.predict(test_dtm)
print metrics.accuracy_score(y_test, y_pred_class)

# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm_extra, y_train)
y_pred_class = logreg.predict(test_dtm_extra)
print metrics.accuracy_score(y_test, y_pred_class)

# spelling correction
TextBlob('15 minuets late').correct()

# spellcheck
Word('parot').spellcheck()

# definitions
Word('bank').define('v')

# language identification
TextBlob('Hola amigos').detect_language()
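# As a closing sketch (an assumption, not from the original walkthrough):
# TextBlob's correct() could serve as a spelling-normalization step before
# vectorization. correct_text is a hypothetical helper; note that correct()
# is slow, so this is impractical for a large corpus.
def correct_text(text):
    return unicode(TextBlob(text.decode('utf-8')).correct())

# example: correct a single review before tokenizing it
print correct_text(yelp_best_worst.text[0])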