import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
import numpy as np
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
#Class defined for processing raw HTML Text
class KaggleWord2VecUtility(object):
    """Utility class for processing raw HTML text into segments for learning.

    Provides two stateless helpers:
      * review_to_wordlist  -- one document -> flat list of word tokens
      * review_to_sentences -- one document -> list of sentences (lists of words)
    """

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert a document to a sequence of words.

        Parameters
        ----------
        review : str
            Raw (possibly HTML) review text.
        remove_stopwords : bool, optional
            If True, drop English stop words (default False).

        Returns
        -------
        list of str
            Lower-cased word tokens, letters only.
        """
        # 1. Remove HTML. Name the parser explicitly: bare BeautifulSoup(x)
        #    picks whichever parser is installed, which is non-deterministic
        #    across machines and emits a warning on modern bs4.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters (digits, punctuation, markup remnants).
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Convert words to lower case and split them.
        words = review_text.lower().split()
        # 4. Optionally remove stop words (False by default).
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        # 5. Return a list of words.
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into parsed sentences.

        Parameters
        ----------
        review : str or bytes
            Raw review text; bytes are decoded as UTF-8.
        tokenizer : object
            An NLTK sentence tokenizer (e.g. punkt) with a .tokenize() method.
        remove_stopwords : bool, optional
            Forwarded to review_to_wordlist (default False).

        Returns
        -------
        list of list of str
            One inner list of word tokens per non-empty sentence.
        """
        # Accept both bytes and str: the original unconditionally called
        # review.decode('utf8'), which raises AttributeError on Python 3
        # str input. Decode only when we actually receive bytes.
        if isinstance(review, bytes):
            review = review.decode("utf-8")
        # 1. Use the NLTK tokenizer to split the paragraph into sentences.
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. Loop over each sentence, skipping empty ones.
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(
                        raw_sentence, remove_stopwords))
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists).
        return sentences
def process(train_path="/Users/taposh/workspace/kaggle/bow/labeledTrainData.tsv",
            test_path="/Users/taposh/workspace/kaggle/sam/test.tsv",
            output_path="/Users/taposh/workspace/kaggle/sam/submission.csv"):
    """Train a TF-IDF + logistic-regression sentiment model and write predictions.

    Reads the labeled training TSV and the unlabeled test TSV, cleans the
    "Phrase" column with KaggleWord2VecUtility, vectorizes train+test jointly
    with TF-IDF (word 1-2 grams), reports 2-fold cross-validated accuracy,
    retrains on all training data, and writes a PhraseId/Sentiment CSV.

    Parameters
    ----------
    train_path, test_path : str, optional
        Paths to the tab-separated input files (defaults preserve the
        original hard-coded locations).
    output_path : str, optional
        Where the submission CSV is written.

    Returns
    -------
    pandas.DataFrame
        The submission frame that was written to output_path.

    Notes
    -----
    The original version of this function had Jupyter cell output (including
    a ValueError traceback) pasted into the body, overwrote the real sparse
    X_test with a 4-row np.arange dummy before predicting, fit the model on
    the FULL training set inside each CV fold and then predicted on the fold
    *index arrays* (the pasted traceback shows the resulting shape error),
    and plotted a DecisionTreeRegressor demo unrelated to the pipeline. All
    of that is removed here.
    """
    train = pd.read_csv(train_path, header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(test_path, header=0, delimiter="\t", quoting=3)
    y = train["Sentiment"]

    print("Cleaning and parsing movie reviews...\n")
    traindata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(p, False))
                 for p in train["Phrase"]]
    testdata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(p, False))
                for p in test["Phrase"]]

    print('vectorizing... ')
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 2),
                          use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words='english')
    # Fit the vocabulary on train+test together so both matrices share
    # one feature space, then split back apart by row count.
    X_all = traindata + testdata
    lentrain = len(traindata)
    print("fitting pipeline... ")
    X_all = tfv.fit_transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    model = LogisticRegression(penalty='l2', dual=True, tol=0.0000001,
                               C=1, fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)

    # 2-fold cross-validation: fit on the train split, score the held-out
    # split (the original fit on all rows and predicted on index arrays).
    kf = cross_validation.KFold(X.shape[0], n_folds=2)
    for train_index, test_index in kf:
        model.fit(X[train_index], y.iloc[train_index])
        fold_pred = model.predict(X[test_index])
        fold_acc = np.mean(fold_pred == y.iloc[test_index].values)
        print("fold accuracy:", fold_acc)

    print("Retrain on all training data, predicting test labels...\n")
    model.fit(X, y)
    result = model.predict(X_test)
    print(max(result))

    output = pd.DataFrame(data={"PhraseId": test["PhraseId"],
                                "Sentiment": result})
    # quoting=3 is csv.QUOTE_NONE; both columns are numeric so no escaping
    # is needed (the original passed escapechar="," — the delimiter itself —
    # which would corrupt any field that actually required escaping).
    output.to_csv(output_path, index=False, quoting=3, encoding='utf-8')
    print("Wrote results to", output_path)
    return output