%matplotlib inline
import json
import numpy as np
import copy
import pandas as pd
import networkx as nx
import requests
import scipy
from pattern import web
import matplotlib.pyplot as plt
import matplotlib.pylab as plt2
from scipy.stats import pearsonr
from datetime import datetime
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
from myalchemy import MyAlchemy
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

big_table = pd.read_csv('Data/full.csv', encoding='utf-8')
big_table = big_table[big_table['author'] != "deleted"]

print "Number of posts: ", len(big_table)
print "Number of distinct authors: ", len(big_table.groupby('author'))

def get_author_stats():
    author_table = big_table.groupby('author')
    author_count = author_table['author'].count()
    author_count.sort()
    return author_count

author_count = get_author_stats()
author_count[-10:]

plt.hist(author_count, bins=20, log=True)
plt.title("Distribution of number of submissions")
remove_border()

types = list(big_table['type'].unique())
def get_sub_stats(subreddit):
    '''
    returns:
    - the number of active users with more than one post
    - the number of distinct authors
    - the ratio of active/distinct users for a subreddit
    '''
    author_table = subreddit.groupby('author')
    dist_authors = len(subreddit.groupby('author'))
    #print "Number of distinct authors: ", dist_authors
    successful_authors = subreddit[author_table.author.transform(lambda x: x.count() > 1).astype('bool')]
    authorset = set()
    for a in successful_authors.index:
        authorset.add(successful_authors.ix[a]['author'])
    active_users = len(authorset)
    #print "number of authors with more than 1 submission in the top 1000: ", active_users
    if dist_authors > 0:
        succ_ratio = float(active_users) / dist_authors
    else:
        succ_ratio = 0
    return active_users, dist_authors, succ_ratio

#get the values for all types of data
authorstats = {}
for ctype in types:
    curr_df = big_table[big_table['type'] == ctype]
    authorstats[ctype] = get_sub_stats(curr_df)
    del curr_df  #reduce memory

def plot_author_success(successlist):
    '''
    plots a scatterplot for a dict of subreddit stats calculated before
    X-Axis: Number of active users
    Y-Axis: Success ratio
    '''
    xvals = [value[0] for key, value in successlist.iteritems()]
    yvals = [value[2] for key, value in successlist.iteritems()]
    labellist = [key for key, value in successlist.iteritems()]
    fig, ax = plt.subplots()
    ax.scatter(xvals, yvals)
    for i, txt in enumerate(labellist):
        ax.annotate(txt, (xvals[i], yvals[i]))
    plt.title("Active Users with their success rate")
    plt.xlabel("No. active users")
    plt.ylabel("fraction of users with multiple posts")
    remove_border()

plot_author_success(authorstats)

subreddits = list(big_table['subreddit'].unique())
sr_stats = {}
for ctype in subreddits:
    curr_df = big_table[big_table['subreddit'] == ctype]
    sr_stats[ctype] = get_sub_stats(curr_df)
    del curr_df  #reduce memory

plot_author_success(sr_stats)
del sr_stats  #reduce memory

#regression line
m_fit, b_fit = plt2.polyfit(big_table.comments, big_table.score, 1)
plt2.plot(big_table.comments, big_table.score, 'yo',
          big_table.comments, m_fit * big_table.comments + b_fit,
          color='purple', alpha=0.3)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-10, max(big_table.comments) * 1.05)
plt.ylim(-10, max(big_table.score) * 1.05)
remove_border()

big_table_filtered = big_table[big_table['comments'] < 50]  #only look at posts with <50 comments
big_table_filtered = big_table_filtered[big_table_filtered['score'] < 100]  #and with a score below 100
plt.scatter(big_table_filtered.comments, big_table_filtered.score, alpha=0.2)
plt.title("Comments versus Score")
plt.xlabel("Comments")
plt.ylabel("Score")
plt.xlim(-1, max(big_table_filtered.comments) * 1.05)
plt.ylim(-1, max(big_table_filtered.score) * 1.05)
remove_border()
del big_table_filtered

def split_selftext_DataFrame(df):
    ''' returns a list with 0 if a post has no selftext and 1 if it has one '''
    is_string_list = []
    for idx, record in df['selftext'].iteritems():
        if type(record) == float:  #missing selftext is read in as NaN, i.e. a float
            is_string_list.append(0)
        else:
            is_string_list.append(1)
    return is_string_list

big_table['islink'] = split_selftext_DataFrame(big_table)
big_table_link = big_table[big_table['islink'] == 0]
big_table_self = big_table[big_table['islink'] == 1]
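# Sketch only, assuming the same big_table loaded above: missing selftext comes back
# as NaN (a float), which pandas can test for directly with pd.isnull, so the
# type(record) == float check is not strictly needed.
def split_selftext_with_isnull(df):
    # 0 for link posts (selftext missing), 1 for self posts (selftext present)
    return [0 if pd.isnull(record) else 1 for record in df['selftext']]

# e.g. big_table['islink'] = split_selftext_with_isnull(big_table)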
def plot_link_vs_self(table_link, table_self):
    ''' plots a scatterplot of scores and comments for two different datasets '''
    p1 = plt.scatter(table_link.comments, table_link.score, color='red', alpha=0.2)
    p2 = plt.scatter(table_self.comments, table_self.score, color='blue', alpha=0.2)
    plt.legend([p1, p2], ["no self text", "self texts"])
    plt.title("Comments versus Score")
    plt.xlabel("Comments")
    plt.ylabel("Score")
    plt.ylim(-10, 5000)
    plt.xlim(-10, 30000)
    remove_border()

plot_link_vs_self(big_table_link, big_table_self)
del big_table_link
del big_table_self

logkrm = np.log(big_table['karma'])
loglinkkrm = np.log(big_table['link_karma'])
logscore = np.log(big_table['score'])

plt.scatter(logkrm, logscore, c='g')
plt.title("Karma versus Score - Both on a Logarithmic Scale")
plt.xlabel("Karma (Log)")
plt.ylabel("Score (Log)")
plt.xlim(-0.5, 16)
plt.ylim(-0.5, 10)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['karma'], big_table['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

plt.scatter(loglinkkrm, logscore, c='g')
plt.title("Link Karma versus Score - Both on a Logarithmic Scale")
plt.xlabel("Link Karma (Log)")
plt.ylabel("Score (Log)")
plt.xlim(-0.5, 16)
plt.ylim(-0.5, 10)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['link_karma'], big_table['score'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

del logkrm, loglinkkrm, logscore

r_row, p_value = pearsonr(big_table['karma'], big_table['link_karma'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)

big_table['length'] = big_table['comments']  #done simply to initialize the "length" column
for i in big_table.index:
    big_table['length'][i] = len(str(big_table['title'][i]))

plt.scatter(big_table['length'], big_table['score'], c='g')
plt.title("Post Title Length versus Post Score")
plt.xlabel("Title Length")
plt.ylabel("Score")
plt.xlim(0, 300)
plt.ylim(0, 9000)
remove_border()
plt.show()

r_row, p_value = pearsonr(big_table['length'], big_table['score'])
print "Pearson r coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
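# Sketch only: an equivalent, vectorised way to build the length column
# (assuming every entry of big_table['title'] can be passed through str());
# it produces the same values while avoiding chained assignment inside a Python loop.
big_table['length'] = big_table['title'].map(lambda t: len(str(t)))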
dates = list(big_table['time_created'])

#Function to return the time between each post and the most recent post
def convertdate(dates, which):
    dts = []
    for date in dates:
        dts.append(datetime.utcfromtimestamp(date))
    until = max(dts)
    days = []
    hrs = []
    for date in dts:
        days.append((until - date).days)
        hrs.append((until - date).total_seconds() / 3600.0)
    #print "Last post in the data set has a date/time of", until.strftime('%Y-%m-%d %H:%M:%S')
    if which == 'days':
        return days
    elif which == 'hours':
        return hrs
    else:
        print 'Enter days or hours'

big_table['daysfrom'] = convertdate(dates, 'days')
big_table['hoursfrom'] = convertdate(dates, 'hours')

# Color each scatter plot point according to subreddit type
df = big_table

#Set the colors of each category for a nicer looking graph
colors = ['c', 'g', 'y', 'b', 'r', 'm', 'k', 'w']

talldf = df[df['type'] == types[0]]
talldf['color'] = colors[0]
tallcol = list(talldf['color'])

newdf = df[df['type'] == types[1]]
newdf['color'] = colors[1]
newcol = list(newdf['color'])

hotdf = df[df['type'] == types[2]]
hotdf['color'] = colors[2]
hotcol = list(hotdf['color'])

tweekdf = df[df['type'] == types[3]]
tweekdf['color'] = colors[3]
tweekcol = list(tweekdf['color'])

tdaydf = df[df['type'] == types[4]]
tdaydf['color'] = colors[4]
tdaycol = list(tdaydf['color'])

#Plot time vs. score
tall = plt.scatter(talldf['daysfrom'], talldf['score'], c=tallcol)
new = plt.scatter(newdf['daysfrom'], newdf['score'], c=newcol)
hot = plt.scatter(hotdf['daysfrom'], hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['daysfrom'], tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['daysfrom'], tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 2100)
plt.ylim(0, 9000)
plt.legend((tall, new, hot, tweek, tday),
           ('Top all', 'New', 'Hot', 'Top Weekly', 'Top Day'),
           loc='upper right')
remove_border()
plt.show()

r_row, p_value = pearsonr(talldf['length'], talldf['score'])
print "Pearson r coefficient for top all is " + str(r_row) + " with a p-value of " + str(p_value)

'''
The following code plots four scatterplots from different combinations of this data
and various axis limits to see if any patterns can be observed
'''

#exclude data older than 100 days from the plots
tall = plt.scatter(talldf['hoursfrom'] / 24.0, talldf['score'], c=tallcol)
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
hot = plt.scatter(hotdf['hoursfrom'] / 24.0, hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 5500)
plt.legend((tall, new, hot, tweek, tday),
           ('Top all', 'New', 'Hot', 'Top Weekly', 'Top Day'),
           loc='upper right')
remove_border()
plt.show()

#leave out all and hot
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 4200)
plt.legend((new, tweek, tday), ('New', 'Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()

#plot it with hot
new = plt.scatter(newdf['hoursfrom'] / 24.0, newdf['score'], c=newcol)
hot = plt.scatter(hotdf['hoursfrom'] / 24.0, hotdf['score'], c=hotcol)
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 100)
plt.ylim(0, 4200)
plt.legend((new, hot, tweek, tday), ('New', 'Hot', 'Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()

#look only at day and week
tweek = plt.scatter(tweekdf['hoursfrom'] / 24.0, tweekdf['score'], c=tweekcol)
tday = plt.scatter(tdaydf['hoursfrom'] / 24.0, tdaydf['score'], c=tdaycol)
plt.title("Post Date (in Days posted before November 11, 2013) versus Post Score")
plt.xlabel("Number of Days posted before last post date")
plt.ylabel("Score (Upvotes-Downvotes)")
plt.xlim(0, 8)
plt.ylim(0, 4200)
plt.legend((tweek, tday), ('Top Weekly', 'Top Day'), loc='upper right')
remove_border()
plt.show()
del talldf, tallcol, newdf, newcol, hotdf, hotcol, tweekdf, tweekcol, tdaydf, tdaycol

df = pd.read_csv('Data/full.csv', encoding='utf-8')  # Top all is our training data set
print len(df)

df['up/down'] = df['upvotes'].astype(float) / df['downvotes'].astype(float)  # Reddit fuzzes this so...
topcomments = float(max(df['comments']))
topsscore = float(max(df['score']))
leastcontro = max(df['up/down'])

# The following metric is something we invented for testing purposes
df['mymetric'] = (((df['comments'].astype(float) / topcomments) * 0.10) +
                  ((df['score'].astype(float) / topsscore) * 0.85) +
                  ((df['up/down'] / leastcontro) * 0.05)) ** (0.30)
df['nrmscore'] = (df['score'].astype(float) / topsscore) ** (0.30)

bigdf = df
df = df[df['subreddit'] == 'AskReddit']
df2 = df[df['type'] == 'top_week']
print len(df2)
df = df[df['type'] == 'top_all']
print len(df)

#It's important in cross validation that the sets are disjoint, so we are removing duplicates
dfids = list(df['id'])
df2ids = list(df2['id'])
dupids = []
for redditid in dfids:
    if redditid in df2ids:
        dupids.append(redditid)

#This part is slightly overengineered, but the motivation is that we didn't want to simply
#strip the duplicate posts out of one data set at will. Instead, we split the duplicates in
#half and assign each half to one of the data sets to avoid some sort of possible bias.
if len(dupids) % 2 != 0:
    a = len(dupids) / 2
    a = a + 1
    dup1 = dupids[0:a]
    dup2 = dupids[a:]
else:
    a = len(dupids) / 2
    dup1 = dupids[0:a]
    dup2 = dupids[a:]

if np.random.randint(2) == 0:
    df = df[df['id'].apply(lambda x: x in dup1) == False]
    df2 = df2[df2['id'].apply(lambda x: x in dup2) == False]
else:
    df = df[df['id'].apply(lambda x: x in dup2) == False]
    df2 = df2[df2['id'].apply(lambda x: x in dup1) == False]

print len(df)
print len(df2)

vectorizer = CountVectorizer(min_df=0.001)
title = list(df['title']) + list(df2['title'])
vectorizer.fit(title)

def category(x, df, num=2):
    '''Assign score x to one of num equal-sized score bins of df (1 = lowest scores).'''
    size = len(df)
    blocksize = size / num
    for i in range(num):
        blockmax = max(sorted(df['score'])[blocksize * i:blocksize * (i + 1)])
        if x < blockmax:
            return i + 1
    return num

x_train = vectorizer.transform(df['title'])
x_test = vectorizer.transform(df2['title'])
score = [category(i, df2) for i in df['score']]
score2 = [category(i, df2) for i in df2['score']]
y_train = np.array(score)
y_test = np.array(score2)

vectorizer2 = CountVectorizer(min_df=0.001)
title2 = df2['title']
vectorizer2.fit(title2)
X2 = vectorizer2.transform(title2)
Y2 = np.array(df2['score'])

clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

dftitles = df['title']
df2titles = df2['title']
vectorizer = CountVectorizer(min_df=0.001)
title = list(dftitles) + list(df2titles)
vectorizer.fit(title)

def category(x, df, num=2):
    size = len(df)
    blocksize = size / num
    for i in range(num):
        blockmax = max(sorted(df['score'])[blocksize * i:blocksize * (i + 1)])
        if x < blockmax:
            return i + 1
    return num

#scores = [category(i) for i in df2['score']]
#print scores
#X = vectorizer.transform(title)
#Y = np.array(scores)

x_train = vectorizer.transform(dftitles)
x_test = vectorizer.transform(df2titles)
score = [1 if i > np.mean(df['score']) else 0 for i in df['score']]
score2 = [1 if i > np.mean(df2['score']) else 0 for i in df2['score']]
y_train = np.array(score)
y_test = np.array(score2)

clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)
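# Sketch only: category() above bins scores into equal-sized blocks by sorting, and the
# cell above uses a mean threshold instead. Quantile binning can also be expressed
# directly with pandas (an assumed alternative, not part of the original pipeline);
# exact handling of tied scores may differ from category().
def quantile_labels(scores, num=2):
    # rank with method='first' so every bin gets an (almost) equal number of posts
    # even when many posts share the same score; labels run 1..num, lowest scores first
    ranks = pd.Series(scores).rank(method='first')
    return pd.qcut(ranks, num, labels=False) + 1

# e.g. y_train = np.array(quantile_labels(df['score']))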
apikey = "e945cef59338f9e8e7bc962badde170e623fb7e5"  #Please insert your own key here
p = MyAlchemy(apikey)

dftitles = list(df['title'])
df2titles = list(df2['title'])
print dftitles[5]
print p.run_method(dftitles[5], 'concepts')
print p.run_method(dftitles[5], 'keywords')
print p.run_method(dftitles[5], 'category')
#print p.run_method(dftitles[5], 'sentiment')
print p.run_method(dftitles[5], 'entities')
print len(df)

#Concepts, keywords, category, sentiment, entities - all things Alchemy can provide
categories = []
concepts, concepts2 = [], []
for i in range(30):
    conceptlist = p.run_method(dftitles[i], 'concepts')
    for c in conceptlist:
        concepts.append(c[1])
for i in range(30):
    conceptlist = p.run_method(df2titles[i], 'concepts')
    for c in conceptlist:
        concepts2.append(c[1])
print concepts
print "--------"
print concepts2

vectorizer = CountVectorizer(min_df=0.001)
vectorizer.fit(concepts)
X = vectorizer.transform(concepts)
Y = np.array(df['score'][0:55])
title2 = df2['title']
print len(Y)

x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)  #I added the train size parameter.
clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

del big_table  #don't need this any longer

df = pd.read_csv('Data/full.csv', encoding='utf-8')  #using a fresh dataset for this
print "Original size of data set is", len(df)
df = df.drop_duplicates('id')
print "Size of data set with only unique posts is", len(df)

dfmean = np.mean(df['score'])
df = df.sort('score')
df = df.reset_index(level=0, drop=True)
median = len(df) / 2
md = df['score'][median]

def make_xy(titles, scores, vectorizer=None):
    #Set default vectorizer
    if not vectorizer:
        vectorizer = CountVectorizer(min_df=0.001)
    #Build the vocabulary by fitting the vectorizer to the list of titles
    vectorizer.fit(titles)
    #Convert into a bag-of-words and use a sparse array to save memory
    x = vectorizer.transform(titles)
    x = x.tocsc()
    #save into a numpy array, and return everything
    y = np.array(scores)
    return x, y, vectorizer

X, Y, vectorizer = make_xy(list(df['title']), df['score'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
clf = MultinomialNB(alpha=1)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)

sorteddf = df.sort('score')
sorteddf['category'] = df['score']
size = len(df)
num = 5
blocksize = size / num
blocks = [blocksize * i for i in range(num)]
blocks.append(size)
for i in range(num):
    sorteddf['category'][blocks[i]:blocks[i + 1]] = i + 1

Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])
x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)
clf3 = MultinomialNB(alpha=1)
clf3.fit(x_train3, y_train3)
train_acc = clf3.score(x_train3, y_train3)
test_acc = clf3.score(x_test3, y_test3)
print "Training accuracy is", train_acc
print "Test accuracy is", test_acc
sorteddf = df.sort('score')
sorteddf['category'] = df['score']
size = len(df)

best_test = 0
best_vect = None
best_Ysort = None
best_clf = None

for num in range(2, 11):
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sorteddf['category'][blocks[i]:blocks[i + 1]] = i + 1

    Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])
    x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)

    clf3 = MultinomialNB(alpha=1)
    clf3.fit(x_train3, y_train3)
    train_acc = clf3.score(x_train3, y_train3)
    test_acc = clf3.score(x_test3, y_test3)

    if best_test < test_acc:
        best_test = test_acc
        best_vect = copy.deepcopy(vectorizer2)
        best_Ysort = copy.deepcopy(Ysort)
        best_clf = copy.deepcopy(clf3)

    print "For", num, "bins:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

n_grams = CountVectorizer(ngram_range=[1, 5], analyzer='word')
n_grams.fit(list(sorteddf['title']))
Xngram = n_grams.transform(list(sorteddf['title']))
x_train4, x_test4, y_train4, y_test4 = train_test_split(Xngram, best_Ysort, train_size=0.5)
clf4 = MultinomialNB(alpha=1)
clf4.fit(x_train4, y_train4)
print "Training accuracy is", clf4.score(x_train4, y_train4)
print "Test accuracy is", clf4.score(x_test4, y_test4)

tfidf = TfidfVectorizer(ngram_range=[1, 5], sublinear_tf=True)
tfidf.fit(list(sorteddf['title']))
Xtfidf = tfidf.transform(list(sorteddf['title']))
x_train5, x_test5, y_train5, y_test5 = train_test_split(Xtfidf, best_Ysort, train_size=0.5)
clf5 = MultinomialNB(alpha=1)
clf5.fit(x_train5, y_train5)
print "Training accuracy is", clf5.score(x_train5, y_train5)
print "Test accuracy is", clf5.score(x_test5, y_test5)

#Calculate, for every word, the probability that it signals a popular or an unpopular post
#and create a new, sorted DataFrame for them
mywords = best_vect.get_feature_names()
print len(mywords)
diag = np.eye(len(mywords))
unpop, pop = zip(*best_clf.predict_proba(diag))
data = pd.DataFrame({'words': mywords, 'p_pop': pop, 'p_unpop': unpop})
sort = data.sort('p_pop', ascending=False).copy()

print 'Top 10 "Best" Words:'
print
for i in sort[:10].index:
    print "The word", sort.words[i], "has probability", sort.p_pop[i], "of being popular"
print
print 'Top 10 "Worst" Words:'
print
for i in sort[:-11:-1].index:
    print "The word", sort.words[i], "has probability", sort.p_unpop[i], "of being unpopular"

#let's get started with a new and clean data set once again
df = pd.read_csv('Data/full.csv', encoding='utf-8')
df = df.drop_duplicates('id')
df = df.sort('score')
df = df.reset_index(level=0, drop=True)
df = df.drop_duplicates()

subreddit_ngrams = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = MultinomialNB(alpha=50)
    clf.fit(x_train, y_train)
    subreddit_ngrams[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

# after http://stackoverflow.com/questions/12787650/finding-the-index-of-n-biggest-elements-in-python-array-list-efficiently
# we need this to efficiently get the largest cosine scores
def f(a):
    return np.argsort(a)[0][::-1]
def make_xy(titles, scores, vectorizer=None):
    #this one uses a tf-idf vectorizer as opposed to the earlier version
    #Set default vectorizer
    if not vectorizer:
        vectorizer = TfidfVectorizer()
    #Build the vocabulary and convert into a bag-of-words, kept as a sparse array to save memory
    x = vectorizer.fit_transform(titles)
    #x = x.tocsc()
    #save into a numpy array, and return everything
    y = np.array(scores)
    return x, y, vectorizer

X, Y, vectorizer = make_xy(list(df['title']), df['score'])

#this calculates close to 2.5 billion scores and sorts each of the 50,000 similarity lists
#of 50,000 entries -> it might take a while
def make_closest():
    '''
    calculates the cosine similarity between each pair of titles
    and returns a dictionary containing them
    '''
    closest_title_scores = {}
    i = 1
    for a in X:
        vec = cosine_similarity(a, X)
        #sort the results
        sorted_vec = f(vec)
        num = 0
        already_printed = 0
        closest_list = []
        while already_printed < 10:
            #try because of the dropped IDs -> there may be nonexistent entries
            try:
                curr = df['title'][sorted_vec[num]]
                sco = df['score'][sorted_vec[num]]
                closest_list.append((curr, sco, vec[0][sorted_vec[num]]))
                already_printed += 1
            except:
                pass
            num += 1
        #drop the first entry because it's the cosine of the title with itself (score 1)
        closest_title_scores[closest_list[0][0]] = closest_list[1:]
    return closest_title_scores

closest_title_scores = make_closest()

def knearest(title, k=7):
    """
    Given a title, return a sorted list of the k most similar titles
    (with their scores and cosine similarities) from the entire data set.
    """
    return closest_title_scores[title][:k]

#generate the new columns
df['max_cosine'] = df['title'].map(lambda x: 0)
df['avg_cosine'] = df['title'].map(lambda x: 0)
df['min_cosine'] = df['title'].map(lambda x: 0)
df['closest_cosine'] = df['title'].map(lambda x: 0)

#fill in the values of the new columns
for key, value in df.iterrows():
    max_score = 0
    mean_score = 0
    min_score = 0
    closest_score = 0
    try:
        tuple_list = knearest(value['title'])
        closest_score = tuple_list[0][1]
        max_score = max(tuple_list, key=lambda item: item[1])[1]
        min_score = min(tuple_list, key=lambda item: item[1])[1]
        mean_score = np.mean([a[1] for a in tuple_list])
    except:
        pass
    df['max_cosine'][key] = max_score
    df['avg_cosine'][key] = mean_score
    df['min_cosine'][key] = min_score
    df['closest_cosine'][key] = closest_score

#calculate the pearsonr
print "max cosine pearson"
r_row, p_value = pearsonr(df['max_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "avg cosine pearson"
r_row, p_value = pearsonr(df['avg_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "min cosine pearson"
r_row, p_value = pearsonr(df['min_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
print "closest cosine pearson"
r_row, p_value = pearsonr(df['closest_cosine'], df['score'])
print "Pearson coefficient is " + str(r_row) + " with a p-value of " + str(p_value)
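# Sketch only: make_closest() above fully sorts a 50,000-entry similarity row for every
# title. As an assumed alternative (not part of the original pipeline), np.argpartition
# can pull just the top k+1 indices per row without a full sort, and working through X
# in chunks keeps the dense similarity block small in memory.
def top_k_similar(X, k=10, chunk=500):
    top = {}
    for start in range(0, X.shape[0], chunk):
        sims = cosine_similarity(X[start:start + chunk], X)
        for offset, row in enumerate(sims):
            idx = np.argpartition(row, -(k + 1))[-(k + 1):]   # unordered top k+1 indices
            idx = idx[np.argsort(row[idx])[::-1]]             # order them by similarity
            # drop the row itself and keep (index, similarity) pairs for the k neighbours
            top[start + offset] = [(j, row[j]) for j in idx if j != start + offset][:k]
    return top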
spec_probs = []
for i in df.index:
    title = df.title[i]
    subreddit = df.subreddit[i]
    clf = subreddit_ngrams[subreddit][0]
    n_grams_spec = subreddit_ngrams[subreddit][1]
    prob_spec = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    spec_probs.append(prob_spec)

df['spec_probs'] = spec_probs
df.to_csv("Data/new_full.csv", index=False, encoding='utf-8')

def plot_spec_prob(table):
    ''' plots a scatterplot of scores against the subreddit-specific probability '''
    m_fit, b_fit = plt2.polyfit(table.spec_probs, table.score, 1)
    plt2.plot(table.spec_probs, table.score, 'yo',
              table.spec_probs, m_fit * table.spec_probs + b_fit,
              color='red', alpha=.9)
    #p1 = plt.scatter(table.spec_probs, table.score, color='red', alpha = 0.2)
    plt.title("Specific probability against score")
    plt.xlabel("Specific probability")
    plt.ylabel("Score")
    plt.ylim([0, 7000])
    remove_border()

plot_spec_prob(df)

m, b, r, p, std = scipy.stats.linregress(np.array(df['spec_probs']), np.array(df['score']))
print "slope", m
print "intercept", b
print "squared correlation", r**2
print "p-value", p
print "standard error", std

def predict(title):
    #uses the most recently fitted subreddit model (clf, n_grams_spec) and the
    #regression coefficients m, b from the global scope
    x = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    y = m * x + b
    return y

subreddit_svm = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = Perceptron(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"

subreddit_alchemy = {}
for subreddit in subreddits:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2
    blocksize = size / num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

    alch_titles = []
    for title in list(sortedsmalldf['title']):
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        titles = titles.replace('  ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))

    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

    clf = PassiveAggressiveClassifier(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)

    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
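# Sketch only: the alchemy-title cleanup is repeated verbatim in each classifier cell
# above and again in the combined benchmark below. A helper like this (an assumed
# refactoring, not part of the original cells) would keep the string munging in one
# place; it mirrors the inline version: strip brackets and parentheses, keep only
# lowercase letters and spaces, then drop the leading token.
def clean_alchemy_titles(subframe):
    cleaned = []
    for title in list(subframe['title']):
        raw = "".join(lst.replace('(', '').replace(')', '').replace('[', '').replace(']', '')
                      for lst in subframe[subframe['title'] == title]['alchemy'])
        raw = "".join(ch for ch in raw if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        tokens = raw.replace('  ', ' ').split(' ')
        cleaned.append(" ".join(tokens[1:]))
    return cleaned

# e.g. alch_titles = clean_alchemy_titles(sortedsmalldf)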
#this needs a lot of memory! you might get a memory error running it
from sklearn.base import clone

for use_alchemy, d in enumerate(['Not alchemy', 'Alchemy']):
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive")):
        subreddit_svm = {}
        for subreddit in subreddits:
            smalldf = df[df['subreddit'] == subreddit]
            sortedsmalldf = smalldf.sort('score')
            sortedsmalldf['category'] = smalldf['score']
            size = len(smalldf)
            num = 2
            blocksize = size / num
            blocks = [blocksize * i for i in range(num)]
            blocks.append(size)
            for i in range(num):
                sortedsmalldf['category'][blocks[i]:blocks[i + 1]] = i + 1

            titles = list(sortedsmalldf['title'])
            bins = list(sortedsmalldf['category'])

            if use_alchemy == 1:
                #use the cleaned alchemy keywords instead of the raw titles;
                #every keyword becomes its own training example with its title's bin
                alch_titles = []
                for title in list(sortedsmalldf['title']):
                    words = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
                    words = [lst.replace(')', '') for lst in words]
                    words = [lst.replace('[', '') for lst in words]
                    words = [lst.replace(']', '') for lst in words]
                    words = "".join(words)
                    words = "".join(ch for ch in words if ch in 'qwertyuiopasdfghjklzxcvbnm ')
                    words = words.replace('  ', ' ')
                    words = words.split(' ')[1:]
                    alch_titles.append(words)

                alch_bins = []
                categories = np.array(sortedsmalldf['category'])
                for t, lst in enumerate(alch_titles):
                    b = categories[t]
                    for j in range(len(lst)):
                        alch_bins.append(b)

                alch_titles = [word for words in alch_titles for word in words]
                titles = alch_titles
                bins = alch_bins

            n_grams = CountVectorizer(ngram_range=[1, 3])
            n_grams.fit(titles)
            X = n_grams.transform(titles)
            Y = np.array(bins)
            x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)

            #clone the template classifier so every subreddit gets its own freshly fitted model
            clf2 = clone(clf)
            clf2.fit(x_train, y_train)
            subreddit_svm[subreddit] = [clf2, n_grams]
            train_acc = clf2.score(x_train, y_train)
            test_acc = clf2.score(x_test, y_test)

            print "For", d, "and", subreddit, "subreddit and", name, "classifier:"
            print "Training accuracy is", train_acc
            print "Test accuracy is", test_acc
            print "---------------------------------"