import pandas as pd
import json
import os
import urllib
import urllib2
import numpy as np
import unicodedata
from myalchemy import MyAlchemy

# Since we only get so many Alchemy API calls, we might as well not call
# Alchemy on duplicate posts shared between subreddits. We'll merge the
# results back into the full data set later.
#df = pd.read_csv('Data/full.csv', encoding='utf-8')
#print "Original size of data set is", len(df)
#df = df.drop_duplicates('id')  # Keep only unique post ids so we don't waste Alchemy calls
#print "Size of data set with only unique posts is", len(df)
#subs = list(df['subreddit'].unique())
#dflen = len(df)
#df['alchemy'] = ['null']*dflen

# The deduplicated data set (with any Alchemy results gathered so far)
# lives in uniqueentries.csv; the merge step at the bottom needs it loaded.
df = pd.read_csv('Data/uniqueentries.csv', encoding='utf-8')

file_dir = "Data/combinedcomments/"
path, dirs, files = os.walk(file_dir).next()
csvfiles = [file_dir + i for i in files if ".csv" in i]  # Build a list of the .csv files
csvfiles.sort()

def check_null(x):
    '''Returns True if x is a usable string, False if it is NaN (or any
    other value np.isnan accepts). The name is historical: it really
    answers "is this NOT null?".'''
    try:
        np.isnan(x)
        return False
    except:
        return True

def alchemy_comments(df, start, apikey, csvfiles, end=None):
    '''Runs the Alchemy keyword analysis on the comment files for rows
    start..end of df, storing each result in df['alchemy'].'''
    # end defaults at call time, not definition time, so df need not exist
    # when the function is defined.
    if end is None:
        end = len(df)
    p = MyAlchemy(apikey)
    dfids = list(df.index)
    for i in range(start, end):
        subrequest = df['subreddit'][dfids[i]]
        commentfile = ''
        for comb in csvfiles:
            if subrequest in comb:
                commentfile = comb
        commentdf = pd.read_csv(commentfile, encoding='utf-8')
        commentdf = commentdf.drop('type', axis=1)
        commentdf = commentdf.drop_duplicates()
        commentdf = commentdf[commentdf['comment'].apply(check_null)]
        commentdf['comment'] = commentdf['comment'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
        commentdf = commentdf[commentdf['post'] == df['id'][dfids[i]]]
        comments = list(commentdf['comment'])
        # Add the title to the Alchemy call
        comments.append(df['title'][dfids[i]])
        # Add the self text to the Alchemy call, if there is any
        if check_null(df['selftext'][dfids[i]]):
            comments.append(df['selftext'][dfids[i]])
        # Both joining the comments and sending Alchemy calls can be problematic
        try:
            comments = ' '.join(comments)
            if len(comments) > 8000:
                comments = comments[0:7999]
        except:
            print "Comment join error", comments
        # I'm not sure what Alchemy does once you reach the cap... so you
        # might want to check if the result is null or something here.
        try:
            df['alchemy'][dfids[i]] = p.run_method(comments, 'keywords', {'keywordExtractMode': 'strict'})
        except:
            print "Alchemy error", df['id'][dfids[i]]
    return df

# Alchemy keys
apikey1 = "dcac82649daaa2627ee783b25779cfaed4af0067"  # Jay's key
apikey2 = "e945cef59338f9e8e7bc962badde170e623fb7e5"  # Basti's key
apikey3 = "cb736ca44e57cd6764b70ec86886f4fce8f6a68d"  # Serguei's key

#df = alchemy_comments(df, 25000, apikey2, csvfiles, end=25992)
#df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')

# Merge the Alchemy results back onto the full (duplicate-containing) data set.
fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['alchemy'] = ['null']*len(fulldf)
for i in fulldf.index:
    fid = fulldf['id'][i]
    # .values[0] pulls the scalar out of the one-row selection; without it
    # we would be assigning a whole Series into the cell.
    alc = df[df['id'] == fid]['alchemy'].values[0]
    fulldf['alchemy'][i] = alc
fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')

# Other Alchemy methods we could run:
#print p.run_method(comments, 'concepts')
#print p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})
#print p.run_method(comments, 'category')
#print p.run_method(comments, 'sentiment')
#print p.run_method(comments, 'entities')
#print p.run_method(reddit_base, 'urlkeywords')
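
# --- Sketch: checking the Alchemy response for quota errors --------------
# The loop above notes that it's unclear what Alchemy does once you hit
# the cap. A minimal sketch, ASSUMING run_method returns the parsed
# AlchemyAPI JSON response as a dict: the legacy AlchemyAPI reported a
# 'status' field ('OK'/'ERROR') and a 'statusInfo' message such as
# 'daily-transaction-limit-exceeded'. The helper name is hypothetical.
def alchemy_result_ok(result):
    '''Returns True if the response looks like a successful call. If
    run_method returns a raw JSON string instead of a dict, parse it
    with json.loads first.'''
    if not isinstance(result, dict):
        return False
    if result.get('status') != 'OK':
        # e.g. statusInfo == 'daily-transaction-limit-exceeded' once the
        # key's daily quota is exhausted
        print "Alchemy returned error:", result.get('statusInfo')
        return False
    return True

# Inside alchemy_comments, the call could then become:
#     result = p.run_method(comments, 'keywords', {'keywordExtractMode': 'strict'})
#     if alchemy_result_ok(result):
#         df['alchemy'][dfids[i]] = result
#     else:
#         break  # stop burning calls on an exhausted key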
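
# --- Sketch: chunked runs with checkpointing ------------------------------
# A possible driver for working through the data set in slices, saving a
# checkpoint after each slice so a crash or an exhausted key doesn't lose
# work. The slice size (1000) and the key rotation are assumptions; adjust
# them to whatever each key's daily quota actually allows.
#keys = [apikey1, apikey2, apikey3]
#chunk = 1000
#start = 25000  # resume point from the last completed run
#for n, key in enumerate(keys):
#    lo = start + n * chunk
#    hi = min(lo + chunk, len(df))
#    df = alchemy_comments(df, lo, key, csvfiles, end=hi)
#    df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')
#    if hi == len(df):
#        break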