from IPython.display import Image
Image(url="http://trackmaven.com/assets/tm_logo_horizontal.png")

import requests

response = requests.get("http://isitlunchtimeyet.com/", auth=('fheisler', 'passw0rd'))
response
print response.text

class RateLimitError(Exception):
    pass

class ClientError(Exception):
    pass

class APIClient(object):
    """
    A generic API client to handle interaction using the requests module;
    throw specific errors based on various responses
    """
    # Should be overridden by implementations, e.g. HTTPError = FacebookHTTPError
    HTTPError = requests.exceptions.HTTPError
    ConnectionError = requests.exceptions.ConnectionError
    Error = ClientError
    RateLimitError = RateLimitError

    def __init__(self):
        """ Throw exceptions if BASE_URI or BASE_PARAMS are not set """
        if not hasattr(self, 'BASE_URI'):
            raise NotImplementedError('Must specify a base uri')
        if not hasattr(self, 'BASE_PARAMS'):
            raise NotImplementedError('Must specify base params')

    def _validate_response(self, response):
        """ Optional response validation """
        pass

    def _validate_error(self, response):
        """
        Optional error validation, for when you want to raise a specific
        exception, for example rate limit exceptions.
        """
        pass

    def _get(self, url, params={}, timeout=10, retries=3):
        """
        Gets a response based on the url and params passed to it;
        will retry (3 times by default) if there is a connection error
        """
        full_url = '{}/{}'.format(self.BASE_URI, url)
        while retries > 0:
            try:
                if hasattr(self, 'BASE_PARAMS'):
                    params.update(self.BASE_PARAMS)
                response = requests.get(full_url, params=params, timeout=timeout)
                try:
                    response.raise_for_status()
                    self._validate_response(response)
                    return response
                except requests.exceptions.HTTPError as e:
                    self._validate_error(response)
                    raise self.HTTPError(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                retries -= 1
                if not retries:
                    raise self.ConnectionError(e)

    def _json(self, response):
        """ Try to convert the response to JSON """
        try:
            return response.json()
        except Exception as e:
            raise self.Error(e)
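# A minimal sketch of how an implementation might subclass APIClient. The
# endpoint, token parameter, and resource path below are hypothetical
# placeholders (not a real API), shown only to illustrate the intended use
# of BASE_URI, BASE_PARAMS, _get, and _json.
class ExampleClient(APIClient):
    BASE_URI = 'https://api.example.com/v1'        # hypothetical endpoint
    BASE_PARAMS = {'access_token': 'YOUR_TOKEN'}   # hypothetical auth param

    def get_media(self, account):
        # _get handles base params, retries, and error translation;
        # _json handles decoding (or raises self.Error)
        response = self._get('accounts/{}/media'.format(account))
        return self._json(response)

client = ExampleClient()
# media = client.get_media('trackmaven')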
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pwd
!ls

import sqlite3

# Import Instagram picture data from SQL table
connection = sqlite3.connect('instagram.db')
instagram_data = pd.io.sql.read_sql("SELECT * FROM instagram;", con=connection)

instagram_data.head()
instagram_data.dtypes
instagram_data.describe()

# Total social engagement per picture
instagram_data['social_actions'] = instagram_data.likes + instagram_data.comments
instagram_data['social_actions'].median()

instagram_data.social_actions.hist(bins=50);

instagram_data[instagram_data.social_actions > 50000].social_actions.hist(bins=50)
plt.title("Distribution of social actions (50k+)")
plt.ylabel("Number of Instagram pictures")
plt.xlabel("Total likes + comments");

# Top 10 filters by usage
top_filters = instagram_data.groupby('filter').size().order(ascending=False)
(100 * top_filters / float(sum(top_filters)))[:10]

# Top 10 filters by social engagement
social_actions = instagram_data.groupby('filter').social_actions
(social_actions.sum() / social_actions.count()).order(ascending=False)[:10]

# Top pics using the Willow and Sierra filters
filter_top_filters = instagram_data['filter'].isin(["Willow", "Sierra"])
instagram_data[filter_top_filters].sort('social_actions', ascending=False)[:10]

# Most engaging Instagram posters
posters = instagram_data.groupby('account').social_actions
top_posters = (posters.sum() / posters.count()).order(ascending=False)
top_posters[:5]

# Average number of hashtags used by top brands
instagram_data['num_hashtags'] = instagram_data.caption.str.count("#")
tag_counts = instagram_data.groupby('account').num_hashtags
avg_tags = (tag_counts.sum() / tag_counts.count())
pd.concat([top_posters, avg_tags], axis=1).sort('social_actions', ascending=False).num_hashtags[:5]

# Plot most effective number of hashtags
tag_effect = instagram_data.groupby('num_hashtags').social_actions
plt.plot((tag_effect.sum() / tag_effect.count())[:5]);
plt.ylabel("Average number of likes + comments");
plt.xlabel("Number of hashtags used");

# Biggest hashtag users
avg_tags.order(ascending=False)[:5]

# Find the best day of week to post
import arrow

# Create an Arrow timestamp
instagram_data['day'] = instagram_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))
# Format to day of week, 1 through 7
instagram_data.day = instagram_data.day.apply(format, args=("d",))

dow_effect = instagram_data.groupby('day').social_actions
(dow_effect.sum() / dow_effect.count()).plot(kind='bar');
plt.ylabel("Avg likes + comments");
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

import nltk

# Import blog data from CSV file
blog_data = pd.read_csv("blog.csv")
blog_data.head()

# Clean up HTML tags (not quite safe!)
import re
blog_data.summary = blog_data.summary.apply(lambda s: re.sub("<[^<]+?>", "", s))
# Remove multiple spaces
blog_data.summary = blog_data.summary.apply(lambda s: re.sub(' +', ' ', s))
# Remove leading and trailing spaces
blog_data.title = blog_data.title.apply(lambda t: t.strip())
blog_data.summary = blog_data.summary.apply(lambda s: s.strip())

# Decode HTML entities
from lxml import html
blog_data.summary = blog_data.summary.apply(lambda s: html.fromstring(s).text)
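# The regex tag-stripping above is a quick hack, as its comment admits. As a
# sketch of a sturdier alternative (assuming lxml is already a dependency),
# text_content() strips markup and decodes entities in one pass; strip_markup
# is a hypothetical helper, not part of the original pipeline.
def strip_markup(raw):
    return html.fromstring(raw).text_content().strip() if raw and raw.strip() else raw
# e.g. blog_data.summary = blog_data.summary.apply(strip_markup)
# would replace the regex, whitespace, and entity-decoding steps above.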
# Collapse social shares into one metric
blog_data['shares'] = (blog_data.fb_likes + blog_data.fb_shares +
                       blog_data.linkedin_shares + blog_data.pins + blog_data.tweets)
blog_data = blog_data.drop(['fb_likes', 'fb_shares', 'linkedin_shares', 'pins', 'tweets'], axis=1)
blog_data.shares.describe()

# Top performing blog post
blog_data[blog_data.shares == blog_data.shares.max()]

# Concatenate all titles into one bag of words
title_word_bag = blog_data.title.apply(lambda t: t + " ").sum()

# Top 10 most common words
from collections import Counter
Counter(title_word_bag.split()).most_common()[:10]

# Top 10 most common non-stopwords
stopwords = [unicode(word) for word in nltk.corpus.stopwords.words('english')]
title_words = [word for word in title_word_bag.split() if word.lower() not in stopwords]
Counter(title_words).most_common()[:10]

bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(title_words)

# Filter to only bigrams that appear 20+ times
bigram_finder.apply_freq_filter(20)
bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]

# Top 10 bigrams with the highest PMI (pointwise mutual information)
bigram_finder.nbest(bigram_measures.pmi, 10)

# Examine "top list"-type posts; titles begin with a number
blog_data['list_post'] = blog_data.title.apply(lambda t: t[0].isdigit())

# How many posts are "top list" posts?
float(blog_data.list_post.sum()) / blog_data.list_post.count()

# How effective are list-type posts?
list_effect = blog_data.groupby('list_post').shares
(list_effect.sum() / list_effect.count()).plot(kind='bar');
plt.ylabel("Avg social shares");

# Examine "question"-type posts; title ends with a "?"
blog_data['question_post'] = blog_data.title.apply(lambda t: t[-1] == "?")

# How many posts are "question" posts?
float(blog_data.question_post.sum()) / blog_data.question_post.count()

# How effective are question posts?
question_effect = blog_data.groupby('question_post').shares
(question_effect.sum() / question_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Do you want to find out what these amazing questions are?
blog_data[blog_data.question_post].sort('shares', ascending=False).title[:10]

# Examine posts with "actually" in the title
blog_data['actually'] = blog_data.title.apply(lambda t: "actually" in t.lower())

# How many posts are "actually" posts?
float(blog_data.actually.sum()) / blog_data.actually.count()

# But actually, how effective are they?
actual_effect = blog_data.groupby('actually').shares
(actual_effect.sum() / actual_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Examine post title length
blog_data['title_length'] = blog_data.title.apply(lambda t: len(t))

# Distribution of title lengths
blog_data.title_length.hist();
plt.ylabel("Number of posts");
plt.xlabel("Number of characters in title");

# Most effective title lengths
title_len_effect = blog_data.groupby('title_length').shares
plt.plot((title_len_effect.sum() / title_len_effect.count()));
plt.ylabel("Average shares");
plt.xlabel("Number of characters in title");

# Examine post summary length
blog_data['summary_length'] = blog_data.summary.apply(lambda t: len(t))

# Distribution of summary lengths
blog_data.summary_length.hist(bins=50);
plt.xlabel("Number of characters in summary")
plt.ylabel("Number of posts");

# Highly skewed distribution; save a log-transformed summary length for later
blog_data['summary_log_len'] = blog_data.summary_length.apply(np.log)
blog_data.summary_log_len.hist();

# Bin summary lengths
bins = range(0, 3000, 100)
blog_data['binned_summary_length'] = pd.cut(blog_data.summary_length, bins=bins, labels=bins[1:])

# Most effective post summary lengths
summary_len_effect = blog_data.groupby('binned_summary_length').shares
plt.plot(bins[1:], summary_len_effect.sum() / summary_len_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Number of characters in summary");

import arrow

# Convert timestamp to arrow object for manipulation
blog_data['timestamp'] = blog_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))

# Day of week distribution
blog_data['dow'] = blog_data.timestamp.apply(lambda ts: int(ts.format('d')))
blog_data.dow.hist(bins=7, range=(0,8));
plt.ylabel("Total number of posts")
plt.xticks(range(8), ("", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

# Day of week effectiveness
dow_effect = blog_data.groupby('dow').shares
(dow_effect.sum() / dow_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares")
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

# Time of day distribution (hourly)
blog_data['tod'] = blog_data.timestamp.apply(lambda ts: int(ts.format('HH')))
blog_data.tod.hist(bins=24, range=(0,24));
plt.ylabel("Number of posts");
plt.xlabel("Hour of day (ET)");

# Time of day effectiveness
tod_effect = blog_data.groupby('tod').shares
(tod_effect.sum() / tod_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Prepare readability scores based on Flesch-Kincaid Grade Level
from re import match

# Load Carnegie Mellon Pronouncing Dictionary
cmu = nltk.corpus.cmudict.dict()

def reduce_word(word):
    # Lowercase and keep only word characters
    return ''.join([x for x in word.lower() if match(r'\w', x)])

def get_syllable_count(word):
    # Count stress markers (digits) in the word's CMU pronunciation
    word = reduce_word(word)
    if (not len(word)) or (word not in cmu):
        return 0
    return len([x for x in list(''.join(list(cmu[word])[-1])) if match(r'\d', x)])

def get_grade_level(text):
    """Flesch-Kincaid Grade Level formula"""
    sentences = nltk.tokenize.sent_tokenize(text)
    sentence_count = len(sentences)
    word_count = 0
    syllable_count = 0
    for sentence in sentences:
        words = nltk.tokenize.word_tokenize(sentence)
        words = [reduce_word(word) for word in words]
        words = [word for word in words if word != '']
        word_count += len(words)
        syllable_count += sum([get_syllable_count(word) for word in words])
    if word_count == 0:
        return 0
    word_count = float(word_count)
    return (0.39 * (word_count / sentence_count)
            + 11.8 * (syllable_count / word_count)
            - 15.59)

blog_data['grade_level'] = blog_data.summary.apply(get_grade_level)

# Distribution of summary grade-level scores
blog_data.grade_level.hist(bins=30, range=(-10,20));
blog_data.grade_level.describe()

# Bin grade levels
bins = range(-10, 20, 5)
blog_data['binned_grade_level'] = pd.cut(blog_data.grade_level, bins=bins, labels=bins[1:])

grade_lvl_effect = blog_data.groupby('binned_grade_level').shares
plt.plot(bins[1:], grade_lvl_effect.sum() / grade_lvl_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Flesch-Kincaid grade level");

# What are these negative scores?
blog_data.sort(['grade_level', 'shares'], ascending=[True, False])[['title', 'summary', 'grade_level', 'shares']][:10]

# Randomly shuffle rows (permuting the index keeps each row intact)
blog_data = blog_data.reindex(np.random.permutation(blog_data.index))

# Use these columns as features
feature_list = ['list_post',
                'question_post',
                'actually',
                'title_length',
                'summary_log_len',
                'dow',
                'tod',
                'grade_level',
                ]

# Prepare only the columns we need (features + target)
reduced_blog_data = blog_data[feature_list + ['shares']]

# Check for NaN's
reduced_blog_data.isnull().any()

from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)

# Sample 80% of the data for training; keep 20% for testing
train_prop = int(.8 * len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = training_set.shares
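# Note: an equivalent shuffled split can be done in one call with scikit-learn's
# train_test_split helper (found in sklearn.cross_validation in the releases
# current when this was written; later moved to sklearn.model_selection).
# This is only a sketch of the alternative; the manual slice above is what the
# rest of the notebook uses.
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(
    norm_blog_data[feature_list], norm_blog_data.shares, test_size=0.2)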
from sklearn import linear_model

# Fit a linear regression model to the training data using stochastic gradient descent
# (probably not a great idea; features are likely to be highly correlated)
clf = linear_model.SGDRegressor()
clf.fit(features, target)

# Predict results of testing set to measure accuracy
predicted_shares = clf.predict(testing_set[feature_list])

from sklearn.metrics import r2_score

# Measure the accuracy of the predictions
r2_score(testing_set.shares, predicted_shares)

# Bin the number of shares into buckets
bins = [0, 1e3, 1e4, 1e5, blog_data.shares.max()]
blog_data['binned_shares'] = pd.cut(blog_data.shares, bins=bins, labels=bins[1:])

# Check the distribution of binned shares
blog_data.groupby('binned_shares').size()

# As before, prepare the data...
reduced_blog_data = blog_data[feature_list]  # + ['binned_shares']

scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)
norm_blog_data.isnull().any()

train_prop = int(.8 * len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = blog_data[:train_prop].binned_shares.astype(str)
true_test_shares = blog_data[train_prop:].binned_shares.astype(str)

# Fit a classifier on the binned shares and predict
clf = linear_model.SGDClassifier()
clf.fit(features, target)
predicted_shares = clf.predict(testing_set)

from sklearn.metrics import accuracy_score

# Measure the accuracy of binned predictions among the 4 categories
accuracy_score(true_test_shares, predicted_shares)
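# Accuracy alone hides which share buckets get confused with which; as a quick
# follow-up sketch, a confusion matrix from scikit-learn breaks the same
# predictions down by bucket.
from sklearn.metrics import confusion_matrix
confusion_matrix(true_test_shares, predicted_shares)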
Image(url="http://scikit-learn.org/stable/_static/ml_map.png")