from IPython.display import Image
Image(url="http://trackmaven.com/assets/tm_logo_horizontal.png")

import requests

response = requests.get("http://isitlunchtimeyet.com/", auth=('fheisler', 'passw0rd'))
response
print response.text

class RateLimitError(Exception):
    pass

class ClientError(Exception):
    pass

class APIClient(object):
    """
    A generic API client to handle interaction using the requests module;
    throw specific errors based on various responses
    """
    # Should be overridden by implementations, e.g. HTTPError = FacebookHTTPError
    HTTPError = requests.exceptions.HTTPError
    ConnectionError = requests.exceptions.ConnectionError
    Error = ClientError
    RateLimitError = RateLimitError

    def __init__(self):
        """ Throw exceptions if BASE_URI or BASE_PARAMS are not set """
        if not hasattr(self, 'BASE_URI'):
            raise NotImplementedError('Must specify a base uri')
        if not hasattr(self, 'BASE_PARAMS'):
            raise NotImplementedError('Must specify base params')

    def _validate_response(self, response):
        """ Optional response validation """
        pass

    def _validate_error(self, response):
        """
        Optional error validation, for when you want to raise a specific
        exception, for example rate limit exceptions.
        """
        pass

    def _get(self, url, params={}, timeout=10, retries=3):
        """
        Gets a response based on the url and params passed to it;
        will retry (3 times by default) if there is a connection error
        """
        full_url = '{}/{}'.format(self.BASE_URI, url)
        while retries > 0:
            try:
                if hasattr(self, 'BASE_PARAMS'):
                    params.update(self.BASE_PARAMS)
                response = requests.get(full_url, params=params, timeout=timeout)
                try:
                    response.raise_for_status()
                    self._validate_response(response)
                    return response
                except requests.exceptions.HTTPError as e:
                    self._validate_error(response)
                    raise self.HTTPError(e)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                retries -= 1
                if not retries:
                    raise self.ConnectionError(e)

    def _json(self, response):
        """ Try to convert the response to JSON """
        try:
            return response.json()
        except Exception as e:
            raise self.Error(e)
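# A minimal sketch of how an implementation might subclass APIClient. The
# endpoint, token parameter, and resource path below are hypothetical
# placeholders (not a real API), shown only to illustrate the intended use
# of BASE_URI, BASE_PARAMS, _get, and _json.
class ExampleClient(APIClient):
    BASE_URI = 'https://api.example.com/v1'        # hypothetical endpoint
    BASE_PARAMS = {'access_token': 'YOUR_TOKEN'}   # hypothetical auth param

    def get_media(self, account):
        # _get handles base params, retries, and error translation;
        # _json handles decoding (or raises self.Error)
        response = self._get('accounts/{}/media'.format(account))
        return self._json(response)

client = ExampleClient()
# media = client.get_media('trackmaven')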
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

!pwd
!ls

import sqlite3

# Import Instagram picture data from SQL table
connection = sqlite3.connect('instagram.db')
instagram_data = pd.io.sql.read_sql("SELECT * FROM instagram;", con=connection)

instagram_data.head()
instagram_data.dtypes
instagram_data.describe()

# Total social engagement per picture
instagram_data['social_actions'] = instagram_data.likes + instagram_data.comments
instagram_data['social_actions'].median()

instagram_data.social_actions.hist(bins=50);

instagram_data[instagram_data.social_actions > 50000].social_actions.hist(bins=50)
plt.title("Distribution of social actions (50k+)")
plt.ylabel("Number of Instagram pictures")
plt.xlabel("Total likes + comments");

# Top 10 filters by usage
top_filters = instagram_data.groupby('filter').size().order(ascending=False)
(100 * top_filters / float(sum(top_filters)))[:10]

# Top 10 filters by social engagement
social_actions = instagram_data.groupby('filter').social_actions
(social_actions.sum() / social_actions.count()).order(ascending=False)[:10]

# Top pics using the Willow and Sierra filters
filter_top_filters = instagram_data['filter'].isin(["Willow", "Sierra"])
instagram_data[filter_top_filters].sort('social_actions', ascending=False)[:10]

# Most engaging Instagram posters
posters = instagram_data.groupby('account').social_actions
top_posters = (posters.sum() / posters.count()).order(ascending=False)
top_posters[:5]

# Average number of hashtags used by top brands
instagram_data['num_hashtags'] = instagram_data.caption.str.count("#")
tag_counts = instagram_data.groupby('account').num_hashtags
avg_tags = (tag_counts.sum() / tag_counts.count())
pd.concat([top_posters, avg_tags], axis=1).sort('social_actions', ascending=False).num_hashtags[:5]

# Plot most effective number of hashtags
tag_effect = instagram_data.groupby('num_hashtags').social_actions
plt.plot((tag_effect.sum() / tag_effect.count())[:5]);
plt.ylabel("Average number of likes + comments");
plt.xlabel("Number of hashtags used");

# Biggest hashtag users
avg_tags.order(ascending=False)[:5]

# Find the best day of week to post
import arrow

# Create an Arrow timestamp
instagram_data['day'] = instagram_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))
# Format to day of week, 1 through 7
instagram_data.day = instagram_data.day.apply(format, args=("d",))

dow_effect = instagram_data.groupby('day').social_actions
(dow_effect.sum() / dow_effect.count()).plot(kind='bar');
plt.ylabel("Avg likes + comments");
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

import nltk

# Import blog data from CSV file
blog_data = pd.read_csv("blog.csv")
blog_data.head()

# Clean up HTML tags (not quite safe!)
import re
blog_data.summary = blog_data.summary.apply(lambda s: re.sub("<[^<]+?>", "", s))
# Remove multiple spaces
blog_data.summary = blog_data.summary.apply(lambda s: re.sub(' +', ' ', s))
# Remove leading and trailing spaces
blog_data.title = blog_data.title.apply(lambda t: t.strip())
blog_data.summary = blog_data.summary.apply(lambda s: s.strip())

# Decode HTML entities
from lxml import html
blog_data.summary = blog_data.summary.apply(lambda s: html.fromstring(s).text)
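# The regex tag-stripping above is a quick hack, as its comment admits. As a
# sketch of a sturdier alternative (assuming lxml is already a dependency),
# text_content() strips markup and decodes entities in one pass; strip_markup
# is a hypothetical helper, not part of the original pipeline.
def strip_markup(raw):
    return html.fromstring(raw).text_content().strip() if raw and raw.strip() else raw
# e.g. blog_data.summary = blog_data.summary.apply(strip_markup)
# would replace the regex, whitespace, and entity-decoding steps above.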
# Collapse social shares into one metric
blog_data['shares'] = (blog_data.fb_likes + blog_data.fb_shares +
                       blog_data.linkedin_shares + blog_data.pins + blog_data.tweets)
blog_data = blog_data.drop(['fb_likes', 'fb_shares', 'linkedin_shares', 'pins', 'tweets'], axis=1)
blog_data.shares.describe()

# Top performing blog post
blog_data[blog_data.shares == blog_data.shares.max()]

# Concatenate all titles into one bag of words
title_word_bag = blog_data.title.apply(lambda t: t + " ").sum()

# Top 10 most common words
from collections import Counter
Counter(title_word_bag.split()).most_common()[:10]

# Top 10 most common non-stopwords
stopwords = [unicode(word) for word in nltk.corpus.stopwords.words('english')]
title_words = [word for word in title_word_bag.split() if word.lower() not in stopwords]
Counter(title_words).most_common()[:10]

bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(title_words)

# Filter to only bigrams that appear 20+ times
bigram_finder.apply_freq_filter(20)
bigram_finder.score_ngrams(bigram_measures.raw_freq)[:10]

# Top 10 bigrams with the highest PMI (pointwise mutual information)
bigram_finder.nbest(bigram_measures.pmi, 10)

# Examine "top list"-type posts; titles begin with a number
blog_data['list_post'] = blog_data.title.apply(lambda t: t[0].isdigit())

# How many posts are "top list" posts?
float(blog_data.list_post.sum()) / blog_data.list_post.count()

# How effective are list-type posts?
list_effect = blog_data.groupby('list_post').shares
(list_effect.sum() / list_effect.count()).plot(kind='bar');
plt.ylabel("Avg social shares");

# Examine "question"-type posts; title ends with a "?"
blog_data['question_post'] = blog_data.title.apply(lambda t: t[-1] == "?")

# How many posts are "question" posts?
float(blog_data.question_post.sum()) / blog_data.question_post.count()

# How effective are question posts?
question_effect = blog_data.groupby('question_post').shares
(question_effect.sum() / question_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Do you want to find out what these amazing questions are?
blog_data[blog_data.question_post].sort('shares', ascending=False).title[:10]

# Examine posts with "actually" in the title
blog_data['actually'] = blog_data.title.apply(lambda t: "actually" in t.lower())

# How many posts are "actually" posts?
float(blog_data.actually.sum()) / blog_data.actually.count()

# But actually, how effective are they?
actual_effect = blog_data.groupby('actually').shares
(actual_effect.sum() / actual_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Examine post title length
blog_data['title_length'] = blog_data.title.apply(lambda t: len(t))

# Distribution of title lengths
blog_data.title_length.hist();
plt.ylabel("Number of posts");
plt.xlabel("Number of characters in title");

# Most effective title lengths
title_len_effect = blog_data.groupby('title_length').shares
plt.plot((title_len_effect.sum() / title_len_effect.count()));
plt.ylabel("Average shares");
plt.xlabel("Number of characters in title");

# Examine post summary length
blog_data['summary_length'] = blog_data.summary.apply(lambda t: len(t))

# Distribution of summary lengths
blog_data.summary_length.hist(bins=50);
plt.xlabel("Number of characters in summary")
plt.ylabel("Number of posts");

# Highly skewed distribution; save a log-transformed summary length for later
blog_data['summary_log_len'] = blog_data.summary_length.apply(np.log)
blog_data.summary_log_len.hist();

# Bin summary lengths
bins = range(0, 3000, 100)
blog_data['binned_summary_length'] = pd.cut(blog_data.summary_length, bins=bins, labels=bins[1:])

# Most effective post summary lengths
summary_len_effect = blog_data.groupby('binned_summary_length').shares
plt.plot(bins[1:], summary_len_effect.sum() / summary_len_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Number of characters in summary");

import arrow

# Convert timestamp to arrow object for manipulation
blog_data['timestamp'] = blog_data.timestamp.apply(arrow.get, args=("YYMMDD HH:mm",))

# Day of week distribution
blog_data['dow'] = blog_data.timestamp.apply(lambda ts: int(ts.format('d')))
blog_data.dow.hist(bins=7, range=(0,8));
plt.ylabel("Total number of posts")
plt.xticks(range(8), ("", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

# Day of week effectiveness
dow_effect = blog_data.groupby('dow').shares
(dow_effect.sum() / dow_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares")
plt.xticks(range(7), ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"));

# Time of day distribution (hourly)
blog_data['tod'] = blog_data.timestamp.apply(lambda ts: int(ts.format('HH')))
blog_data.tod.hist(bins=24, range=(0,24));
plt.ylabel("Number of posts");
plt.xlabel("Hour of day (ET)");

# Time of day effectiveness
tod_effect = blog_data.groupby('tod').shares
(tod_effect.sum() / tod_effect.count()).plot(kind='bar')
plt.ylabel("Avg social shares");

# Prepare readability scores based on Flesch-Kincaid Grade Level
from re import match

# Load Carnegie Mellon Pronouncing Dictionary
cmu = nltk.corpus.cmudict.dict()

def reduce_word(word):
    # Lowercase and keep only word characters
    return ''.join([x for x in word.lower() if match(r'\w', x)])

def get_syllable_count(word):
    # Count stress markers (digits) in the word's CMU pronunciation
    word = reduce_word(word)
    if (not len(word)) or (word not in cmu):
        return 0
    return len([x for x in list(''.join(list(cmu[word])[-1])) if match(r'\d', x)])

def get_grade_level(text):
    """Flesch-Kincaid Grade Level formula"""
    sentences = nltk.tokenize.sent_tokenize(text)
    sentence_count = len(sentences)
    word_count = 0
    syllable_count = 0
    for sentence in sentences:
        words = nltk.tokenize.word_tokenize(sentence)
        words = [reduce_word(word) for word in words]
        words = [word for word in words if word != '']
        word_count += len(words)
        syllable_count += sum([get_syllable_count(word) for word in words])
    if word_count == 0:
        return 0
    word_count = float(word_count)
    return (0.39 * (word_count / sentence_count)
            + 11.8 * (syllable_count / word_count)
            - 15.59)

blog_data['grade_level'] = blog_data.summary.apply(get_grade_level)

# Distribution of summary grade-level scores
blog_data.grade_level.hist(bins=30, range=(-10,20));
blog_data.grade_level.describe()

# Bin grade levels
bins = range(-10, 20, 5)
blog_data['binned_grade_level'] = pd.cut(blog_data.grade_level, bins=bins, labels=bins[1:])

grade_lvl_effect = blog_data.groupby('binned_grade_level').shares
plt.plot(bins[1:], grade_lvl_effect.sum() / grade_lvl_effect.count());
plt.ylabel("Average shares");
plt.xlabel("Flesch-Kincaid grade level");

# What are these negative scores?
blog_data.sort(['grade_level', 'shares'], ascending=[True, False])[['title', 'summary', 'grade_level', 'shares']][:10]

# Randomly shuffle rows (permuting the index keeps each row intact)
blog_data = blog_data.reindex(np.random.permutation(blog_data.index))

# Use these columns as features
feature_list = ['list_post',
                'question_post',
                'actually',
                'title_length',
                'summary_log_len',
                'dow',
                'tod',
                'grade_level',
                ]

# Prepare only the columns we need (features + target)
reduced_blog_data = blog_data[feature_list + ['shares']]

# Check for NaN's
reduced_blog_data.isnull().any()

from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)

# Sample 80% of the data for training; keep 20% for testing
train_prop = int(.8 * len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = training_set.shares
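# Note: an equivalent shuffled split can be done in one call with scikit-learn's
# train_test_split helper (found in sklearn.cross_validation in the releases
# current when this was written; later moved to sklearn.model_selection).
# This is only a sketch of the alternative; the manual slice above is what the
# rest of the notebook uses.
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(
    norm_blog_data[feature_list], norm_blog_data.shares, test_size=0.2)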
from sklearn import linear_model

# Fit a linear regression model to the training data using stochastic gradient descent
# (probably not a great idea; features are likely to be highly correlated)
clf = linear_model.SGDRegressor()
clf.fit(features, target)

# Predict results of testing set to measure accuracy
predicted_shares = clf.predict(testing_set[feature_list])

from sklearn.metrics import r2_score

# Measure the accuracy of the predictions
r2_score(testing_set.shares, predicted_shares)

# Bin the number of shares into buckets
bins = [0, 1e3, 1e4, 1e5, blog_data.shares.max()]
blog_data['binned_shares'] = pd.cut(blog_data.shares, bins=bins, labels=bins[1:])

# Check the distribution of binned shares
blog_data.groupby('binned_shares').size()

# As before, prepare the data...
reduced_blog_data = blog_data[feature_list]  # + ['binned_shares']

scaler = StandardScaler().fit(reduced_blog_data.astype(np.float))
norm_array = scaler.transform(reduced_blog_data.astype(np.float))
norm_blog_data = pd.DataFrame(norm_array, columns=reduced_blog_data.columns)
norm_blog_data.isnull().any()

train_prop = int(.8 * len(norm_blog_data))
training_set = norm_blog_data[:train_prop]
testing_set = norm_blog_data[train_prop:]

features = training_set[feature_list]
target = blog_data[:train_prop].binned_shares.astype(str)
true_test_shares = blog_data[train_prop:].binned_shares.astype(str)

# Fit a classifier on the binned shares and predict
clf = linear_model.SGDClassifier()
clf.fit(features, target)
predicted_shares = clf.predict(testing_set)

from sklearn.metrics import accuracy_score

# Measure the accuracy of binned predictions among the 4 categories
accuracy_score(true_test_shares, predicted_shares)
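# Accuracy alone hides which share buckets get confused with which; as a quick
# follow-up sketch, a confusion matrix from scikit-learn breaks the same
# predictions down by bucket.
from sklearn.metrics import confusion_matrix
confusion_matrix(true_test_shares, predicted_shares)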
Image(url="http://scikit-learn.org/stable/_static/ml_map.png")