This is a step-by-step guide to using an image-analysis SDK together with a "bag of words" approach to train a Naive Bayes classifier on Instagram images and their captions.
#if you don't have the python-instagram library, run 'pip install python-instagram'.
# Fill in your own credentials below; empty strings are placeholders so this file parses.
client_id = ''      # Your Client ID goes here
client_secret = ''  # Your Client Secret goes here
access_token = ''   # Your Instagram access token goes here, it can be obtained at http://www.pinceladasdaweb.com.br/instagram/access-token/
# The import must come before InstagramAPI is first used (the original raised NameError).
from instagram.client import InstagramAPI
# Two clients: one authenticated with the user's access token, one with app credentials.
apiauth = InstagramAPI(access_token=access_token)
api = InstagramAPI(client_id=client_id, client_secret=client_secret)
user_id = ''  # some instagram user id goes here (placeholder so the file parses)
user_media = []
# user_recent_media returns a (media_list, next_page_url) tuple.
tmpmedia = api.user_recent_media(user_id=user_id, count=33)
tmpmedia[0]  # bare expression: displays the first page in a notebook cell
for m in tmpmedia[0]:
    user_media.append(m)
from urlparse import urlparse
# Pull the pagination cursor (max_id) out of the next-page URL's query string.
parsed = urlparse(tmpmedia[1])
params = {a: b for a, b in [x.split('=') for x in parsed.query.split('&')]}
int(params['max_id'].split('_')[0])  # displays the numeric cursor in a notebook cell
#you may run into rate limiting here, if you are trying to acquire a large dataset. If so, perhaps lower the range argument.
for i in range(100):
    max_id = int(params['max_id'].split('_')[0])
    # Ask for the page strictly older than everything collected so far.
    tmpmedia = api.user_recent_media(user_id=user_id, max_id=max_id - 1, count=33)
    for m in tmpmedia[0]:
        user_media.append(m)
    if not tmpmedia[1]:
        break  # no further pages: stop before urlparse chokes on a missing URL
    parsed = urlparse(tmpmedia[1])
    params = {a: b for a, b in [x.split('=') for x in parsed.query.split('&')]}
import urllib2
imagedir = ''  # PATH to the directory where you want to save the .jpg images. ex. '/home/user/igimages/'
for m in user_media:
    # Download the standard-resolution image and save it as <media id>.jpg
    f = urllib2.urlopen(m.images['standard_resolution'].url)
    data = f.read()
    f.close()  # release the HTTP connection (the original leaked it)
    with open(imagedir + m.id + '.jpg', "wb") as code:
        code.write(data)
#The above for-loop will save all of the pictures with the media id as the name. ex. '866246441730669483_11404563.jpg'
# If you would like to reference the actual media item associated with a media-id, you can use this helper function:
def getmediaitem(curlist, media_id):
    """Return the media item in curlist whose .id equals media_id, or None.

    The original carried a redundant `else: continue` and relied on an
    implicit None fall-through; both are cleaned up here.
    """
    for m in curlist:
        if m.id == media_id:
            return m
    return None
# Pass the list you want to look through, in this case 'user_media', and whichever media-id you want to examine.
# The function will return that item (or None if no match is found).
mediaobject = getmediaitem(user_media, '866246441730669483_11404563')
mediaobject  # bare expression: displays the media object in a notebook cell
Navigate to https://github.com/jetpacapp/DeepBeliefSDK and clone the repository. For the sake of this example code, I used the SimpleLinux examples to run analysis against a hard-coded repository. You can configure this very powerful library yourself, however, if you so desire. Once it is installed, you can run the DeepBelief analysis on your Instagram Image files.
NOTE: Because this is a simple example, the reference to the jetpac.ntwk file is hard-coded, meaning you MUST run this type of command from the /DeepBeliefSDK/examples/SimpleLinux/ directory. For the shell script below to work, move all your instagram images into the /DeepBeliefSDK/examples/SimpleLinux/ directory.
Here is a little shell script that I used to crank out the text files:
#!/usr/bin/env bash
# For every .jpg in the current directory, run DeepBelief, keep the 25
# highest-scoring labels, and write them to <image>.jpg.txt.
# Variables are quoted so filenames containing spaces do not break the loop.
for file in *.jpg
do
    ./deepbelief "$file" | sort -k2nr | head -25 > "$file.txt"
done
This makes text files with the name of the photograph as the first part. (ex. 866246441730669483_11404563.jpg.txt)
#Start by getting the like_counts of all posts.
like_counts = [m.like_count for m in user_media]
#Let's plot the data into a boxplot to break down the five areas we want to categorize:
#(%pylab inline is an IPython magic: it pulls numpy in as np and matplotlib's boxplot into the namespace)
%pylab inline
boxplot(like_counts,0,'')
#to give you the exact numbers in the boxplot
median = np.median(like_counts)
upper_quartile = np.percentile(like_counts, 75)
lower_quartile = np.percentile(like_counts, 25)
iqr = upper_quartile - lower_quartile
#upper whisker is the standard Tukey fence: Q3 + 1.5*IQR
upper_whisker = upper_quartile + 1.5*iqr
print lower_quartile, median, upper_quartile, upper_whisker, iqr
#Here is a sorting function that will help sort our instagram posts into quartile like_count bins. When calling this function,
#pass in the values you got from the cell directly above.
#Here you could write your own function if you wanted to, say, divide the posts into only two groups, counts above the median,
#or counts below the median. The amount of bins you create is up to you.
def sortintoquartiles(num, lq, median, uq, uw):
    """Map a like count onto its quartile bin label.

    Returns 'lw', 'lq', 'uq', 'uw', or 'ol' (outlier) depending on where
    num falls relative to the supplied boundaries; negative counts fall
    through and yield None, exactly as before.
    """
    if num < 0:
        return None
    if num < lq:
        return 'lw'
    if num < median:
        return 'lq'
    if num < uq:
        return 'uq'
    if num < uw:
        return 'uw'
    return 'ol'
#lw: Lower Whisker
#lq: Lower Quartile
#median: Median
#uq: Upper Quartile
#uw: Upper Whisker
#ol: Outlier (more likes than the upper whisker limit)
#If you don't have nltk, visit this link http://www.nltk.org/install.html and follow instructions to install it.
#Also, make sure you install at least the stopwords corpus, check out this link http://www.nltk.org/data.html to do so.
# This is a way of compiling text into a "wordbag" with each word set to "TRUE". I gave you the option of subtracting
#usermentions or hashtags from the text. Uncomment them below to subtract them from the wordbags.
import nltk
from nltk.corpus import stopwords
import re
stub = re.compile('[^A-Za-z]')
def bag_of_non_stopwords(text):
    """Turn a caption into a {word: True} feature bag.

    Tokens are stripped of non-letters and lowercased, English stopwords
    are removed, and URLs / very short words are dropped.  Uncomment the
    set subtractions below to also strip @usermentions or #hashtags.
    """
    cleaned = [stub.sub('', token).lower() for token in text.split()]
    usermentions = re.findall("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z_]+[A-Za-z0-9_]+)", text, re.I)
    tagmentions = re.findall("(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z_]+[A-Za-z0-9_]+)", text, re.I)
    finalwords = set(cleaned) - set(stopwords.words('english')) #- set(usermentions) #- set(tagmentions)
    featureset = {word: True for word in finalwords if not word.startswith('http') and len(word) > 2}
    return featureset
#Here is a function to create the actual formatted wordbags for the text files we created earlier. They should be words describing the
#Instagram photos with the same media id.
def extractphotofeatures(mediaid, textpath=''):
    """Build a {word: True} bag from the DeepBelief label file for a media id.

    Reads <textpath><mediaid>.jpg.txt, takes at most the first 25 lines,
    extracts the third tab-separated field of each (the label), strips
    non-letters and lowercases it, then keeps labels longer than two
    characters that are not URLs.

    textpath defaults to '' (current directory); change it, or pass it,
    to point at the directory where your .txt files live.  The original
    used a bare placeholder assignment (a syntax error) and leaked the
    file handle; both are fixed here.
    """
    stub = re.compile('[^A-Za-z]')
    with open(textpath + mediaid + '.jpg.txt', 'r') as labelfile:
        listy = labelfile.readlines()[:25]
    labels = []
    for text in listy:
        # Field index 2 carries the label word; drop anything non-alphabetic.
        labels.append([stub.sub('', w).lower() for w in text.split('\t')][2])
    featureset = dict([(word, True) for word in labels if not word.startswith('http') and len(word) > 2])
    return featureset
#Here we will finally create the list of wordbag features associated with the images and captions from the instagram posts,
#and also we will create an index to check our results. The wordbags will each be classified according to the quartile
#of like_counts for that post.
#(The original put a '#' comment in the middle of the sortintoquartiles call, which swallowed
# the quartile arguments and broke the syntax; the numbers now sit on the call line itself.)
likefeats = []
index = []
for m in user_media:
    if hasattr(m.caption, 'text'):
        caption = m.caption.text
        shown_caption = caption
    else:
        caption = 'nocaption.'
        shown_caption = 'nocaption'
    # Merge caption words and DeepBelief photo labels into one feature dict.
    feats = dict(bag_of_non_stopwords(caption).items() + extractphotofeatures(m.id).items())
    # Use your own quartile boundaries here (from the boxplot stats cell above).
    temptup = (feats, sortintoquartiles(m.like_count, 2575, 3941, 5330, 9462))
    likefeats.append(temptup)
    index.append([m.id, m.link, shown_caption, m.like_count, temptup])
#test to make sure these match up (bare expressions display in a notebook cell)
likefeats[5]
index[5]
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
# Hold out the last quarter for testing (Python 2 integer division is intentional here).
cutoff = len(likefeats)*3/4
trainfeats = likefeats[:cutoff]
testfeats = likefeats[cutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
# Shows which words are the strongest predictors of each like-count bin.
classifier.show_most_informative_features()
#By taking a new Instagram post from the same user, running the DeepBelief on the image file and adding the caption words, simply
#run this function now to make a guess at the possible popularity of the post.
# Note: entries of likefeats are (featureset, label) tuples, so pass only the
# featureset half to classify (the original broke the call with an inline comment).
classifier.classify(likefeats[0][0])
Thank you to Gilad Lotan, and the others whose code I "Frankensteined" to create this approach.