In [86]:

import pandas as pd
import numpy as np
import pylab as pl
import PIL
from PIL import Image
import os
import base64
from StringIO import StringIO

In [87]:

from sklearn.decomposition import RandomizedPCA
from sklearn.neighbors import KNeighborsClassifier

In [88]:

%matplotlib inline

Importing Image Data

Create Processing Functions

In [89]:

#setup a standard image size; this will distort some images but will get everything into the same shape
STANDARD_SIZE = (300, 167)
def img_to_matrix(filename, verbose=False):
    """
    takes a filename and turns it into a numpy array of RGB pixels
    """
    img = PIL.Image.open(filename)
    if verbose==True:
        print "changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE))
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    return img

def flatten_image(img):
    """
    takes in an (m, n) numpy array and flattens it 
    into an array of shape (1, m * n)
    """
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]

Define the location of your data and import it

In [90]:

# TODO PATH TO YOUR DATA
img_dir = "/Users/hernamesbarbara/Desktop/img/"
images = [img_dir+ f for f in os.listdir(img_dir)]
labels = ["check" if "check" in f.split('/')[-1] else "drivers_license" for f in images]

data = []
for image in images:
    img = img_to_matrix(image)
    img = flatten_image(img)
    data.append(img)

data = np.array(data)
data

Out[90]:

array([[255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255]])

Creating Features

Define a training and test set

In [91]:

is_train = np.random.uniform(0, 1, len(data)) <= 0.7
y = np.where(np.array(labels)=="check", 1, 0)

train_x, train_y = data[is_train], y[is_train]
test_x, test_y = data[is_train==False], y[is_train==False]

RandomizedPCA to create features

Before we actually create our feature vectors, we're going to show a demo of RandomizedPCA in 2 dimensions. This makes it easy to plot high dimensional data

In [92]:

pca = RandomizedPCA(n_components=2)
X = pca.fit_transform(data)
df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label":np.where(y==1, "Check", "Drivers License")})
colors = ["red", "yellow"]
for label, color in zip(df['label'].unique(), colors):
    mask = df['label']==label
    pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
pl.legend()

Out[92]:

<matplotlib.legend.Legend at 0x112e256d0>

RandomizedPCA in 5 dimensions

Instead of 2 dimenisons, we're going to do RandomizedPCA in 5 dimensions. This will make it a bit harder to visualize, but it will make it easier for some of the classifiers to work with the dataset.

In [93]:

pca = RandomizedPCA(n_components=5)
train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)

This gives our classifier a nice set of tabular data that we can then use to train the model

In [94]:

train_x[:5]

Out[94]:

array([[ -2.12918920e+04,   1.04438570e+02,   3.34687547e+02,
         -3.34329786e-13],
       [  6.52578587e+03,   3.92348064e+03,  -7.99249275e+03,
          3.96863098e-13],
       [  6.95094940e+03,  -9.37127015e+03,   6.52066013e+02,
         -1.22346577e-13],
       [  7.81515669e+03,   5.34335094e+03,   7.00573919e+03,
          3.27363137e-13]])

We're going to be using a K-Nearest Neighbors classifier. Based on our set of training data, we're going to caclulate which training obersvations are closest to a given test point. Whichever class has the most votes wins.

In [95]:

knn = KNeighborsClassifier()
knn.fit(train_x, train_y)

Out[95]:

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           n_neighbors=5, p=2, weights='uniform')

In [96]:

pd.crosstab(test_y, knn.predict(test_x), rownames=["Actual"], colnames=["Predicted"])

Out[96]:

Predicted	0
Actual
0	2

In [97]:

STANDARD_SIZE

Out[97]:

(300, 167)

In [104]:

# string_to_img(new_image)

In [99]:

def string_to_img(image_string):
    print "called string_to_image"
    #we need to decode the image from base64
    image_string = base64.decodestring(image_string)
    #since we're seing this as a JSON string, we use StringIO so it acts like a file
    img = StringIO(image_string)
    img = PIL.Image.open(img)
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return pca.transform(img_wide[0])

In [100]:

def classify_image(data):
    print "called classify_image"
    preds = knn.predict(data)
    preds = np.where(preds==1, "check", "drivers_license")
    pred = preds[0]
    return {"image_label": pred}

In [101]:

from yhat import Yhat, YhatModel, preprocess

In [102]:

class ImageClassifier(YhatModel):
    REQUIREMENTS = [
      "PIL==1.1.7"
    ]
    
    def execute(self, data):
        print "called execute"
        img_string = data.get("image_as_base64_string", None)
        if img_string is None:
            return {"status": "error", "message": "data was None", "input_data": data}
        else:
            img = string_to_img(img_string)
            pred = classify_image(img)
            return pred

In [103]:

# authenticate
yh = Yhat("USERNAME", "YOUR API KEY", "http://cloud.yhathq.com/")
 
# upload model to yhat
yh.deploy("ImageClassifier", ImageClassifier, globals())

Are you sure you want to deploy? (y/N): y
Uploading model data
[----------------------------------------------------------------->] 10718/10718 KiB
Model uploaded

Out[103]:

{u'lang': u'python',
 u'message': u'Your model has been uploaded and is currently being built. You can check the status by logging into your Yhat account and viewing: /model/ImageClassifier/',
 u'model_endpoint': u'/austin/models/ImageClassifier/',
 u'modelname': u'ImageClassifier',
 u'status': u'success',
 u'timestamp': u'20150415074208',
 u'version': 12}

In [105]:

# i don't have the image data set any more 
# so just some dummy data to get it to work :(
new_image = open("/Users/hernamesbarbara/Desktop/img/1-plot-iris.png", 'rb').read()

#we need to make the image JSON serializeable
new_image = base64.encodestring(new_image)

yh.predict("ImageClassifier", {"image_as_base64_string": new_image})

Out[105]:

{u'result': {u'image_label': u'drivers_license'},
 u'version': u'eb6cbff',
 u'yhat_id': u'b90186db-450e-40e0-9b53-454a16f6fe2a',
 u'yhat_model': u'ImageClassifier'}

In [ ]: