#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com)
#
# https://github.com/rasbt/python-machine-learning-book
#
# [MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)
# # Python Machine Learning - Code Examples
# # Chapter 9 - Embedding a Machine Learning Model into a Web Application
# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
# In[1]:
# Load the optional `watermark` IPython extension and print author/date/package
# versions for reproducibility (safe to skip if watermark is not installed).
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,nltk,sklearn")
# *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.*
#
#
# ### Overview
# - [Chapter 8 recap - Training a model for movie review classification](#Chapter-6-recap---Training-a-model-for-movie-review-classification)
#
# - [Serializing fitted scikit-learn estimators](#Serializing-fitted-scikit-learn-estimators)
# - [Setting up a SQLite database for data storage Developing a web application with Flask](#Setting-up-a-SQLite-database-for-data-storage-Developing-a-web-application-with-Flask)
# - [Our first Flask web application](#Our-first-Flask-web-application)
# - [Form validation and rendering](#Form-validation-and-rendering)
# - [Turning the movie classifier into a web application](#Turning-the-movie-classifier-into-a-web-application)
# - [Deploying the web application to a public server](#Deploying-the-web-application-to-a-public-server)
# - [Updating the movie review classifier](#Updating-the-movie-review-classifier)
# - [Summary](#Summary)
# The code for the Flask web applications can be found in the following directories:
#
# - `1st_flask_app_1/`: A simple Flask web app
# - `1st_flask_app_2/`: `1st_flask_app_1` extended with flexible form validation and rendering
# - `movieclassifier/`: The movie classifier embedded in a web application
# - `movieclassifier_with_update/`: same as `movieclassifier` but with update from sqlite database upon start
# To run the web applications locally, `cd` into the respective directory (as listed above) and execute the main-application script, for example,
#
# cd ./1st_flask_app_1
# python3 app.py
#
# Now, you should see something like
#
# * Running on http://127.0.0.1:5000/
# * Restarting with reloader
#
# in your terminal.
# Next, open a web browser and enter the address displayed in your terminal (typically http://127.0.0.1:5000/) to view the web application.
# **Link to a live example application built with this tutorial: http://raschkas.pythonanywhere.com/**.
#
#
# In[2]:
from IPython.display import Image
# # Chapter 8 recap - Training a model for movie review classification
# This section is a recap of the logistic regression model that was trained in the last section of Chapter 6. Execute the following code blocks to train a model that we will serialize in the next section.
# In[3]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# English stop-word list used by tokenizer() below; requires the NLTK
# 'stopwords' corpus to be downloaded (nltk.download('stopwords')).
stop = stopwords.words('english')
# Porter stemmer instance kept for parity with Chapter 8; not referenced
# by tokenizer() in this file.
porter = PorterStemmer()
def tokenizer(text):
    """Strip HTML, preserve emoticons, and return non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Lower-cased word tokens with stop words (module-level ``stop``
        list) removed; emoticons are appended at the end with any
        "nose" hyphen stripped (e.g. ``:-)`` becomes ``:)``).
    """
    # Raw strings for all regex patterns: the original non-raw '\)' / '\W'
    # escapes are invalid string escapes and warn on modern Python.
    # Remove HTML markup such as <br /> tags.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons like :), ;-(, =D before punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # Replace runs of non-word characters with a space, then re-append
    # the captured emoticons (hyphen-noses removed).
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    # Drop common English stop words.
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized
def stream_docs(path):
    """Yield one ``(review_text, label)`` pair at a time from a CSV file.

    The file is parsed positionally rather than with the ``csv`` module:
    each data line is assumed to end with ``,<label>\\n`` where ``<label>``
    is a single digit (0 or 1), so the review text is everything before
    the last three characters.

    Parameters
    ----------
    path : str
        Path to the movie_data.csv file; the header line is skipped.

    Yields
    ------
    tuple of (str, int)
    """
    # Explicit UTF-8: review texts contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8. The local
    # name also no longer shadows the stdlib `csv` module.
    with open(path, 'r', encoding='utf-8') as infile:
        next(infile)  # skip the header row
        for line in infile:
            # line[:-3] drops ',<label>\n'; line[-2] is the label digit.
            text, label = line[:-3], int(line[-2])
            yield text, label
# In[4]:
next(stream_docs(path='./movie_data.csv'))
# ### Note
#
# The pickling-section may be a bit tricky so that I included simpler test scripts in this directory (pickle-test-scripts/) to check if your environment is set up correctly. Basically, it is just a trimmed-down version of the relevant sections from Ch08, including a very small movie_review_data subset.
#
# Executing
#
# python pickle-dump-test.py
#
# will train a small classification model from the `movie_data_small.csv` and create the 2 pickle files
#
# stopwords.pkl
# classifier.pkl
#
# Next, if you execute
#
# python pickle-load-test.py
#
# You should see the following 2 lines as output:
#
# Prediction: positive
# Probability: 85.71%
#
# ### Note
#
# If you haven't created the `movie_data.csv` file in the previous chapter, you can download a zip archive at
# https://github.com/rasbt/python-machine-learning-book/tree/master/code/datasets/movie
#
# In[5]:
def get_minibatch(doc_stream, size):
    """Collect up to `size` (document, label) pairs from `doc_stream`.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples, e.g. from stream_docs().
    size : int
        Maximum number of examples to pull from the stream.

    Returns
    -------
    (docs, y) : (list of str, list of int) or (None, None)
        The collected minibatch. ``(None, None)`` is returned only when the
        stream is exhausted before a single example could be read, so callers
        can keep testing ``if not X_train: break``. A partially filled final
        batch is returned instead of being discarded (the original version
        silently dropped those trailing examples).
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Stream ran dry: keep what we have; signal exhaustion only if empty.
        if not docs:
            return None, None
    return docs, y
# In[6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# HashingVectorizer is stateless (no fitted vocabulary), which makes it
# suitable for out-of-core learning and lets the web app re-create it
# instead of pickling it.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# loss='log' trains a logistic regression model via SGD.
# NOTE(review): 'n_iter' and loss='log' match the scikit-learn version used
# by the book; recent releases renamed these to 'max_iter' and 'log_loss'.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')
# In[7]:
import pyprind
# Out-of-core training: 45 minibatches of 1,000 documents each (45,000 docs),
# with a progress bar over the 45 iterations.
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])  # partial_fit needs the full label set up front
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break  # stream exhausted early
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
# In[8]:
# Evaluate on the remaining 5,000 held-out documents from the stream.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
# In[9]:
# Fold the held-out documents into the model as well before serializing it.
clf = clf.partial_fit(X_test, y_test)
#
#
# # Serializing fitted scikit-learn estimators
# After we trained the logistic regression model as shown above, we now save the classifier along with the stop words, Porter Stemmer, and `HashingVectorizer` as serialized objects to our local disk so that we can use the fitted classifier in our web application later.
# In[10]:
import pickle
import os
# Serialize the stop-word list and the fitted classifier into
# movieclassifier/pkl_objects/ so the web application can load them later.
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
# protocol=4 requires Python 3.4+. The HashingVectorizer is NOT pickled;
# being stateless, it is re-created in vectorizer.py (next cell).
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
# Next, we save the `HashingVectorizer` as in a separate file so that we can import it later.
# In[11]:
get_ipython().run_cell_magic('writefile', 'movieclassifier/vectorizer.py', "from sklearn.feature_extraction.text import HashingVectorizer\nimport re\nimport os\nimport pickle\n\ncur_dir = os.path.dirname(__file__)\nstop = pickle.load(open(\n os.path.join(cur_dir, \n 'pkl_objects', \n 'stopwords.pkl'), 'rb'))\n\ndef tokenizer(text):\n text = re.sub('<[^>]*>', '', text)\n emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)',\n text.lower())\n text = re.sub('[\\W]+', ' ', text.lower()) \\\n + ' '.join(emoticons).replace('-', '')\n tokenized = [w for w in text.split() if w not in stop]\n return tokenized\n\nvect = HashingVectorizer(decode_error='ignore',\n n_features=2**21,\n preprocessor=None,\n tokenizer=tokenizer)\n")
# After executing the preceding code cells, we can now restart the IPython notebook kernel to check if the objects were serialized correctly.
# First, change the current Python directory to `movieclassifer`:
# In[12]:
import os
# Work from inside movieclassifier/ so the relative paths below resolve.
os.chdir('movieclassifier')
# In[13]:
import pickle
import re
import os
# Importing vect re-creates the HashingVectorizer (written to vectorizer.py above).
from vectorizer import vect
# NOTE(review): unpickling executes arbitrary code -- only load pickle files
# you created yourself.
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))
# In[14]:
import numpy as np
# Map integer class labels back to human-readable sentiment strings.
label = {0:'negative', 1:'positive'}
example = ['I love this movie']
X = vect.transform(example)
# predict_proba returns per-class probabilities; report the winning class's.
print('Prediction: %s\nProbability: %.2f%%' %\
(label[clf.predict(X)[0]], clf.predict_proba(X).max()*100))
#
#
# # Setting up a SQLite database for data storage
# Before you execute this code, please make sure that you are currently in the `movieclassifier` directory.
# In[15]:
import sqlite3
import os
# Start from a clean database every time this cell is re-run.
if os.path.exists('reviews.sqlite'):
    os.remove('reviews.sqlite')
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')
# Insert two example rows; '?' placeholders parameterize the values safely.
example1 = 'I love this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()
# In[16]:
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
# Fetch every row inserted since the start of 2015 (i.e., everything above).
c.execute("SELECT * FROM review_db WHERE date BETWEEN '2015-01-01 10:10:10' AND DATETIME('now')")
results = c.fetchall()
conn.close()
# In[17]:
print(results)
# In[18]:
Image(filename='../images/09_01.png', width=700)
#
# # Developing a web application with Flask
# ...
# ## Our first Flask web application
# Directory structure:
#
# 1st_flask_app_1/
# app.py
# templates/
# first_app.html
#
# In[3]:
# Print the source of the minimal Flask app and its template.
get_ipython().system('cat 1st_flask_app_1/app.py')
# In[4]:
get_ipython().system('cat 1st_flask_app_1/templates/first_app.html')
# ## Form validation and rendering
# In[19]:
# Display chapter figures 09_02 and 09_03 inline.
Image(filename='../images/09_02.png', width=400)
# In[20]:
Image(filename='../images/09_03.png', width=400)
# Directory structure:
#
# 1st_flask_app_2/
# app.py
# static/
# style.css
# templates/
# _formhelpers.html
# first_app.html
# hello.html
# In[7]:
# Print the source of the extended Flask app and its form-rendering macro.
get_ipython().system('cat 1st_flask_app_2/app.py')
# In[8]:
get_ipython().system('cat 1st_flask_app_2/templates/_formhelpers.html')
#
#
# # Turning the movie classifier into a web application
# In[21]:
# Display chapter figures 09_04 through 09_07 inline.
Image(filename='../images/09_04.png', width=400)
# In[22]:
Image(filename='../images/09_05.png', width=400)
# In[23]:
Image(filename='../images/09_06.png', width=400)
# In[24]:
Image(filename='../images/09_07.png', width=200)
# In[9]:
# Print the source files of the movie-classifier web application.
get_ipython().system('cat ./movieclassifier/app.py')
# In[10]:
get_ipython().system('cat ./movieclassifier/templates/reviewform.html')
# In[11]:
get_ipython().system('cat ./movieclassifier/templates/results.html')
# In[12]:
get_ipython().system('cat ./movieclassifier/static/style.css')
# In[13]:
get_ipython().system('cat ./movieclassifier/templates/thanks.html')
#
#
# # Deploying the web application to a public server
# In[25]:
Image(filename='../images/09_08.png', width=600)
#
#
# ## Updating the movie review classifier
# Change current directory to `movieclassifier`:
# Define a function to update the classifier with the data stored in the local SQLite database:
# In[26]:
# Dependencies for updating the pickled classifier from the SQLite reviews DB.
import pickle
import sqlite3
import numpy as np
# import HashingVectorizer from local dir
from vectorizer import vect
def update_model(db_path, model, batch_size=10000):
    """Incrementally update `model` with all reviews stored in the SQLite DB.

    Parameters
    ----------
    db_path : str
        Path to the reviews.sqlite database; table ``review_db`` has
        columns (review TEXT, sentiment INTEGER, date TEXT).
    model : classifier supporting ``partial_fit``
        The fitted classifier to update in place.
    batch_size : int, optional
        Number of rows to fetch and train on per iteration (default 10000).

    Returns
    -------
    None
        The classifier is updated in place.
    """
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * from review_db')

    # partial_fit must be told the full label set on every call; hoisted
    # out of the loop since it never changes.
    classes = np.array([0, 1])
    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        X = data[:, 0]              # review-text column
        y = data[:, 1].astype(int)  # sentiment-label column
        X_train = vect.transform(X)
        # Bug fix: train the `model` argument, not the global `clf`.
        # The original body ignored its `model` parameter entirely.
        model.partial_fit(X_train, y, classes=classes)
        results = c.fetchmany(batch_size)
    conn.close()
    return None
# Update the model:
# In[27]:
# Load the previously pickled classifier and fold in the reviews accumulated
# in the SQLite database, then (optionally) re-pickle the updated model.
cur_dir = '.'
# Use the following path instead if you embed this code into
# the app.py file
# import os
# cur_dir = os.path.dirname(__file__)
clf = pickle.load(open(os.path.join(cur_dir,
                                    'pkl_objects',
                                    'classifier.pkl'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')
update_model(db_path=db, model=clf, batch_size=10000)
# Uncomment the following lines to update your classifier.pkl file
# pickle.dump(clf, open(os.path.join(cur_dir,
#             'pkl_objects', 'classifier.pkl'), 'wb')
#             , protocol=4)
# In[14]:
get_ipython().system('cat ./movieclassifier_with_update/update.py')
#
#
# # Summary
#
# ...
#