#!/usr/bin/env python
# coding: utf-8

# # AlgebraNet - Learning Algebra from Scratch
#
# *This notebook first appeared as a [blog post](//betatim.github.io/posts/algebra-from-scratch) on [Tim Head](//betatim.github.io)'s blog.*
#
# *License: [MIT](http://opensource.org/licenses/MIT)*
#
# *(C) 2016, Tim Head.*
# *Feel free to use, distribute, and modify with the above attribution.*

# In this post a computer will learn basic algebra, from scratch! Not excited? Keep reading.
# This is an introduction to basic sequence-to-sequence learning using a
# [Long short-term memory](https://en.wikipedia.org/wiki/Long_short-term_memory) (LSTM) network.
#
# Sequences of varying length are everywhere around us. Yet most machine-learning problems
# are presented as "here are five features describing each instance, please predict these three
# properties", or some variation on this. That is not a particularly good fit for
# sequences of varying length such as sentences, sound recordings, or time series data. In
# addition, the order in which things occur in a sequence carries meaning. Let's take
# advantage of all that structure!
#
# Imagine trying to shoehorn the task of translating English sentences to German sentences into
# a classical $N$ features and $M$ outputs setup. A bit of a nightmare. This is where
# sequence-to-sequence learning comes in. You feed in a sequence and produce a new sequence,
# potentially of a different length.
#
# You can use this for translating English to German, generating captions for images, or,
# as we will do here, learning how to do basic algebra.
#
#
# ## The Problem
#
# Given a string of characters representing a math problem, like `"3141+42"`, we would like
# to generate a string of characters representing the correct solution: `"3183"`. Our
# network will learn how to do addition, subtraction, multiplication and division.
#
# The important
# part is that we will not first use our human intelligence to break the string up into
# integers and a mathematical operator. We want the computer to figure all that out by
# itself.
#
# Let's go! This post will use the excellent [`keras`](https://keras.io/) deep-learning
# library and is based on one of its [examples](https://github.com/fchollet/keras/blob/master/examples/addition_rnn.py).
#
# First some imports, and then we generate examples.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format='retina'")

# In[2]:

import operator

import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["font.size"] = 14

from sklearn.cross_validation import train_test_split
from sklearn.utils import check_random_state

from keras.models import Sequential
from keras.layers import Dense, RepeatVector, TimeDistributed, Activation
from keras.layers.recurrent import LSTM
from keras.optimizers import SGD

# In[3]:

# the supported operators and the full alphabet: digits, operators,
# the decimal point and a white-space padding character
OPS = ['+', '*', '-', '/']
ABC = '1234567890' + ''.join(OPS) + '. '

# ## Creating Examples
#
# Using simple algebra problems like this demonstrates all the basics of
# sequence-to-sequence learning and allows us to generate a lot of data
# very quickly (compared to, say, audio recordings).
#
# We need a way to represent a sequence of characters as a sequence of
# binary vectors; this is what `encode` and `decode` take care of. Each
# character is [one-hot encoded](https://en.wikipedia.org/wiki/One-hot). The
# `make_maths_problem` function uses them to generate a large set of
# examples.
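# Before defining the real helpers, here is a hand-worked miniature of the
# one-hot idea over a toy four-character alphabet (a sketch only: the
# `toy_abc` and `toy_one_hot` names are made up for this illustration and are
# not used anywhere else in the post). Each character of the alphabet gets its
# own column, and each position of the string becomes a row with a single 1 in
# the column of that character.

# In[ ]:

import numpy as np

toy_abc = '+123'
toy_char_idx = {c: i for i, c in enumerate(toy_abc)}

def toy_one_hot(s):
    """One-hot encode `s` as a (len(s), len(toy_abc)) binary array."""
    out = np.zeros((len(s), len(toy_abc)))
    for row, char in enumerate(s):
        out[row, toy_char_idx[char]] = 1
    return out

toy_one_hot('1+2')
# expected result:
# array([[ 0.,  1.,  0.,  0.],    <- '1'
#        [ 1.,  0.,  0.,  0.],    <- '+'
#        [ 0.,  0.,  1.,  0.]])   <- '2'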
# In[12]:

def encode(characters):
    """One-hot encode a string as a 2-D binary array."""
    char_idx = dict((c, i) for i, c in enumerate(ABC))
    X_ = np.zeros((len(characters), len(ABC)))
    for i, c in enumerate(characters):
        try:
            X_[i, char_idx[c]] = 1
        except KeyError:
            # a character that is not part of the alphabet slipped through
            print('chars:', characters)
            print(i, c, X_.shape)
            raise
    return X_

def decode(X, calc_argmax=True):
    """Decode an array of one-hot vectors (or probabilities) back to a string."""
    idx_char = dict((i, c) for i, c in enumerate(ABC))
    if calc_argmax:
        X = X.argmax(axis=-1)
    return ''.join(idx_char[i] for i in X)

# Generate data
def make_maths_problem(n_samples=1000, n_digits=3, invert=True, random_state=None):
    rng = check_random_state(random_state)
    X_ = []
    y_ = []
    seen = set()

    assert all(op in ABC for op in OPS), "Not all operations are in the alphabet"

    math_op = {'+': operator.add, '-': operator.sub,
               '*': operator.mul, '/': operator.truediv}
    # the + 1 accounts for the character representing the operator
    max_len = 2 * n_digits + 1

    while len(X_) < n_samples:
        a, b = rng.randint(10**n_digits, size=2)
        op = rng.choice(OPS)

        a_op_b = "%i%s%i" % (a, op, b)
        if a_op_b not in seen:
            seen.add(a_op_b)
            answer = math_op[op](a, b)
            # skip problems, like division by zero, that have no finite answer
            if answer == np.inf or np.isnan(answer):
                continue

            if invert:
                X_.append(a_op_b.ljust(max_len)[::-1])
            else:
                X_.append(a_op_b.ljust(max_len))
            # make sure the answer string is white-space padded but no longer
            # than the maximum length (it could be longer when representing 1/3)
            y_.append(str(answer).ljust(2 * n_digits)[:2 * n_digits])

    X = np.zeros((n_samples, max_len, len(ABC)), dtype=np.bool)
    y = np.zeros((n_samples, 2 * n_digits, len(ABC)), dtype=np.bool)
    for i, x in enumerate(X_):
        X[i] = encode(x)
    for i, x in enumerate(y_):
        y[i] = encode(x)

    return (X, y)

# ## Encoding
#
# An algebra problem like `"23+42"` would be encoded as a sequence of five
# arrays, one for each character.

# In[5]:

encode("23+42")

# Let's try out our math-problem generator and check that we can correctly
# round-trip problems:

# In[14]:

X, y = make_maths_problem(10, n_digits=3)

for i in range(10):
    # undo question reversal
    q = decode(X[i][::-1])
    correct = decode(y[i])
    print(q, '=', correct)

# Looks like it works.
#
#
# ## Big Data
#
# To train the model we need more than ten examples, so we generate a big set of
# problems and set some of them aside as a test set that
# we can use to evaluate the performance of the model.

# In[17]:

X, y = make_maths_problem(2*50000, n_digits=3, random_state=234)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# ## The Model
#
# We use a single LSTM to encode the input sequence into a fixed-length
# state vector. This vector is then decoded by a second LSTM to produce
# the output sequence. The final layer is a softmax layer that picks which
# character to place at each position of the output sequence.

# In[18]:

def rnn_model(hidden_size=128, n_digits=3, abc_size=len(ABC)):
    model = Sequential()
    # encoder
    model.add(LSTM(hidden_size, input_shape=(None, abc_size)))
    # input for the decoder
    # this sets the length of the output sequence and has to
    # match the length we used when constructing the examples
    model.add(RepeatVector(2 * n_digits))
    # decoder model
    model.add(LSTM(hidden_size, return_sequences=True))
    # for each step of the output sequence, decide which character should be chosen
    model.add(TimeDistributed(Dense(abc_size)))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
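# As a quick, optional sanity check of the architecture (an aside, not needed
# for training), we can instantiate the model and print Keras' layer summary.
# It makes the shapes explicit: the encoder LSTM collapses the whole input
# sequence into a single 128-dimensional vector, `RepeatVector` copies that
# vector once per output character, and the `TimeDistributed` softmax scores
# every character of the alphabet at each of the six output positions.

# In[ ]:

rnn_model().summary()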
# ## Training
#
# Let's train the model for a few epochs and see how it performs afterwards. Every ten epochs
# we will print out ten test math problems for manual inspection. By reusing the same ten
# examples we can see whether the network is making progress.

# In[22]:

BATCH_SIZE = 128

model = rnn_model()

# use the same 10 examples for all iterations
n_test = 10
idx = np.random.randint(len(X_test), size=n_test)
Xs, ys = X_test[idx], y_test[idx]

for iteration in range(3):
    print()
    print('-' * 50)
    print('After', 10 * (iteration + 1), 'epochs:')
    model.fit(X_train, y_train,
              batch_size=BATCH_SIZE,
              nb_epoch=10,
              verbose=False,
              validation_data=(X_test, y_test))

    probas = model.predict_proba(Xs, verbose=0)
    for i in range(n_test):
        # undo question reversal
        q = decode(Xs[i][::-1])
        correct = decode(ys[i])
        guess = decode(probas[i])
        print(q, '=', correct, '=?', guess)

# In[24]:

def fractional_difference(n_test=200):
    """Plot a histogram of the relative error on `n_test` random test problems."""
    idx = np.random.randint(len(X_test), size=n_test)
    Xs, ys = X_test[idx], y_test[idx]

    probas = model.predict_proba(Xs, verbose=0)

    # decode predictions back to strings and interpret them as numbers
    true_ = np.array([float(decode(ys[i])) for i in range(n_test)])
    predicted_ = np.array([float(decode(probas[i])) for i in range(n_test)])

    _ = plt.hist((true_ - predicted_) / true_, range=(-0.1, 0.1), bins=20)
    plt.xlabel('(true-guess)/true')
    plt.xlim(-0.11, 0.11)

fractional_difference()

# It looks as if after such a short amount of training we get within roughly 5% of the right
# answer, and frequently even closer. From monitoring the loss on an evaluation set I think
# this network would continue to improve if trained for a larger number of epochs.
# A complementary way to score the model, counting exact string matches, is sketched
# in the appendix at the end of the post.
#
#
# ## Conclusion
#
# While the network does not perform perfectly yet, this illustrates how to build a network
# that performs sequence-to-sequence learning. Without any knowledge about algebra, or even
# digits, it learnt to do reasonably well on basic math problems. It even figured out how to
# use the decimal point!
#
# Some next steps could be:
#
# * generating words instead of numbers, so the network would output `"thirtytwo"`
#   instead of `"32"`;
# * using images of digits as input instead of strings;
# * using voice as input or output; or
# * solving a completely different problem based around sequences.
#
# Get in touch on twitter @[betatim](//twitter.com/betatim) if you have questions or comments.
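# ## Appendix: Exact-Match Accuracy
#
# As a complement to the fractional-difference histogram above, here is a
# minimal sketch of an exact-match score, assuming the trained `model` and the
# `X_test`/`y_test` split from earlier are still in scope. The helper name
# `exact_match_accuracy` is made up for this sketch; it counts how often the
# decoded prediction matches the true answer character for character, ignoring
# the white-space padding.

# In[ ]:

def exact_match_accuracy(n_test=200):
    """Fraction of sampled test problems answered exactly right."""
    idx = np.random.randint(len(X_test), size=n_test)
    Xs, ys = X_test[idx], y_test[idx]
    probas = model.predict_proba(Xs, verbose=0)
    hits = sum(decode(probas[i]).strip() == decode(ys[i]).strip()
               for i in range(n_test))
    return hits / n_test

exact_match_accuracy()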