#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')
from pylab import *
import codecs,string,os,sys,os.path,glob,re


# The CLSTM command line tools take their training data in HDF5 files; the `clstmocrtrain` tool can also train directly from images saved on disk, as in ocropy, and we demonstrate both below. This notebook illustrates how to store images in an HDF5 file and then how to run the `clstmctc` training tool on the data (a hedged sketch of the packing step appears in the appendix at the end of this notebook).

# # The UW3-500 Dataset

# We illustrate loading data with the `uw3-500.tgz` dataset, available from `tmbdev.net`.

# In[2]:


get_ipython().system('test -f uw3-500.tgz || wget -nd http://www.tmbdev.net/ocrdata/uw3-500.tgz')


# Let's untar the file unless it has already been untarred.

# In[3]:


get_ipython().system('test -d book || tar -zxvf uw3-500.tgz')


# The UW3-500 dataset is a collection of text line images and corresponding ground truth transcriptions. It's organized as a directory tree of the form `book/<page>/<line>.bin.png` etc. (a sample line and its transcription are displayed in the appendix).

# In[4]:


get_ipython().system('ls book/0005/010001.*')


# Let's now run `clstmctc` training. Parameters are passed to the CLSTM command line tools as environment variables: here we report progress every 500 training steps, save the model every 10000 steps, and stop after `ntrain=11000` steps. Since we didn't dewarp or size-normalize the lines, we need to pass a `dewarp=center` argument to training.

# In[25]:


get_ipython().system('dewarp=center report_every=500 save_name=test save_every=10000 ntrain=11000 ../clstmctc uw3-500.h5')


# We can also train with `clstmocrtrain`, which reads the line images directly from disk. First, split the dataset into a test set (the first 100 lines of the reverse-sorted file list) and a training set (all remaining lines); an equivalent pure-Python split is sketched in the appendix.

# In[5]:


get_ipython().system('ls book/*/*.bin.png | sort -r > uw3.files')
get_ipython().system('sed 100q uw3.files > uw3-test.files')
get_ipython().system('sed 1,100d uw3.files > uw3-train.files')
get_ipython().system('wc -l uw3*.files')


# In[ ]:


get_ipython().system('params=1 save_name=uw3small save_every=1000 report_every=100 maxtrain=50000 test_every=1000 ../clstmocrtrain uw3-train.files uw3-test.files')
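

# # Appendix: Data Handling Sketches

# To sanity-check the data, we can display one line image together with its transcription. This is a minimal sketch: it assumes the ocropy-style layout in which the ground truth for `010001.bin.png` sits next to it as `010001.gt.txt` (compare the `ls` output above); adjust the extension if your copy of the dataset differs.

# In[ ]:


import codecs
from pylab import imread, imshow, gray

gray()
imshow(imread("book/0005/010001.bin.png"))  # the binarized line image
# the corresponding ground truth transcription (assumed .gt.txt extension)
print(codecs.open("book/0005/010001.gt.txt", encoding="utf-8").read().strip())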
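

# The HDF5 packing step mentioned in the introduction is not shown above. The cell below is a minimal sketch of one way to pack the line images and transcriptions with `h5py`. The dataset names (`images`, `images_dims`, `transcripts`), the flattened-float32 image encoding, and the `.gt.txt` extension are assumptions for illustration, not the verified `clstmctc` input schema; check the clstm sources for the exact layout it expects before relying on this.

# In[ ]:


import glob, codecs
import numpy as np
import h5py
from pylab import imread

def pack_lines(h5name, png_files):
    """Pack line images and their transcriptions into a single HDF5 file."""
    vfloat = h5py.special_dtype(vlen=np.dtype("float32"))
    vstr = h5py.special_dtype(vlen=str)
    with h5py.File(h5name, "w") as h5:
        n = len(png_files)
        images = h5.create_dataset("images", (n,), dtype=vfloat)        # assumed name
        dims = h5.create_dataset("images_dims", (n, 2), dtype="int32")  # assumed name
        texts = h5.create_dataset("transcripts", (n,), dtype=vstr)      # assumed name
        for i, fname in enumerate(png_files):
            img = imread(fname)
            if img.ndim == 3:
                img = img[:, :, 0]  # keep a single channel
            images[i] = img.astype("float32").ravel()  # store flattened pixels
            dims[i] = img.shape                        # keep the 2D shape alongside
            gt = fname.replace(".bin.png", ".gt.txt")  # assumed extension
            texts[i] = codecs.open(gt, encoding="utf-8").read().strip()

# pack_lines("uw3-500.h5", sorted(glob.glob("book/*/*.bin.png")))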
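

# For clarity, here is a pure-Python equivalent of the sed-based split above: `sed 100q` keeps the first 100 lines of the reverse-sorted list as the test set, and `sed 1,100d` drops those same lines to leave the training set, so the two files partition `uw3.files`.

# In[ ]:


import glob

files = sorted(glob.glob("book/*/*.bin.png"), reverse=True)  # mirrors ls | sort -r
with open("uw3-test.files", "w") as f:
    f.write("".join(line + "\n" for line in files[:100]))    # first 100 lines: test
with open("uw3-train.files", "w") as f:
    f.write("".join(line + "\n" for line in files[100:]))    # the rest: training
print(len(files[:100]), "test lines,", len(files[100:]), "training lines")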