The code is exercising the recently introduced 'contains' function in numexpr so it is shown how fast these queries can be.
The data is based in Google Books Ngrams, whose description can be found here and the files can be downloaded from here.
In particular, the dataset used is the "English One Million", and you can download it from a bash shell with a command like:
for i in `seq 0 199`; do wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-1M-3gram-20090715-$i.csv.zip; done
However, in order to keep the time for running the script reasonable, only the 2 first files will be downloaded, so the next command has been used here:
for i in `seq 0 1`; do wget http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-1M-3gram-20090715-$i.csv.zip; done
which will be stored in the 'eng-1M-3gram' directory.
import glob
import os.path
import zipfile
import blz
import numpy as np
MAXLINES=100*1000*1000 # a protection against loading too many lines
rawdir = 'eng-1M-3gram' # the directory where all the .zip files are
def create_blz(blzname):
# Read just the the -0 and -1 files
zfiles = glob.glob(os.path.join(rawdir, '*-[0-1].csv.zip'))
# Create the container
dt = np.dtype([('ngram', 'S64'), ('year', 'i2'), ('occur', 'i2'),
('pages', 'i2'), ('books', 'i2')])
bt = blz.btable(np.empty(0, dtype=dt), mode='w',
expectedlen=MAXLINES,
rootdir=blzname)
lines = 0
errs = 0
for zfname in zfiles:
print("zfname:", zfname)
with zipfile.ZipFile(zfname, 'r') as myzip:
fname = zfname[len(rawdir)+1:-4] # remove the .zip extension
csvfile = myzip.open(fname, 'rU')
try:
for row in csvfile:
row = row[:-1].split('\t')
if len(row) == len(dt):
bt.append(row)
lines += 1
if lines >= MAXLINES:
break
except:
errs += 1
# Ignore all the errors
print "Offending line:", lines
bt.flush()
pass
bt.flush()
print "bt:", repr(bt)
# Some statistics
print "lines: %d, errs: %d" % (lines, errs)
# Create the BLZ database if necessary
blzname = rawdir+'.blz'
if not os.path.exists(blzname):
bt = create_blz(blzname)
('zfname:', 'eng-1M-3gram/googlebooks-eng-1M-3gram-20090715-0.csv.zip') ('zfname:', 'eng-1M-3gram/googlebooks-eng-1M-3gram-20090715-1.csv.zip') bt: btable((42561686,), [('ngram', 'S64'), ('year', '<i2'), ('occur', '<i2'), ('pages', '<i2'), ('books', '<i2')]) nbytes: 2.85 GB; cbytes: 143.26 MB; ratio: 20.40 bparams := bparams(clevel=5, shuffle=True, cname=blosclz) rootdir := 'eng-1M-3gram.blz' [('! ! That', 1731, 1, 1, 1) ('! ! That', 1791, 2, 2, 2) ('! ! That', 1802, 1, 1, 1) ..., ('"canal ', 1956, 11, 11, 11) ('"canal ', 1957, 6, 6, 5) ('"canal ', 1958, 9, 9, 9)] lines: 42561688, errs: 0
# Open the file
bt = blz.open(blzname)
print "bt:", bt
bt: [('! ! That', 1731, 1, 1, 1) ('! ! That', 1791, 2, 2, 2) ('! ! That', 1802, 1, 1, 1) ..., ('"canal ', 1956, 11, 11, 11) ('"canal ', 1957, 6, 6, 5) ('"canal ', 1958, 9, 9, 9)]
# Do some queries
print "results:", list(bt.where("contains(ngram, 'Eldest')"))
results: [row(ngram='The Eldest Daughter', year=1811, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1856, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1860, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1861, occur=3, pages=3, books=3), row(ngram='The Eldest Daughter', year=1863, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1867, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1873, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1881, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1883, occur=4, pages=4, books=2), row(ngram='The Eldest Daughter', year=1884, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1885, occur=4, pages=4, books=4), row(ngram='The Eldest Daughter', year=1887, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1889, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1891, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1900, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1901, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1902, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1908, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1909, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1910, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1914, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1916, occur=2, pages=2, books=1), row(ngram='The Eldest Daughter', year=1917, occur=14, pages=12, books=2), row(ngram='The Eldest Daughter', year=1918, occur=7, pages=7, books=4), row(ngram='The Eldest Daughter', year=1923, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1942, occur=4, pages=4, books=4), row(ngram='The Eldest Daughter', year=1944, occur=2, pages=2, books=2), row(ngram='The Eldest Daughter', year=1954, occur=13, pages=8, books=1), row(ngram='The Eldest Daughter', year=1955, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1957, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1983, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=1986, occur=1, pages=1, books=1), row(ngram='The Eldest Daughter', year=2006, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1824, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1829, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1839, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1840, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1842, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1846, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1847, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1848, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1850, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1852, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1855, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1863, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1864, occur=3, pages=3, books=3), row(ngram='""" Eldest of"', year=1865, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1866, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1869, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1872, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1875, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1876, occur=3, pages=3, books=3), row(ngram='""" Eldest of"', year=1880, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1883, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1884, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1885, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1892, occur=3, pages=3, books=3), row(ngram='""" Eldest of"', year=1908, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1912, occur=4, pages=4, books=4), row(ngram='""" Eldest of"', year=1916, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1920, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1921, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1922, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1923, occur=3, pages=3, books=3), row(ngram='""" Eldest of"', year=1927, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1928, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1929, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1932, occur=4, pages=4, books=4), row(ngram='""" Eldest of"', year=1933, occur=6, pages=6, books=1), row(ngram='""" Eldest of"', year=1934, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1936, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1940, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1944, occur=3, pages=3, books=3), row(ngram='""" Eldest of"', year=1948, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1954, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1965, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1967, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1976, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1977, occur=2, pages=2, books=2), row(ngram='""" Eldest of"', year=1991, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1998, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=1999, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=2002, occur=5, pages=5, books=2), row(ngram='""" Eldest of"', year=2003, occur=1, pages=1, books=1), row(ngram='""" Eldest of"', year=2007, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1851, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1861, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1863, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1868, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1871, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1872, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1873, occur=8, pages=8, books=8), row(ngram='as the Eldest', year=1874, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1875, occur=6, pages=6, books=6), row(ngram='as the Eldest', year=1877, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1880, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1882, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1889, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1891, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1892, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1893, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1897, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1899, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1900, occur=5, pages=5, books=4), row(ngram='as the Eldest', year=1901, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1902, occur=4, pages=4, books=3), row(ngram='as the Eldest', year=1903, occur=8, pages=8, books=8), row(ngram='as the Eldest', year=1905, occur=2, pages=1, books=1), row(ngram='as the Eldest', year=1906, occur=4, pages=4, books=4), row(ngram='as the Eldest', year=1907, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1909, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1910, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1911, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1912, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1913, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1919, occur=4, pages=4, books=4), row(ngram='as the Eldest', year=1921, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1924, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1925, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1929, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1936, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1937, occur=3, pages=3, books=3), row(ngram='as the Eldest', year=1938, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1940, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1945, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1949, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1950, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1951, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1953, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1954, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1955, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1958, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1960, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1961, occur=2, pages=2, books=1), row(ngram='as the Eldest', year=1979, occur=3, pages=3, books=1), row(ngram='as the Eldest', year=1980, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1987, occur=3, pages=3, books=1), row(ngram='as the Eldest', year=1989, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1990, occur=4, pages=3, books=2), row(ngram='as the Eldest', year=1991, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1995, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=1997, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1998, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=1999, occur=2, pages=2, books=2), row(ngram='as the Eldest', year=2002, occur=1, pages=1, books=1), row(ngram='as the Eldest', year=2005, occur=2, pages=1, books=1), row(ngram='as the Eldest', year=2008, occur=1, pages=1, books=1)]
# Do a timing
%time len(list(bt.where("contains(ngram, 'Eldest')")))
CPU times: user 7.88 s, sys: 248 ms, total: 8.13 s Wall time: 3.39 s
147
%time len([i for i in bt if 'Eldest' in i[0]])
CPU times: user 43.3 s, sys: 443 ms, total: 43.8 s Wall time: 42.6 s
147
%time list(bt.where("(contains(ngram, 'Einstein')) & (year < 1920)"))
CPU times: user 9.01 s, sys: 250 ms, total: 9.26 s Wall time: 4.22 s
[row(ngram=". Einstein's general", year=1905, occur=6, pages=6, books=5), row(ngram=". Einstein's general", year=1916, occur=1, pages=1, books=1), row(ngram=". Einstein's general", year=1919, occur=2, pages=2, books=2), row(ngram='Einstein equation to', year=1899, occur=1, pages=1, books=1), row(ngram='Einstein published his', year=1905, occur=8, pages=8, books=6), row(ngram='Einstein published his', year=1914, occur=2, pages=2, books=2), row(ngram="Einstein's equation was", year=1905, occur=2, pages=2, books=2), row(ngram='and Einstein is', year=1911, occur=3, pages=3, books=3), row(ngram='and Einstein is', year=1912, occur=1, pages=1, books=1), row(ngram='Einstein puts it', year=1911, occur=4, pages=4, books=4)]