import glob
import os.path
import zipfile

import blz
import numpy as np

MAXLINES = 100*1000*1000   # a protection against loading too many lines
rawdir = 'eng-1M-3gram'    # the directory where all the .zip files are

def create_blz(blzname):
    # Read just the -0 and -1 files
    zfiles = glob.glob(os.path.join(rawdir, '*-[0-1].csv.zip'))
    # Create the container
    dt = np.dtype([('ngram', 'S64'), ('year', 'i2'), ('occur', 'i2'),
                   ('pages', 'i2'), ('books', 'i2')])
    bt = blz.btable(np.empty(0, dtype=dt), mode='w',
                    expectedlen=MAXLINES, rootdir=blzname)
    lines = 0
    errs = 0
    for zfname in zfiles:
        print "zfname:", zfname
        with zipfile.ZipFile(zfname, 'r') as myzip:
            # Strip the directory prefix and the .zip extension to get
            # the name of the CSV file inside the archive
            fname = zfname[len(rawdir)+1:-4]
            csvfile = myzip.open(fname, 'rU')
            try:
                for row in csvfile:
                    row = row[:-1].split('\t')
                    if len(row) == len(dt):
                        bt.append(row)
                    lines += 1
                    if lines >= MAXLINES:
                        break
            except:
                # Ignore the errors, but keep track of them
                errs += 1
                print "Offending line:", lines
                bt.flush()
    bt.flush()
    print "bt:", repr(bt)
    # Some statistics
    print "lines: %d, errs: %d" % (lines, errs)
    return bt

# Create the BLZ database if necessary
blzname = rawdir + '.blz'
if not os.path.exists(blzname):
    bt = create_blz(blzname)

# Open the dataset
bt = blz.open(blzname)
print "bt:", bt

# Do some queries
print "results:", list(bt.where("contains(ngram, 'Eldest')"))

# Do a timing
%time len(list(bt.where("contains(ngram, 'Eldest')")))

Okay, doing a query on a dataset with more than 42 Mrows in less than 4 seconds is impressive. How long would it take using the pure Python 'in' operator?

%time len([i for i in bt if 'Eldest' in i[0]])

As can be seen, the new 'contains' function can be more than 10 times faster than a regular 'in' operator. Of course, more complex queries can be done too. For example, let's see how many mentions of Einstein before 1920 there are in our subset of the Google Books dataset:

%time list(bt.where("(contains(ngram, 'Einstein')) & (year < 1920)"))

Ok, a few mentions of Einstein in Google Books, including 5 books as early as 1905. Presumably some of these refer to his seminal articles from 1905. And taking about 4 seconds (at ~10 Mrow/s) for this new, more complex query is pretty good too.
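To get a feel for how these mentions are distributed over time, the results of the same query can be aggregated by year with a bit of plain Python. The following is only a minimal sketch: it relies on nothing beyond the bt.where() call shown above, and it assumes the rows it yields can be indexed positionally in the order of the dtype defined earlier (ngram, year, occur, pages, books).

from collections import Counter

# Tally the Einstein mentions per year (sketch; assumes positional
# access to the fields of each yielded row, in dtype order)
hits_per_year = Counter()
for row in bt.where("(contains(ngram, 'Einstein')) & (year < 1920)"):
    hits_per_year[row[1]] += 1   # row[1] is the 'year' column

for year in sorted(hits_per_year):
    print year, hits_per_year[year]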
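As a side note, the %time lines above are IPython magics and only work in an interactive IPython session. From a plain Python script, the same kind of measurement can be done with the standard time module, for instance:

import time

# Time the 'contains' query without relying on IPython magics
t0 = time.time()
nresults = len(list(bt.where("contains(ngram, 'Eldest')")))
print "%d results in %.3f s" % (nresults, time.time() - t0)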