import glob
import os.path
import zipfile

import blz
import numpy as np

MAXLINES = 100*1000*1000   # a protection against loading too many lines
rawdir = 'eng-1M-3gram'    # the directory where all the .zip files are

def create_blz(blzname):
    # Read just the -0 and -1 files
    zfiles = glob.glob(os.path.join(rawdir, '*-[0-1].csv.zip'))
    # Create the container
    dt = np.dtype([('ngram', 'S64'), ('year', 'i2'), ('occur', 'i2'),
                   ('pages', 'i2'), ('books', 'i2')])
    bt = blz.btable(np.empty(0, dtype=dt), mode='w',
                    expectedlen=MAXLINES, rootdir=blzname)
    lines = 0
    errs = 0
    for zfname in zfiles:
        print "zfname:", zfname
        with zipfile.ZipFile(zfname, 'r') as myzip:
            # Strip the directory prefix and the .zip extension to get
            # the name of the CSV file inside the archive
            fname = zfname[len(rawdir)+1:-4]
            csvfile = myzip.open(fname, 'rU')
            try:
                for row in csvfile:
                    row = row[:-1].split('\t')
                    if len(row) == len(dt):
                        bt.append(row)
                    lines += 1
                    if lines >= MAXLINES:
                        break
            except:
                # Ignore the errors, but keep track of them
                errs += 1
                print "Offending line:", lines
                bt.flush()
    bt.flush()
    print "bt:", repr(bt)
    # Some statistics
    print "lines: %d, errs: %d" % (lines, errs)
    return bt

# Create the BLZ database if necessary
blzname = rawdir + '.blz'
if not os.path.exists(blzname):
    bt = create_blz(blzname)

# Open the dataset
bt = blz.open(blzname)
print "bt:", bt

# Do some queries
print "results:", list(bt.where("contains(ngram, 'Eldest')"))

# Do a timing
%time len(list(bt.where("contains(ngram, 'Eldest')")))

Okay, doing a query on a dataset with more than 42 Mrows in less than 4 seconds is impressive. How long would it take using the pure Python 'in' operator?

%time len([i for i in bt if 'Eldest' in i[0]])

As can be seen, the new 'contains' function can be more than 10 times faster than a regular 'in' operator. Of course, more complex queries can be done too. For example, let's see how many mentions of Einstein before 1920 there are in our subset of the Google Books dataset:

%time list(bt.where("(contains(ngram, 'Einstein')) & (year < 1920)"))

Ok, a few mentions of Einstein in Google Books, including 5 books as early as 1905. Presumably some of these refer to his seminal articles from 1905. And taking about 4 seconds (at ~10 Mrow/s) for this new, more complex query is pretty good too.
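To get a feel for how these mentions are distributed over time, the results of the same query can be aggregated by year with a bit of plain Python. The following is only a minimal sketch: it relies on nothing beyond the bt.where() call shown above, and it assumes the rows it yields can be indexed positionally in the order of the dtype defined earlier (ngram, year, occur, pages, books).

from collections import Counter

# Tally the Einstein mentions per year (sketch; assumes positional
# access to the fields of each yielded row, in dtype order)
hits_per_year = Counter()
for row in bt.where("(contains(ngram, 'Einstein')) & (year < 1920)"):
    hits_per_year[row[1]] += 1   # row[1] is the 'year' column

for year in sorted(hits_per_year):
    print year, hits_per_year[year]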
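As a side note, the %time lines above are IPython magics and only work in an interactive IPython session. From a plain Python script, the same kind of measurement can be done with the standard time module, for instance:

import time

# Time the 'contains' query without relying on IPython magics
t0 = time.time()
nresults = len(list(bt.where("contains(ngram, 'Eldest')")))
print "%d results in %.3f s" % (nresults, time.time() - t0)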