In this notebook, I will demonstrate rudimentary benchmarking of a MongoDB database populated with mock citation data. Please feel free to make comments or contact me if you have any recommendations!
import string
import random
import pprint
def generate_string(min_size=3, max_size=10):
    """Return a random string of ASCII letters.

    The length is drawn uniformly from [min_size, max_size] (inclusive);
    defaults give a string of 3 to 10 characters.
    """
    length = random.randint(min_size, max_size)
    # string.ascii_letters is the prebuilt lowercase+uppercase alphabet;
    # avoids re-concatenating the two constants for every character drawn
    return ''.join(random.choice(string.ascii_letters) for _ in range(length))
def generate_fullname():
    """Return a dict with a fake 'given' and 'family' name.

    The given name carries a single trailing middle initial.
    """
    initial = random.choice(string.ascii_uppercase)
    given_name = generate_string() + ' ' + initial
    family_name = generate_string()
    return {"given": given_name, "family": family_name}
def generate_title_or_journal(words=None):
    """Generate a fake title or journal name ending with punctuation.

    words: number of words to use; when None (the default) a random count
    between 5 and 12 is chosen.  Previously a non-None value fell through
    the `if` and the function silently returned None — the parameter is
    now honored.
    """
    if words is None:
        words = random.randint(5, 12)
    body = ' '.join(generate_string() for _ in range(words))
    return body + random.choice(['.', '!', '?'])
generate_string()
'ExnxxO'
generate_title_or_journal()
'nXLTmxx ewknz mPTkwwVY QCCX wIFyVbo uUbfPnqtD kkVsVVXD iSPiyBHgn!'
generate_fullname()
{'family': 'ewsiTpt', 'given': 'TLefUicJ M'}
# build a pool of 500 fake authors to sample citations from
author_pool = [generate_fullname() for _ in range(500)]
# build a pool of 100 fake journal names to sample citations from
journal_pool = [generate_title_or_journal() for _ in range(100)]
def generate_citation():
    """Build one fake citation dict in Citation Style Language shape."""
    # draw 2-5 authors from the shared pool (sampling with replacement)
    author_count = random.randint(2, 5)
    chosen_authors = [random.choice(author_pool) for _ in range(author_count)]
    return {
        "title": generate_title_or_journal(),
        "authors": chosen_authors,
        "container-title": random.choice(journal_pool),
        "date": random.randint(1955, 2013),
    }
A scholarly-standard CSL record consists of a primary citation, a list of the citations it references, and metadata for tracking and metrics.
def generate_csl():
    """Build one fake scholarly-standard CSL document.

    The document pairs a primary citation with 2-5 referenced citations
    and an (empty) metadata mapping.
    """
    reference_count = random.randint(2, 5)
    reference_list = [generate_citation() for _ in range(reference_count)]
    return {
        "citation": generate_citation(),
        "references": reference_list,
        "meta-data": {},
    }
pprint.pprint(generate_csl())
{'citation': {'authors': [{'family': 'HJrDn', 'given': 'fsBHqfKEjn I'}, {'family': 'sMXUNbysMs', 'given': 'mAbCf T'}, {'family': 'pzt', 'given': 'aRcUPpVMc Y'}, {'family': 'lAyAMah', 'given': 'NemCpF U'}, {'family': 'sjaxpYD', 'given': 'FHnslrxmLS F'}], 'container-title': 'IyPZyDVaNg TGrFJnpIhn hbwf RmFOPbj jBxOgWpwcH ROZo IaBEgX.', 'date': 1996, 'title': 'IGDgHNhtgj yojhNrZ vLTR jfVmB IINh.'}, 'meta-data': {}, 'references': [{'authors': [{'family': 'zDNPRssZ', 'given': 'NEx Q'}, {'family': 'FtTW', 'given': 'xCSCCPtNdP V'}, {'family': 'jdqHFnfQK', 'given': 'BHhiw K'}, {'family': 'sVWmKmEN', 'given': 'ERz Z'}, {'family': 'TZSp', 'given': 'yyEI A'}], 'container-title': 'YllZ OoonIIDxey oqEqGDJKH dLTV EDnglty wdK Fdec raFPHRuKk rADrXsA vMcKM rrC DvEUXx?', 'date': 1958, 'title': 'YBa suMfU QmcjXMJ plNcC qscEJHas Msgr fDlq uPc tAYSP?'}, {'authors': [{'family': 'tIDS', 'given': 'sBqAJO C'}, {'family': 'XAAbAzk', 'given': 'nGR G'}, {'family': 'TWHFv', 'given': 'PmMHSeue Z'}, {'family': 'FAMll', 'given': 'YCzVtyLaua P'}, {'family': 'OGs', 'given': 'TcYjVf A'}], 'container-title': 'ltqVKNOm UVudMWX kYtAVZpgGv WHsz okAw mmjxcvU adTIcW wjmn NfukbsKla!', 'date': 1967, 'title': 'XNAzF yynbt gOLMpEYds UIvk IkPSXO rUJjAYxz YPQsOmGHz QHf XvmOBpR sEx QPbVHTuCq ulXscfk?'}, {'authors': [{'family': 'TzndvzQ', 'given': 'ZkKdj E'}, {'family': 'iSHabLvm', 'given': 'LmnGjximf L'}, {'family': 'ENpoY', 'given': 'XJRv J'}, {'family': 'sqgy', 'given': 'CQr M'}], 'container-title': 'OASbcEcS FgaVRHgWzf oDujV PwHe ulVyHYGgEV zccikpstAN YVkm DVde GSLinN ojxcRoQ rGSp IfbpFs.', 'date': 1971, 'title': 'AipYA FNBw qIumehV VVU nKvp MnpSmoYZvm QYqlaGeD eNLpPCjA!'}, {'authors': [{'family': 'kdjLNKODAb', 'given': 'qOhjJGZv S'}, {'family': 'dmqVg', 'given': 'vuLYMMMR L'}], 'container-title': 'aXN RHPQUVuscP JrOAXxTRW BptUDPXR XkdZYYdGc LoAZmL SNw kMXYoBDKA IPWBQRcUO AKK nNRe?', 'date': 2002, 'title': 'nuy VYI eRBQYkzOBI qAMVCsu NjHW AVXbF eci VDP xHFQRLT VZb?'}, {'authors': [{'family': 'mPd', 'given': 
'GPgOto J'}, {'family': 'oYcLg', 'given': 'mfcA F'}, {'family': 'xzaXF', 'given': 'BdmDFN F'}], 'container-title': 'vNy etDnU tiN owgjx dCiKH NAvMscrIl NnUmxq daO.', 'date': 1988, 'title': 'XVOqdhCEE puwVr grfbfxmmk yRxpzrwIP rsOntjGVc pMXMDvwPj!'}]}
from pymongo import MongoClient
# connect to the local MongoDB server
# NOTE(review): 27018 is not MongoDB's default port (27017) — confirm the
# server is actually configured to listen here
client = MongoClient('localhost', 27018)
# database used only for this benchmark run
db = client.benchmark_test
# collection that will hold the generated CSL article documents
articles = db.articles
def addcsls(niter):
    """Generate and insert `niter` CSL documents into the articles collection."""
    for _ in range(niter):
        # NOTE(review): Collection.insert is the legacy pymongo API
        # (replaced by insert_one in pymongo 3+) — fine for the pymongo
        # version this notebook targets
        articles.insert(generate_csl())
runtimes = ''
%%capture runtimes
for niter in [10, 100, 1000, 10000, 100000]:
%timeit addcsls(niter)
articles.remove()
runtimes.stdout
u'100 loops, best of 3: 8.38 ms per loop\n10 loops, best of 3: 118 ms per loop\n1 loops, best of 3: 1.15 s per loop\n1 loops, best of 3: 11.9 s per loop\n1 loops, best of 3: 96.6 s per loop\n'
# bulk-load the collection so the later queries run against ~450k documents
for _ in range(450000):
    articles.insert(generate_csl())
articles.count()
450000
Notes:
E.g. citation.authors.family will look for:
{"citation": {"authors": { "family" : "...here..." } } }
cell magics e.g.: %%timeit must be at the top of the cell or they will not function. Yes, even above comments.
%%time vs %%timeit -- I chose to use %%timeit because it reports the average time of some number of iterations. Although you can manually specify how many iterations via the -n parameter, it's not always necessary. The IPython developers were smart enough to make this scale automatically. Inexpensive actions such as the query below may loop 100,000 times, but expensive actions, e.g. creating 100,000 articles and inserting them into the DB, may only run 3 times.
%%timeit
# time a query for one author
one_author = articles.find({'citation.authors.family': author_pool[23]['family']})
100000 loops, best of 3: 7.68 us per loop
%%timeit
# time a query for two authors
two_authors = articles.find({
'citation.authors.family': author_pool[23]['family'],
'citation.authors.family': author_pool[51]['family']
})
100000 loops, best of 3: 7.86 us per loop
%%timeit
# now, lets query two authors from the primary citation and then a third within the references
three_authors = articles.find({
'citation.authors.family': author_pool[23]['family'],
'citation.authors.family': author_pool[51]['family'],
'references.authors.family': author_pool[3]['family']
})
100000 loops, best of 3: 7.86 us per loop
# grab the ObjectID and title from a random article
sample_article = articles.find_one()
uid = sample_article['_id']
title = sample_article['citation']['title']
# time a query based on ObjectID
%timeit articles.find({'_id': uid})
100000 loops, best of 3: 7.88 us per loop
# time a query based on title
%timeit articles.find({'citation.title': title})
100000 loops, best of 3: 7.79 us per loop
%%timeit
# query for all articles written in 2000
article_query_one = articles.find({'citation.date': 2000 } )
100000 loops, best of 3: 7.61 us per loop
%%timeit
# query for all articles written after 2000
article_query_two = articles.find({'citation.date': { "$gt": 2000} })
100000 loops, best of 3: 7.93 us per loop
%%timeit
# query for all articles written after 1960 and before 2011
article_query_three = articles.find({'citation.date': { '$gte': 1960, '$lte': 2010 }})
100000 loops, best of 3: 7.77 us per loop
We were initially concerned with response times for database queries. But, as the results show, even with roughly half a million articles, querying deep within individual citations was rather quick. Not a single query had an average time of over 10 microseconds. Not too bad.