import string
import random
import pprint


def generate_string(min_size=3, max_size=10):
    """Return a random string of upper/lower case ASCII letters.

    The length is drawn uniformly from ``min_size`` to ``max_size``
    (both inclusive); the defaults give a string of 3 to 10 characters.
    """
    length = random.randint(min_size, max_size)
    alphabet = string.ascii_uppercase + string.ascii_lowercase
    return ''.join(random.choice(alphabet) for _ in range(length))


def generate_fullname():
    """Return a dict representing a name with a given and a family part.

    ``"given"`` is a random string followed by a space and a single
    upper-case letter (a middle initial); ``"family"`` is a random string.
    """
    return {
        "given": generate_string() + ' ' + random.choice(string.ascii_uppercase),
        "family": generate_string(),
    }


def generate_title_or_journal(words=None):
    """Return a fake title or journal name ending with punctuation.

    Args:
        words: number of random words to join with spaces. When ``None``
            (the default) a count between 5 and 12 inclusive is drawn at
            random.

    Returns:
        The space-separated random words followed by one of ``.``, ``!``
        or ``?``.
    """
    # An explicit ``words`` must also define the count; otherwise the name
    # below would be unbound for every non-default call.
    num_of_words = random.randint(5, 12) if words is None else words
    body = ' '.join(generate_string() for _ in range(num_of_words))
    return body + random.choice(['.', '!', '?'])


# quick smoke calls of each generator (results are discarded)
generate_string()
generate_title_or_journal()
generate_fullname()

# populate a list with 500 fake authors
author_pool = [generate_fullname() for _ in range(500)]

# populate a list with 100 fake journals
journal_pool = [generate_title_or_journal() for _ in range(100)]


def generate_citation():
    """Return a fake citation dict in a Citation Style Language-like shape.

    The result has a random ``"title"``, a list of 2 to 5 ``"authors"``
    drawn from ``author_pool`` (each drawn independently, so the same
    author can appear more than once), a ``"container-title"`` drawn from
    ``journal_pool`` and a ``"date"`` year between 1955 and 2013 inclusive.
    """
    # pick between 2 and 5 authors randomly from pool
    num_of_authors = random.randint(2, 5)
    authors = [random.choice(author_pool) for _ in range(num_of_authors)]

    # package and return the citation
    return {
        "title": generate_title_or_journal(),
        "authors": authors,
        "container-title": random.choice(journal_pool),
        "date": random.randint(1955, 2013),
    }


def generate_csl():
    """Return a fake Scholarly standard CSL document.

    The document holds one primary ``"citation"``, a list of 2 to 5
    ``"references"`` (each a citation dict from ``generate_citation``) and
    an empty ``"meta-data"`` dict.
    """
    # pick between 2 and 5 references
    num_of_references = random.randint(2, 5)
    references = [generate_citation() for _ in range(num_of_references)]

    # package and return CSL
    return {
        "citation": generate_citation(),
        "references": references,
        "meta-data": {},
    }


pprint.pprint(generate_csl())

from pymongo import MongoClient
# setup a
connection to the local db client = MongoClient('localhost', 27018) # create a new db for the benchmarking db = client.benchmark_test # create a new collection to store the articles articles = db.articles def addcsls(niter): """inserts n number of articles into the collection""" for x in range(niter): articles.insert(generate_csl()) runtimes = '' %%capture runtimes for niter in [10, 100, 1000, 10000, 100000]: %timeit addcsls(niter) articles.remove() runtimes.stdout for x in range(450000): articles.insert(generate_csl()) articles.count() %%timeit # time a query for one author one_author = articles.find({'citation.authors.family': author_pool[23]['family']}) %%timeit # time a query for two authors two_authors = articles.find({ 'citation.authors.family': author_pool[23]['family'], 'citation.authors.family': author_pool[51]['family'] }) %%timeit # now, lets query two authors from the primary citation and then a third within the references three_authors = articles.find({ 'citation.authors.family': author_pool[23]['family'], 'citation.authors.family': author_pool[51]['family'], 'references.authors.family': author_pool[3]['family'] }) # grab the ObjectID and title from a random article sample_article = articles.find_one() uid = sample_article['_id'] title = sample_article['citation']['title'] # time a query based on ObjectID %timeit articles.find({'_id': uid}) # time a query vased on title %timeit articles.find({'citation.title': title}) %%timeit # query for all articles writtein in 2000 article_query_one = articles.find({'citation.date': 2000 } ) %%timeit # query for all articles written after 2000 article_query_two = articles.find({'citation.date': { "$gt": 2000} }) %%timeit # query for all articles written after 1960 and before 2011 article_query_three = articles.find({'citation.date': { '$gte': 1960, '$lte': 2010 }})