import string
import random
import pprint

def generate_string(min_size=3, max_size=10):
    """Return a random string made only of ASCII letters.

    The length is picked at random between ``min_size`` and ``max_size``
    (both inclusive, defaults 3 and 10); each character is then drawn
    independently from the upper- and lower-case ASCII letters.
    """
    alphabet = string.ascii_uppercase + string.ascii_lowercase
    # the length is drawn before any character, then one draw per character
    length = random.randint(min_size, max_size)
    letters = []
    for _ in range(length):
        letters.append(random.choice(alphabet))
    return ''.join(letters)

def generate_fullname():
    """Build a fake person name as a dict with "given" and "family" keys.

    "given" is a random string followed by a space and a single random
    upper-case letter; "family" is another random string.
    """
    first_name = generate_string()
    initial = random.choice(string.ascii_uppercase)
    surname = generate_string()
    return {"given": first_name + ' ' + initial, "family": surname}

def generate_title_or_journal(words=None):
    """Generate a fake title or journal name.

    The result is a sequence of random words joined by single spaces and
    terminated by one of '.', '!' or '?'.

    words -- number of random words to use; when None (the default) a
             count between 5 and 12 is chosen at random.
    """
    # An explicit ``words`` used to leave the word count unbound, so the call
    # crashed with UnboundLocalError; the caller's value is now honored.
    num_of_words = random.randint(5, 12) if words is None else words
    body = ' '.join(generate_string() for _ in range(num_of_words))
    return body + random.choice(['.', '!', '?'])
    
        

# Call each generator once as a quick check; the return values are not stored
# (presumably these were separate notebook cells that echoed their result).
generate_string()

generate_title_or_journal()

generate_fullname()

# pool of 500 fake authors that the generated citations draw from
author_pool = [generate_fullname() for _ in range(500)]

# pool of 100 fake journal names that the generated citations draw from
journal_pool = [generate_title_or_journal() for _ in range(100)]

def generate_citation():
    """Build one fake citation dict in a Citation Style Language-like shape.

    Keys of the returned dict:
      "title"           -- a freshly generated random title
      "authors"         -- 2 to 5 entries drawn from ``author_pool``; every
                           slot is an independent pick, so the same author
                           can appear more than once
      "container-title" -- one entry drawn from ``journal_pool``
      "date"            -- an integer year between 1955 and 2013
    """
    # authors are drawn first, before the title, journal and year
    author_count = random.randint(2, 5)
    authors = [random.choice(author_pool) for _ in range(author_count)]

    citation = {}
    citation["title"] = generate_title_or_journal()
    citation["authors"] = authors
    citation["container-title"] = random.choice(journal_pool)
    citation["date"] = random.randint(1955, 2013)
    return citation

def generate_csl():
    """Build one fake article document (a fake "Scholarly standard" CSL).

    Returns a dict with "citation" (the article's own citation),
    "references" (a list of 2 to 5 further citations) and an empty
    "meta-data" dict.
    """
    reference_count = random.randint(2, 5)
    # the references are generated before the article's own citation
    references = [generate_citation() for _ in range(reference_count)]
    primary = generate_citation()
    return {"citation": primary, "references": references, "meta-data": {}}

# pretty-print one generated document to eyeball its structure
pprint.pprint(generate_csl())

from pymongo import MongoClient
# connect to the MongoDB server on this machine (port 27018)
client = MongoClient(host='localhost', port=27018)
# database used for the benchmarking
db = client['benchmark_test']
# collection that stores the generated articles
articles = db['articles']

def addcsls(niter):
    """Insert ``niter`` freshly generated articles into ``articles``, one per call.

    NOTE(review): ``insert`` looks like PyMongo's legacy insert API; newer
    PyMongo releases expect ``insert_one``/``insert_many`` -- confirm against
    the installed version before upgrading.
    """
    # documents are produced lazily, so generation and insertion alternate
    for document in (generate_csl() for _ in range(niter)):
        articles.insert(document)

# placeholder binding; presumably the %%capture magic below rebinds this name
runtimes = ''

%%capture runtimes
# Time inserting batches of increasing size; the cell output is captured
# into `runtimes` instead of being printed.
# NOTE(review): %timeit presumably executes addcsls(niter) several times per
# measurement, so more than niter documents are inserted each round -- confirm.
for niter in [10, 100, 1000, 10000, 100000]:
    %timeit addcsls(niter)
    # clear the collection (remove() with no filter) before the next batch size
    articles.remove()

# show the captured %timeit output
runtimes.stdout

# load 450000 generated articles for the query benchmarks, then report the
# resulting collection size
for document in (generate_csl() for _ in range(450000)):
    articles.insert(document)
articles.count()

%%timeit
# time a query for one author: articles whose primary citation lists the
# family name of author_pool[23]
# NOTE(review): find() appears to only build a cursor; the query itself may
# not run until the cursor is iterated, so this likely times cursor
# construction rather than the lookup -- confirm.
one_author = articles.find({'citation.authors.family': author_pool[23]['family']})

%%timeit
# time a query for two authors
two_authors = articles.find({
                             'citation.authors.family': author_pool[23]['family'], 
                             'citation.authors.family': author_pool[51]['family']
                             })

%%timeit
# now, lets query two authors from the primary citation and then a third within the references
three_authors = articles.find({
                               'citation.authors.family': author_pool[23]['family'], 
                               'citation.authors.family': author_pool[51]['family'],
                               'references.authors.family': author_pool[3]['family']
                               })

# grab the ObjectID and title from a single stored article
# (find_one() is called without a filter, so this is not a random pick)
sample_article = articles.find_one()
uid = sample_article['_id']
title = sample_article['citation']['title']

# time a query based on ObjectID
# NOTE(review): find() appears to only build a cursor; the query may not run
# until the cursor is iterated -- confirm what is being timed.
%timeit articles.find({'_id': uid})

# time a query based on title
%timeit articles.find({'citation.title': title})

%%timeit
# query for all articles written in 2000 (exact match on citation.date)
# NOTE(review): find() appears to only build a cursor; the query may not run
# until the cursor is iterated -- confirm what is being timed.
article_query_one = articles.find({'citation.date': 2000 } )

%%timeit
# query for all articles written after 2000 ($gt is strict, so 2000 itself
# is excluded)
# NOTE(review): find() appears to only build a cursor; the query may not run
# until the cursor is iterated -- confirm what is being timed.
article_query_two = articles.find({'citation.date': { "$gt": 2000} })

%%timeit
# query for all articles written after 1960 and before 2011
# ($gte/$lte make the range 1960 through 2010, both ends included)
# NOTE(review): find() appears to only build a cursor; the query may not run
# until the cursor is iterated -- confirm what is being timed.
article_query_three = articles.find({'citation.date': { '$gte': 1960, '$lte': 2010 }})