# Generate a synthetic blog-comment dataset:
#   * people (names) drawn from U.S. Census surname / first-name frequency
#     tables, places from the 2013 national gazetteer,
#   * comment text sampled from Project Gutenberg's "Frankenstein",
# then write the result to comments_df.csv.
import nltk
import numpy as np
import pandas as pd

from csv import QUOTE_ALL
from datetime import datetime
from io import BytesIO
from random import choice, random, randrange, sample, randint
from zipfile import ZipFile

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

# --- Places: 2013 national gazetteer joined with full state names ------------
with ZipFile(BytesIO(urlopen('http://www2.census.gov/geo/gazetteer/2013_Gazetteer/2013_Gaz_place_national.zip').read())) as zip_file:
    gaz_place_national_2013_df = pd.read_csv(
        zip_file.open('2013_Gaz_place_national.txt'), sep='\t')
gaz_place_national_2013_df.head()

# STATE is read as str so leading-zero FIPS codes are preserved.
state_df = pd.read_csv(
    urlopen('http://www.census.gov/geo/reference/docs/state.txt'),
    sep='|', dtype={'STATE': 'str'})
state_df.head()

places_df = pd.merge(
    gaz_place_national_2013_df,
    state_df[['STATE_NAME', 'STUSAB']],
    left_on='USPS', right_on='STUSAB')[['USPS', 'NAME', 'STATE_NAME']]
places_df.head()

# --- Surnames: top 50 from the 2000 census, with cumulative distribution -----
with ZipFile(BytesIO(urlopen('https://www.census.gov/genealogy/www/data/2000surnames/names.zip').read())) as zip_file:
    app_c_df = pd.read_csv(zip_file.open('app_c.csv'))

# .copy() avoids chained-indexing (SettingWithCopy) issues when the
# 'prop'/'cfd' columns are added below.
app_c_df_50 = app_c_df[:50][['name', 'count']].copy()
# Vectorized renormalization replaces the per-element .astype(float) apply().
app_c_df_50['prop'] = (
    app_c_df_50['count'].astype(float) / app_c_df_50['count'].sum())
app_c_df_50['cfd'] = app_c_df_50['prop'].cumsum()


def _read_first_names(url):
    """Read a census dist.*.first fixed-width file from *url* and return the
    top 50 names with renormalized proportions ('prop') and a cumulative
    distribution ('cfd')."""
    df = pd.read_fwf(
        urlopen(url),
        # BUGFIX: the keyword is 'colspecs' -- the original 'col_specs'
        # is not a valid read_fwf parameter.
        colspecs=((0, 15), (15, 20), (21, 27), (28, 35)),
        header=None,
        names=('name', 'freq_in_percent',
               'cumulative_freq_in_percent', 'rank'))
    df_50 = df[:50][['name', 'freq_in_percent']].copy()
    df_50['prop'] = (df_50['freq_in_percent'].astype(float)
                     / df_50['freq_in_percent'].sum())
    df_50['cfd'] = df_50['prop'].cumsum()
    return df_50


dist_female_first_df_50 = _read_first_names(
    'http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first')
dist_female_first_df_50.head()

dist_male_first_df_50 = _read_first_names(
    'http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first')
dist_male_first_df_50.head()

# --- Synthesize 500 users ----------------------------------------------------
users = []
emails = set()
email_domains = ('@gmail.com', '@yahoo.com', '@hotmail.com', '@outlook.com',
                 '@mail.com', '@inbox.com', '@yandex.com')
for i in range(500):
    user = dict()

    # Name: invert the cumulative distributions with a uniform draw --
    # the first row whose cfd exceeds the draw is the sampled name.
    random_lastname = random()
    user['last_name'] = app_c_df_50[
        random_lastname < app_c_df_50.cfd].iloc[0]['name'].capitalize()
    random_gender = random()
    random_name = random()
    if random_gender < 0.5:
        user['first_name'] = dist_female_first_df_50[
            random_name < dist_female_first_df_50.cfd].iloc[0]['name'].capitalize()
    else:
        user['first_name'] = dist_male_first_df_50[
            random_name < dist_male_first_df_50.cfd].iloc[0]['name'].capitalize()

    # E-mail: first.last@domain, de-duplicated with a random hex suffix.
    email_domain = choice(email_domains)
    email = '{0}.{1}{2}'.format(
        user['first_name'].lower(), user['last_name'].lower(), email_domain)
    if email not in emails:
        user['email'] = email
    else:
        # BUGFIX: '{2:4x}' space-pads the hex value, yielding invalid
        # addresses like 'a.b_  1f@x'; '{2:04x}' zero-pads instead.
        user['email'] = '{0}.{1}_{2:04x}{3}'.format(
            user['first_name'].lower(), user['last_name'].lower(),
            randrange(16 ** 4), email_domain)
    emails.add(user['email'])

    # Place: uniform draw over gazetteer rows (.loc replaces removed .ix).
    place = places_df.loc[np.random.choice(places_df.index.values)]
    user['place'] = '{0}, {1}'.format(place['NAME'], place['STATE_NAME']).title()

    users.append(user)

users_df = pd.DataFrame(users)
users_df.head()

# --- Synthesize 1000 comments ------------------------------------------------
# Decode before tokenizing so the replace/tokenize work on text, not bytes.
frankenstein_sentences = nltk.sent_tokenize(
    urlopen('http://www.gutenberg.org/ebooks/84.txt.utf-8')
    .read().decode('utf-8').replace('\r\n', ' '))

# Timestamps are proleptic-Gregorian ordinal day numbers in [start, end).
start_datetime = datetime(year=1994, month=9, day=4).toordinal()
end_datetime = datetime(year=1995, month=1, day=4).toordinal()

comments = []
for i in range(1000):
    comment = dict()
    comment['timestamp'] = randrange(start_datetime, end_datetime)
    # 1-5 random (non-repeating) sentences joined into one comment body.
    comment['text'] = ' '.join(sample(frankenstein_sentences, randint(1, 5)))
    user = users_df.loc[np.random.choice(users_df.index.values)]
    comment['email'] = user['email']
    comment['first_name'] = user['first_name']
    comment['last_name'] = user['last_name']
    comment['place'] = user['place']
    comments.append(comment)

comments_df = pd.DataFrame(sorted(comments, key=lambda p: p['timestamp']))
comments_df.index.name = 'id'
comments_df.head()
comments_df.to_csv('comments_df.csv', quoting=QUOTE_ALL)