Snapchat let 4.6M usernames and phone numbers get out. This notebook takes that data and explores which usernames are most prevalent in each area code. Usernames with "love" in them, for example, are more common in California and Boston. "lynn" is more common in the South. "5280" is very common in Denver, which is an interesting one to investigate (it's an elevation).
This notebook goes along with this algorithmshop.com post. Check out the post for animated visualizations of this data on a map of the US.
The leaked usernames and phone numbers came from here
The geocoding came from taking the area code names from that file and running them through this Geocode. You will have to geocode these places yourself, since there are licensing issues.
import pandas as pd
# Geocoded area-code locations (tab-separated); used further down to write
# the locations JSON.
locations = pd.read_csv('locations.tsv', sep='\t')

# The leak file has no header row, so column names are supplied here and
# rows are indexed by the area-code name.
user_columns = ['numbers', 'username', 'location']
all_users = pd.read_csv('schat.csv', header=None, names=user_columns,
                        index_col='location')

# The numbers column is not needed; keep only username, keyed by location.
del all_users['numbers']
all_users.head(10)
username | |
---|---|
location | |
Manhattan | slthornton |
Manhattan | strict_daddy4u |
Manhattan | whoknew69 |
Manhattan | testingtesting |
Manhattan | s.fullb13 |
Manhattan | gavan_smith |
Manhattan | thismyusername |
Manhattan | erinspickles |
Manhattan | flyinghorses |
Manhattan | saraelizabeth98 |
from collections import defaultdict
def get_subs(s, n):
    """Yield every contiguous substring of ``s`` that has length ``n``.

    Substrings are produced left to right and overlap, so ``"abcde"`` with
    ``n=4`` yields ``"abcd"`` and then ``"bcde"``.  Nothing is yielded when
    ``s`` is shorter than ``n``.

    A negative ``n`` yields nothing: slicing with a negative width would
    otherwise produce strings of the wrong length.
    """
    if n < 0:
        return
    for i in range(len(s) - n + 1):
        yield s[i:i + n]
def get_len_n_dict(n, lower_bound, names=None):
    """Count length-``n`` substrings across usernames and keep the common ones.

    Args:
        n: Substring length passed to ``get_subs``.
        lower_bound: Minimum total count a substring needs to be kept.
        names: Iterable of username strings.  Defaults to
            ``all_users.username``, looked up at call time.

    Returns:
        A plain dict mapping substring -> count for every substring seen at
        least ``lower_bound`` times.  A substring that occurs twice in one
        username is counted twice.
    """
    if names is None:
        names = all_users.username
    sub_counts = defaultdict(int)
    for name in names:
        for s in get_subs(name, n):
            sub_counts[s] += 1
    return {sub: count for sub, count in sub_counts.items()
            if count >= lower_bound}
# Count every substring of a fixed length and keep only those that occur
# at least a minimum number of times across all usernames.
SUBSTRING_LENGTH = 4
MIN_OCCURENCE = 60
substring_counts = get_len_n_dict(n=SUBSTRING_LENGTH,
                                  lower_bound=MIN_OCCURENCE)
# NOTE(review): '5820' may be a typo for '5280' (the Denver example) - confirm.
'5820' in substring_counts
False
# Peek at the ten most frequent substrings, highest count first.
from heapq import nlargest

most_common_substrings = nlargest(10, substring_counts.items(),
                                  key=lambda pair: pair[1])
most_common_substrings
[('love', 51034), ('mari', 48834), ('anna', 33225), ('elle', 31469), ('alex', 28967), ('chri', 28713), ('hris', 28330), ('arie', 26568), ('stin', 26260), ('chel', 26169)]
# Number of users per location, as a one-column frame named 'users'.
counts_per_area = (all_users.groupby(level=0)
                   .agg(len)
                   .rename(columns={'username': 'users'}))
counts_per_area.head(10)
users | |
---|---|
location | |
Arkansas | 28940 |
Boston | 41857 |
Boulder-Denver | 139265 |
Bronx, Queens, Brooklyn | 51086 |
Buffalo | 144939 |
Canadian territories in the Arctic far north | 31 |
Central Arizona | 35631 |
Central Florida | 3258 |
Central Georgia | 1396 |
Central Texas | 1542 |
# Per-group substring counts, limited to the substrings kept globally.
def counts_in_group(g, n, total_counts):
    """Count tracked length-``n`` substrings within one group of users.

    Args:
        g: Object whose ``username`` attribute iterates over username
            strings (here, one location's slice of the user frame).
        n: Substring length.
        total_counts: Container of substrings to track; any substring not
            in it is ignored.

    Returns:
        A single-row ``DataFrame`` with one column per tracked substring
        that occurred in the group, holding its count.
    """
    tally = defaultdict(int)
    tracked = (sub
               for username in g.username
               for sub in get_subs(username, n)
               if sub in total_counts)
    for sub in tracked:
        tally[sub] += 1
    return pd.DataFrame([tally])
# Run the per-group counter over every location.
def groupby_fn(x):
    """Count the globally common substrings within one location group."""
    return counts_in_group(x, SUBSTRING_LENGTH, substring_counts)


by_state = all_users.groupby(level=0).apply(groupby_fn)
# Each group returns a one-row frame, so apply() presumably leaves an inner
# index level from that row; drop it so location is the only index.
by_state = by_state.reset_index(level=1, drop=True)
by_state.head(3)[['.mar', 'love', 'zzzz']]
.mar | love | zzzz | |
---|---|---|---|
location | |||
Arkansas | 11 | 286 | 4 |
Boston | 18 | 670 | 17 |
Boulder-Denver | 63 | 1092 | 74 |
# Turn raw counts into per-location frequencies and drop locations with
# 2000 users or fewer.
# NOTE(review): by_state_normalized is not used again in the visible code.
by_state_normalized = by_state / counts_per_area
users_per_area = counts_per_area['users']
normalized = by_state.div(users_per_area, axis='index')
drop_small = normalized[users_per_area > 2000].T

# Regional substrings are found by looking for high variance in the share
# of users carrying a substring across locations: keep the substrings whose
# variance is above the chosen quantile.
VARIANCE_QUANTILE = .933
variances = drop_small.var(axis=1)
variance_cutoff = variances.quantile(q=VARIANCE_QUANTILE)
large_variance = drop_small[variances > variance_cutoff]
from random import randint
class StreamSampler(object):
    """Keep a random sample of up to ``num_samples`` items from a stream.

    Items are offered one at a time through ``present``.  The first
    ``num_samples`` items are always kept; each later item overwrites one
    saved slot with probability ``num_samples / (number of items offered
    so far, including this one)``.
    """

    def __init__(self, num_samples=1):
        self.num_samples = num_samples
        self.saved = []
        self.num_seen = 0

    def present(self, item):
        """Offer ``item`` to the sampler, possibly replacing a saved item."""
        if len(self.saved) < self.num_samples:
            self.saved.append(item)
        else:
            # Only ``randint`` is imported from ``random`` (not the module),
            # so call it directly.  It is inclusive at both ends, giving
            # num_seen + 1 equally likely values.
            v = randint(0, self.num_seen)
            if v < self.num_samples:
                self.saved[v] = item
        self.num_seen += 1

    def samples(self):
        """Return the list of currently saved items (the live list, not a copy)."""
        return self.saved
# For each high-variance substring, keep up to 2 example usernames per
# location.
desired_words = large_variance.index
# Plain-set copy for the membership test in the loop below.
desired_word_set = frozenset(desired_words)
example_users = defaultdict(lambda: defaultdict(lambda: StreamSampler(2)))
# Walk (location, username) pairs directly; iterrows() builds a Series for
# every row, which is needlessly slow over the full user table.
for area_code, name in zip(all_users.index, all_users['username']):
    # Use the same substring length the counts were built with.
    for s in get_subs(name, SUBSTRING_LENGTH):
        if s in desired_word_set:
            example_users[s][area_code].present(name)
def print_report(word):
    """Print the per-location frequencies of ``word``, highest first.

    ``word`` must be a row label of ``large_variance``; ``.loc`` raises
    ``KeyError`` otherwise.  Locations with a NaN frequency are listed last.
    """
    print('\nSUBSTRING {}'.format(word))
    # .ix and the in-place Series.sort() were removed from pandas; .loc and
    # sort_values() are the supported equivalents.
    c = large_variance.loc[word].sort_values(ascending=False)
    print(c)
# Print the report for a hand-picked set of substrings.
report_substrings = ('love', 'girl', 'baby', 'lynn', 'ngel', '1234', '5280')
for s in report_substrings:
    print_report(s)
SUBSTRING love location Southeastern California 0.022592 Downtown Los Angeles 0.022140 Eastern Los Angeles 0.019226 Eastern San Francisco 0.018228 Southern California 0.017534 Boston 0.016007 Oakland 0.015008 San Fernando Valley 0.014406 Denver-Boulder 0.014058 Central Florida 0.012584 Los Angeles 0.012254 Southeastern Virginia 0.011904 Florida 0.011734 New York City 0.011320 Southeastern Colorado 0.011114 Southeastern Michigan incl. Ann Arbor 0.010739 Central Arizona 0.010553 Miami 0.010417 Fort Lauderdale 0.010315 South Carolina 0.010262 Pennsylvania 0.010250 Chicago 0.010140 Arkansas 0.009883 Eastern part of Southern New Jersey 0.009638 Manhattan 0.009591 Western and Northern Colorado 0.009569 Idaho 0.009468 Buffalo 0.009452 Northwestern Arkansas 0.009041 San Francisco 0.008973 Bronx, Queens, Brooklyn 0.008965 Southern New York State 0.008729 Eastern Ohio 0.008557 Northern New York 0.008301 Mountain View 0.008281 Southwestern Wisconsin 0.008119 Indianapolis 0.008097 Chicago Suburbs 0.008048 Southeastern Ohio 0.007933 Boulder-Denver 0.007841 Southwest Connecticut 0.007752 Northeastern New York State 0.007708 Southern Illinois 0.007680 Northern Louisiana 0.007620 Seattle 0.007531 Champaign-Urbana 0.006832 Westchester County, NY 0.006559 Northern Chicago Suburbs 0.006492 Southern Michigan 0.006390 Maine 0.005827 Minnesota 0.004887 Manitoba 0.002774 Name: love, Length: 52, dtype: float64 SUBSTRING girl location South Carolina 0.008779 Northwestern Arkansas 0.008767 Arkansas 0.008086 Southeastern Virginia 0.007936 Eastern San Francisco 0.007661 Central Florida 0.007366 Florida 0.007192 Central Arizona 0.007157 Southeastern Colorado 0.007073 Western and Northern Colorado 0.007003 Idaho 0.006747 Maine 0.006617 Eastern Ohio 0.006540 Southern Illinois 0.006474 Southern California 0.006425 Southeastern Ohio 0.006381 Southwestern Wisconsin 0.006315 Denver-Boulder 0.006145 Champaign-Urbana 0.006081 Chicago Suburbs 0.005955 Pennsylvania 0.005857 Northeastern New York State 
0.005737 Northern Louisiana 0.005487 Eastern Los Angeles 0.005342 Northern New York 0.005317 Eastern part of Southern New Jersey 0.005178 Seattle 0.005177 Southeastern California 0.005153 Southeastern Michigan incl. Ann Arbor 0.005087 Boulder-Denver 0.005055 Fort Lauderdale 0.004992 Southern New York State 0.004946 Minnesota 0.004887 Buffalo 0.004754 Chicago 0.004384 Oakland 0.004374 Northern Chicago Suburbs 0.004206 Boston 0.004205 Westchester County, NY 0.004176 Indianapolis 0.004171 Los Angeles 0.004169 Southwest Connecticut 0.004156 New York City 0.004081 Downtown Los Angeles 0.003945 San Fernando Valley 0.003916 Bronx, Queens, Brooklyn 0.003856 Mountain View 0.003823 Manhattan 0.003548 San Francisco 0.003490 Miami 0.003302 Southern Michigan 0.002925 Manitoba 0.001664 Name: girl, Length: 52, dtype: float64 SUBSTRING baby location Eastern San Francisco 0.008718 Florida 0.007949 Oakland 0.007761 Central Florida 0.007060 Southern California 0.006170 Boston 0.006068 Southeastern Colorado 0.005586 Southeastern California 0.005549 Indianapolis 0.005521 Eastern Los Angeles 0.005355 Eastern Ohio 0.005195 Denver-Boulder 0.005194 Buffalo 0.005140 Northern New York 0.005032 Fort Lauderdale 0.005004 Southeastern Michigan incl. 
Ann Arbor 0.004946 South Carolina 0.004934 Downtown Los Angeles 0.004912 Chicago 0.004879 Northern Louisiana 0.004877 Central Arizona 0.004631 Pennsylvania 0.004588 Southeastern Virginia 0.004535 Northeastern New York State 0.004368 Idaho 0.004324 San Fernando Valley 0.004320 Eastern part of Southern New Jersey 0.004315 Chicago Suburbs 0.004256 Champaign-Urbana 0.004152 Arkansas 0.004077 Southern New York State 0.004073 Southern Illinois 0.004048 Western and Northern Colorado 0.004048 Southeastern Ohio 0.003880 Northwestern Arkansas 0.003836 Miami 0.003792 New York City 0.003758 San Francisco 0.003720 Mountain View 0.003622 Manhattan 0.003558 Bronx, Queens, Brooklyn 0.003445 Los Angeles 0.003426 Boulder-Denver 0.003325 Southwest Connecticut 0.003282 Southwestern Wisconsin 0.003157 Northern Chicago Suburbs 0.002914 Southern Michigan 0.002889 Minnesota 0.002793 Westchester County, NY 0.002727 Maine 0.002666 Seattle 0.002636 Manitoba 0.001664 Name: baby, Length: 52, dtype: float64 SUBSTRING lynn location Southern New York State 0.010474 Northern Louisiana 0.009144 Florida 0.009084 Maine 0.008888 Southeastern Ohio 0.008364 Southeastern Michigan incl. 
Ann Arbor 0.007348 Northern New York 0.007304 Southern Illinois 0.007083 Arkansas 0.007049 Northwestern Arkansas 0.006712 Buffalo 0.006327 Champaign-Urbana 0.006184 Northeastern New York State 0.006165 Eastern Ohio 0.005990 Idaho 0.005815 Minnesota 0.005585 Southeastern Colorado 0.005363 Eastern part of Southern New Jersey 0.005178 Chicago Suburbs 0.005103 Southwestern Wisconsin 0.004962 Western and Northern Colorado 0.004888 Manitoba 0.004576 Indianapolis 0.004417 Pennsylvania 0.004393 Central Arizona 0.004322 Southern California 0.004145 Southeastern Virginia 0.003921 Eastern San Francisco 0.003875 Southwest Connecticut 0.003761 Central Florida 0.003683 Denver-Boulder 0.003643 South Carolina 0.003633 Southeastern California 0.003567 Boulder-Denver 0.003547 Eastern Los Angeles 0.002988 Fort Lauderdale 0.002847 Northern Chicago Suburbs 0.002777 Seattle 0.002730 Westchester County, NY 0.002401 Bronx, Queens, Brooklyn 0.002192 Southern Michigan 0.001945 San Fernando Valley 0.001868 New York City 0.001758 Los Angeles 0.001753 Chicago 0.001669 Oakland 0.001647 Boston 0.001625 Mountain View 0.001525 Miami 0.001453 Manhattan 0.001431 Downtown Los Angeles 0.001376 San Francisco 0.001212 Name: lynn, Length: 52, dtype: float64 SUBSTRING ngel location Eastern San Francisco 0.005988 Southeastern California 0.005945 Downtown Los Angeles 0.004805 Central Florida 0.004604 Eastern Los Angeles 0.004355 Denver-Boulder 0.004281 Southern California 0.004215 Oakland 0.004007 Bronx, Queens, Brooklyn 0.003739 New York City 0.003702 Southwestern Wisconsin 0.003608 Southeastern Colorado 0.003517 San Fernando Valley 0.003483 Florida 0.003407 Fort Lauderdale 0.003342 Miami 0.003342 Central Arizona 0.003340 Boston 0.003321 Eastern part of Southern New Jersey 0.003308 Los Angeles 0.003206 Southern New York State 0.003200 Manhattan 0.003200 Mountain View 0.003029 Southeastern Virginia 0.003023 South Carolina 0.002906 San Francisco 0.002875 Minnesota 0.002793 Chicago 0.002772 Chicago Suburbs 
0.002760 Western and Northern Colorado 0.002756 Pennsylvania 0.002733 Buffalo 0.002732 Eastern Ohio 0.002659 Idaho 0.002647 Manitoba 0.002635 Southeastern Michigan incl. Ann Arbor 0.002543 Arkansas 0.002522 Westchester County, NY 0.002469 Northeastern New York State 0.002456 Northern New York 0.002292 Boulder-Denver 0.002269 Southwest Connecticut 0.002260 Northern Chicago Suburbs 0.002251 Southern Michigan 0.002139 Southern Illinois 0.002114 Indianapolis 0.002086 Southeastern Ohio 0.002070 Champaign-Urbana 0.001958 Northwestern Arkansas 0.001781 Northern Louisiana 0.001422 Seattle 0.001412 Maine 0.001284 Name: ngel, Length: 52, dtype: float64 SUBSTRING 1234 location South Carolina 0.003421 Northwestern Arkansas 0.003288 Southeastern Ohio 0.003190 Southeastern Michigan incl. Ann Arbor 0.002967 Southern New York State 0.002910 Western and Northern Colorado 0.002877 Eastern Ohio 0.002567 Northern Louisiana 0.002540 Arkansas 0.002522 Denver-Boulder 0.002480 Northern New York 0.002414 Minnesota 0.002374 Maine 0.002370 Northeastern New York State 0.002325 Northern Chicago Suburbs 0.002302 Boston 0.002246 Buffalo 0.002208 Southwest Connecticut 0.002177 Seattle 0.002165 Eastern part of Southern New Jersey 0.002158 Westchester County, NY 0.002143 Chicago Suburbs 0.002139 Boulder-Denver 0.002133 Southern Illinois 0.002114 Southern Michigan 0.002046 Southeastern Colorado 0.002001 Idaho 0.001976 Central Arizona 0.001965 Chicago 0.001938 Champaign-Urbana 0.001929 Mountain View 0.001874 Indianapolis 0.001840 New York City 0.001785 Pennsylvania 0.001757 Fort Lauderdale 0.001686 Eastern San Francisco 0.001673 Miami 0.001637 Southeastern California 0.001585 Central Florida 0.001535 Southern California 0.001515 Manhattan 0.001511 San Fernando Valley 0.001508 Bronx, Queens, Brooklyn 0.001507 Southeastern Virginia 0.001464 Los Angeles 0.001425 Eastern Los Angeles 0.001413 Manitoba 0.001387 San Francisco 0.001286 Downtown Los Angeles 0.001240 Oakland 0.001126 Florida 0.000757 
Southwestern Wisconsin 0.000451 Name: 1234, Length: 52, dtype: float64 SUBSTRING 5280 location Denver-Boulder 0.003272 Boulder-Denver 0.002599 Western and Northern Colorado 0.000173 Southeastern Colorado 0.000087 Southern Illinois 0.000035 Arkansas 0.000035 Southwest Connecticut 0.000033 Eastern Ohio 0.000031 Southern Michigan 0.000029 Champaign-Urbana 0.000022 Westchester County, NY 0.000017 Northern Chicago Suburbs 0.000015 Los Angeles 0.000014 Chicago 0.000014 Downtown Los Angeles 0.000012 San Francisco 0.000009 New York City 0.000009 Oakland 0.000008 Northeastern New York State 0.000007 Buffalo 0.000007 Northern New York 0.000007 Fort Lauderdale 0.000006 Southern California 0.000005 San Fernando Valley 0.000005 Boston NaN Bronx, Queens, Brooklyn NaN Central Arizona NaN Central Florida NaN Chicago Suburbs NaN Eastern Los Angeles NaN Eastern San Francisco NaN Eastern part of Southern New Jersey NaN Florida NaN Idaho NaN Indianapolis NaN Maine NaN Manhattan NaN Manitoba NaN Miami NaN Minnesota NaN Mountain View NaN Northern Louisiana NaN Northwestern Arkansas NaN Pennsylvania NaN Seattle NaN South Carolina NaN Southeastern California NaN Southeastern Michigan incl. Ann Arbor NaN Southeastern Ohio NaN Southeastern Virginia NaN Southern New York State NaN Southwestern Wisconsin NaN Name: 5280, Length: 52, dtype: float64
# This is just for generating json for use in algorithmshop.com's visualization:
# http://algorithmshop.com/20140102-snapchat-leak.html
import json
import math
from random import shuffle
PATH_PREFIX = '/post-files/20140202-snapchat'
# Some canadian things sneak in...
BLACKLIST = {'Manitoba'}
output_blobs = []
# One JSON file per high-variance substring, holding its frequency and
# example usernames for every location where the frequency is not NaN.
for (sub, r) in large_variance.iterrows():
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    location_data = [
        {'location': location,
         'frequency': frequency,
         'example_users': example_users[sub][location].samples()}
        for (location, frequency) in r.items()
        if not math.isnan(frequency) and location not in BLACKLIST
    ]
    blob = {'substring': sub, 'location_data': location_data}
    path = 'blobs/blob-{}.json'.format(sub)
    with open(path, 'wt') as f:
        json.dump(blob, f)
    # NOTE(review): str hashes are randomized per process on Python 3 unless
    # PYTHONHASHSEED is fixed, so these fragments can change between runs.
    output_blobs.append({'fragment': str(abs(hash(sub))),
                         'path': '{}/{}'.format(PATH_PREFIX, path)})
# Write the index of all blob files, in random order.
shuffle(output_blobs)
with open('blobs/all_blobs.json', 'wt') as index_file:
    json.dump(output_blobs, index_file)
# Export each location's name and coordinates; the last row of `locations`
# is left out.
with open('blobs/locations.json', 'wt') as f:
    all_locations = [
        {'location': row['name'],
         'lat': row['latitude'],
         'lon': row['longitude']}
        for (_, row) in locations[:-1].iterrows()
    ]
    json.dump(all_locations, f)