import json

import numpy as np
import pandas as pd

DATA_PATH = 'cleaned_data.json'

# Load every recorded answer. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(DATA_PATH) as data_file:
    all_data = json.load(data_file)

answers = pd.DataFrame(all_data)

# Time between the question being served and the answer being submitted.
answers['duration'] = answers.submit_time - answers.srv_time
# `np.int` was only an alias of the builtin `int` and no longer exists in
# NumPy >= 1.24, so use the builtin directly.
answers['user_id'] = answers.user_id.astype(int)

# Clamp response time to 2 minutes.
# `.loc` writes into `answers` itself; the chained form
# `answers.duration[mask] = 120` may end up modifying a temporary copy.
answers.loc[answers.duration > 120, 'duration'] = 120

# Keep only users who submitted more than 10 answers.
num_answered = answers.user_id.value_counts()
completions = num_answered[num_answered > 10].index
# `.copy()` makes the filtered frame independent so later in-place edits are
# unambiguous.
answers = answers[answers.user_id.isin(completions)].copy()
# Tally the answers per recruitment source (the 'type' column); a later cell
# uses these counts to drop the sources that got very few responses.
source_counts = answers.type.value_counts()
# Only about three people completed the survey via Twitter :)
source_counts
Mechanical Turk 5 Cents 4984 Turk, asking for Fast 2597 Facebook 198 Mechanical Turk 2 Cents 143 Twitter 33 Mechanical Turk High Skill Request 11 AdWords, asking for 2 cents. 11 Tiffany 11 dtype: int64
# Peek at the first three rows of the cleaned frame.
answers.head(3)
answer | city | country_name | ip_hash | latitude | longitude | question_id | region_name | srv_time | submit_time | survey_order_id | type | user_id | version | duration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | Obama speaking at his inauguration. | Visalia | United States | 3412077616522023084 | 36.2958 | -119.3812 | 3 | California | 1.389912e+09 | 1.389912e+09 | 21ejkavsdh | Mechanical Turk High Skill Request | 42403892847 | 0.1 | 38.466112 |
10 | I think they both should exercise equally. | Visalia | United States | 3412077616522023084 | 36.2958 | -119.3812 | 4 | California | 1.389912e+09 | 1.389912e+09 | 21ejkavsdh | Mechanical Turk High Skill Request | 42403892847 | 0.1 | 25.887507 |
11 | USA | Visalia | United States | 3412077616522023084 | 36.2958 | -119.3812 | 5 | California | 1.389912e+09 | 1.389912e+09 | 21ejkavsdh | Mechanical Turk High Skill Request | 42403892847 | 0.1 | 4.623164 |
# Keep only the recruitment sources that produced more than 100 answers.
answers = answers[answers.type.isin(source_counts[source_counts > 100].index)].copy()
import numpy as np
# Response-time summary per (source, question).
# A list of (column name, function) pairs replaces the old dict-renaming form
# `agg({'mean': ..., ...})`, which current pandas rejects for a single column,
# and it also fixes the column order. 'size' counts rows exactly like `len`
# did, and 'std' is the sample standard deviation pandas used for `np.std`.
grouped = answers.groupby(['type', 'question_id'])['duration'].agg(
    [('mean', 'mean'), ('count', 'size'), ('std', 'std')])
grouped
count | std | mean | ||
---|---|---|---|---|
type | question_id | |||
1 | 18 | 6.528073 | 10.140472 | |
2 | 18 | 22.227931 | 23.996327 | |
3 | 18 | 30.022063 | 37.650166 | |
4 | 18 | 28.509485 | 34.013478 | |
5 | 18 | 3.045837 | 6.149543 | |
6 | 18 | 18.436770 | 42.643649 | |
7 | 18 | 25.016560 | 40.394385 | |
8 | 18 | 10.808506 | 18.092980 | |
9 | 18 | 17.671644 | 26.297783 | |
10 | 18 | 17.864162 | 18.963150 | |
11 | 18 | 22.103053 | 35.284498 | |
Mechanical Turk 2 Cents | 1 | 13 | 22.553344 | 15.914699 |
2 | 13 | 9.001608 | 22.677917 | |
3 | 13 | 31.036975 | 41.914349 | |
4 | 13 | 30.803747 | 28.500610 | |
5 | 13 | 6.284513 | 8.555612 | |
6 | 13 | 13.852425 | 34.589414 | |
7 | 13 | 16.696772 | 28.217106 | |
8 | 13 | 10.241031 | 16.047161 | |
9 | 13 | 11.543667 | 21.571579 | |
10 | 13 | 19.762374 | 20.000313 | |
11 | 13 | 28.408319 | 27.910834 | |
Mechanical Turk 5 Cents | 1 | 453 | 12.450826 | 14.048011 |
2 | 454 | 18.235401 | 23.120424 | |
3 | 453 | 24.965651 | 36.441400 | |
4 | 453 | 27.328623 | 34.381162 | |
5 | 453 | 7.824829 | 8.795530 | |
6 | 453 | 21.744677 | 41.797269 | |
7 | 453 | 21.407686 | 35.237853 | |
8 | 453 | 17.072572 | 19.110877 | |
9 | 453 | 26.264251 | 32.945017 | |
10 | 453 | 23.375196 | 25.831645 | |
11 | 453 | 18.884085 | 29.102843 | |
Turk, asking for Fast | 1 | 236 | 11.803632 | 10.964369 |
2 | 237 | 13.104876 | 15.321082 | |
3 | 236 | 21.295649 | 26.113626 | |
4 | 236 | 19.836304 | 21.603866 | |
5 | 236 | 9.395384 | 7.354463 | |
6 | 236 | 19.971123 | 33.152513 | |
7 | 236 | 18.643996 | 27.216647 | |
8 | 236 | 7.632557 | 12.295015 | |
9 | 236 | 18.172701 | 20.654331 | |
10 | 236 | 16.994212 | 15.822472 | |
11 | 236 | 12.158383 | 22.395870 |
# Load the survey definition; the context manager closes the file handle.
with open('../app/survey.json') as survey_file:
    questions = json.load(survey_file)
# Question id -> full question text, used for plot titles.
text_by_id = {q['id']: q['question'] for q in questions['questions']}

# Answers that are exactly equal to a question's 'prompt' text are relabelled
# 'DEFAULT' (presumably the respondent left the pre-filled text untouched --
# confirm against the survey app).
for q in questions['questions']:
    if 'prompt' not in q:
        continue
    is_prompt = (answers.question_id == q['id']) & (answers['answer'] == q['prompt'])
    # `.loc` guarantees the write lands on `answers`; the chained form
    # `answers.answer[mask] = ...` may only modify a temporary copy.
    answers.loc[is_prompt, 'answer'] = 'DEFAULT'

print(answers['type'].unique())
answers['type'].value_counts()
['Turk, asking for Fast' 'Mechanical Turk 2 Cents' 'Facebook' 'Mechanical Turk 5 Cents']
Mechanical Turk 5 Cents 4984 Turk, asking for Fast 2597 Facebook 198 Mechanical Turk 2 Cents 143 dtype: int64
I'm just going to work on finding the timing differences between the 5 cent turk answers and the "Do it fast" turk answers.
%pylab inline
import matplotlib.pyplot as plt
def plot_single_question(question_id, survey_answers, answer_types,
                         normalize=False, ax=None):
    """Draw side-by-side response-time histograms for one question.

    Args:
        question_id: Id of the question to plot; also looked up in the
            module-level ``text_by_id`` for the title.
        survey_answers: DataFrame with ``type``, ``question_id`` and
            ``duration`` columns.
        answer_types: Iterable of ``type`` values; one histogram series is
            drawn per entry.
        normalize: When true, each series is weighted by ``1 / len(series)``
            so bar heights are fractions of that series rather than counts.
        ax: Matplotlib axes to draw on. Defaults to the current axes.
    """
    # The original drew the histogram and title on the implicit current axes
    # but attached the legend to `ax`, and crashed when `ax` was left as None.
    # Everything now goes through one explicit axes object.
    if ax is None:
        ax = plt.gca()
    durations = [survey_answers[(survey_answers.type == t) &
                                (survey_answers.question_id == question_id)].duration
                 for t in answer_types]
    if normalize:
        weights = [np.ones_like(d) / len(d) for d in durations]
    else:
        weights = None
    ax.hist(durations, label=list(answer_types), weights=weights)
    # Anchor the legend outside the plotting area, to the right.
    ax.legend(bbox_to_anchor=(1.7, .95))
    # Titles are cut to 80 characters so long question texts fit.
    ax.set_title(text_by_id[question_id][:80] + ' (question {})'.format(question_id))
def plot_all_questions(survey_answers, answer_types, question_ids, normalize=False):
    """Stack one histogram panel per question in a single tall figure.

    Each panel is drawn by ``plot_single_question`` with the same
    ``survey_answers``, ``answer_types`` and ``normalize`` arguments.
    """
    panel_inches = 5
    panel_count = len(question_ids)
    plt.figure(figsize=(panel_inches, panel_count * panel_inches))
    # Subplot positions are 1-based, hence start=1.
    for position, question_id in enumerate(question_ids, start=1):
        panel = plt.subplot(panel_count, 1, position)
        plot_single_question(question_id, survey_answers, answer_types,
                             normalize=normalize, ax=panel)
Populating the interactive namespace from numpy and matplotlib
/home/justinvf/anaconda/envs/blogging/lib/python3.3/site-packages/matplotlib/mathtext.py:46: UserWarning: Due to a bug in pyparsing <= 2.0.0 on Python 3.x, packrat parsing has been disabled. Mathtext rendering will be much slower as a result. Install pyparsing 2.0.0 or later to improve performance. warn("Due to a bug in pyparsing <= 2.0.0 on Python 3.x, packrat parsing "
# Compare the "asking for Fast" and "5 Cents" groups on questions 1-11,
# looking only at answers that took less than 50 seconds.
plot_all_questions(
    answers[answers.duration < 50],
    ['Turk, asking for Fast', 'Mechanical Turk 5 Cents'],
    list(range(1, 12)),
    normalize=True)
from scipy.stats import gaussian_kde

# The "asking for Fast" group stands in for fraud-like behaviour and the
# "5 Cents" group for legitimate behaviour.
fraudy, legit = 'Turk, asking for Fast', 'Mechanical Turk 5 Cents'
fraudy_timings, legit_timings = {}, {}
# One Gaussian KDE of answer durations per group for each of questions 1..11.
for q_id in range(1, 12):
    on_question = answers.question_id == q_id
    fraudy_timings[q_id] = gaussian_kde(
        answers.duration[on_question & (answers.type == fraudy)])
    legit_timings[q_id] = gaussian_kde(
        answers.duration[on_question & (answers.type == legit)])
SIZE = 5
NUM_QUESTIONS = len(legit_timings)
plt.figure(figsize=(SIZE, SIZE * NUM_QUESTIONS))
# Evaluation grid shared by every panel: 0 to 120 seconds in 0.1 s steps.
x = np.arange(0, 120, .1)
# One panel per question, overlaying the two fitted duration densities.
for i in sorted(fraudy_timings):
    fraud_kde, legit_kde = fraudy_timings[i], legit_timings[i]
    plt.subplot(NUM_QUESTIONS, 1, i)
    plt.title(text_by_id[i][:80] + ' (question {})'.format(i))
    # Only the shape of the densities matters, so hide the y ticks.
    plt.yticks([])
    plt.xlabel('Seconds to answer')
    plt.plot(x, fraud_kde.evaluate(x), 'r', label='more fraudulent')
    plt.plot(x, legit_kde.evaluate(x), 'g', label='more good')
    plt.legend()
Here is the start of digging into some of the actual answer data, just for fun.
import re
# Case-insensitive pattern with one capture group listing the political names
# and terms to look for anywhere in an answer. It includes misspelled variants
# such as 'regan' and 'rajive gandhi'.
political_regex = re.compile('.*(obama|jfk|kennedy|ronald|reagan|regan|clinton|bill cl|'
'george washington|george w|dukakis|saddam|'
'bush|carter|nixon|modi|gorbachev|lincoln|trudeau|'
'brezhnev|perot|'
'mahatma gandhi|nehru|gingrich|martin luther king|mlk|'
'rajiv gandhi|ford|rajive gandhi|eisenhower|'
'rahul gandhi|indira gandhi|nelson mandela|white house|'
'gandhi|thatcher).*', re.IGNORECASE)
# Run the pattern over the answers to question 3 only; rows that belong to
# other questions get no value when the result is aligned back onto `answers`.
# NOTE(review): the cells below treat each result as a tuple of captured groups
# (or an empty list when nothing matched). That looks like the behaviour of an
# older pandas `str.match` -- confirm against the pandas version in use.
answers['figure'] = answers[answers.question_id == 3]['answer'].str.match(political_regex)
def get_first(l):
    """Return the first element of a non-empty tuple, lower-cased.

    Any other input (an empty tuple, a list, NaN, None, ...) yields None, so
    the function can be applied to a column that mixes match tuples with
    placeholders for "no match" and "not applicable".
    """
    # isinstance is the idiomatic type check and also accepts tuple subclasses.
    if isinstance(l, tuple) and l:
        return l[0].lower()
    # Say explicitly that everything else maps to None.
    return None
# Store the cleaned name as a real column. Assigning to a new attribute
# (`answers.figure_clean = ...`) only sets a Python attribute on the DataFrame
# object, so the values are not a column and would be lost by any later
# filter or copy. `answers.figure_clean` keeps working for reads.
answers['figure_clean'] = answers.figure.apply(get_first)
answers.figure_clean.value_counts()
clinton 102 bush 80 obama 59 reagan 47 gandhi 41 nixon 20 modi 20 kennedy 18 white house 15 regan 12 carter 10 nehru 7 eisenhower 6 jfk 6 nelson mandela 5 ronald 4 thatcher 4 gorbachev 3 george washington 3 trudeau 3 martin luther king 3 perot 3 ford 2 dukakis 2 lincoln 2 george w 1 saddam 1 bill cl 1 gingrich 1 brezhnev 1 dtype: int64
# 'modi' matched someone famous in India whom I had not heard of;
# list the raw answers behind that match.
answers.loc[answers.figure_clean == 'modi', 'answer']
445 narendra modi 753 modi 839 narendra modi speaking at a conferrence 946 Narendra modi speech in stage 1093 narendra modi 1352 Narendran modi 1358 NARENRA MODI 1967 modi spoken at mumbai grounds 2420 modi 2560 narendra modi\r\n 2586 Narendra Modi speech at Bihar 3192 narendra modi 3663 Narendra Modi 4619 MR.NARENDRA MODI 5286 While in past I heard the speach of Modi 5586 narendra modi is speaking at gujarat 6009 Narendra modi announced as a Prime minister ca... 8383 Modi spoken about Tea Shop worker 8527 Narendra modi 8548 Modi speaking in dias Name: answer, dtype: object
# Find the folks not captured by that regex:
def empty_tuple(x):
    """Return True only when *x* is an empty list.

    Despite the name, the check is for a list rather than a tuple: an empty
    list is how a "no match" result appears in the ``figure`` column, while
    successful matches are tuples. The name is kept so existing calls still
    work.
    """
    # isinstance is the idiomatic replacement for `type(x) == type([])`.
    return isinstance(x, list) and not x
# Answers that were actually typed (not 'DEFAULT') and whose `figure` entry is
# an empty list, i.e. the regex found no known name in them.
list(answers.answer[(answers.answer != 'DEFAULT') & answers.figure.apply(empty_tuple)])
['yes', 'ARVIND KEJRIWAL', 'The president Salinas giving an speech on tv', 'Quite moderate.', 'Mr.karunanudhi addressing a speech in tamil nadu', 'suresh', "'95", 'speak about economy', 'i hate politics', 'congressman speaking at school', 'cant recall', '"Read my lips. No new taxes."', 'moderatly coserevative', 'idk', 'Rajai speaking for DMK', '5', 'Voting', 'BEHAVIOUR', 'vijaykanth', 'Delhi election', 'full of problems and pressure', 'liberally', '', 'i saw one of them on tv.', 'DR.MANMOHAN SINGH SMILING AT SOMEONE IN THE AUDIENCE', 'prefer not to say', ' My earliest memory is of the angry bullies who lived next door to us. ... anger he channeled toward political figures was rooted in something other than that', 'prathipa patel', 'Mahatma speaking in front of a crowd which I saw as a video on Tv', 'Abdul Kalam', 'no idea', 'election', 'subash chandra bos', 'manmohan singh (Indian prime minister) speaking in his 3rd ever conference in the last 2 decades.', 'Abdul Kalam', 'Flag hoisting', 'MGR ', 'a meeting', 'Arvind k in rajya sabha', 'Prime Minister taking action', 'NIL', 'Moi', 'Sheikh Hasina', 'C P Muhammed MLA speaking in Pattambi', 'Conference', 'bharatiya janata party', 'J Jayalalitha meet in my hometown.', 'none', 'sonia', 'ntng', 'usa', 'Waste', 'M.G.R', 'LITTLE', 'no idea', 'Atal Bihari Vajpayee speaking at BJP party office', "Rahul's campaign speech ", 'obema', 'ana hazare speking of black mone', 'aravind kejrival', 'A P J Abdul Kalam', 'Karunanithy', 'Mr.karunanidhi, Former Chief Minister of Tamil Nadu speaking in a public meeting', 'I AM NOT INTRESTED IN THAT', 'attending a governors speech at age 8 or 9', 'Smith\r\n', 'abdul kalam', '', 'Kejariwal becomes chief minister of Delhi', 'SASI THAROOR', 'Lyndon Johnson on the TV news', 'senate', 'Nothing', 'i hate politics', '', 'Aravind Kejriwal speech at Parliament', 'Recent activity about devayani in the visa case', 'mick ', 'english', 'jayalalitha ruling', '8', 'None', 'I have no idea.', 'kejriwal became 
mp.', 'Aravind kejarival won election at delhi', 'bjp', 'Regean being in trouble for Iran-Contra', 'london', 'My earliest memory of a political figure is vijayakandh spoke at Assembly.', 'Good management', 'hard to collect, but the one speach given by Atal Bihari was unforgetable.', 'yes\r\n', '', 'dishonesty', 'nothing', 'politics is simply like a useless material which is handed over to old guy to damage even more.', 'LIBERAL', 'chicago park, never', 'benezir bhutto killed during campaign', 'ldf', 'Cris Daly of the SF Board of Supervisors cursing someone out.', 'memorial services', 'america', '', '4th grade', 'aam aadmi', 'none', 'Indian primeminister Atal Bihai Vajpayee speech', 'I DONT LIKE POLITICS', 'speaking to parliment during question time', 'Mahatha ghandhi biography', 'Learning that James Bulger was the Prime Minister of New Zealand in 1992.', 'As a little kid, wondering why Jean Chretien had a sideways mouth and talked sideways, after seeing him on tv.', 'Abraham Lincon', 'Pres. Johnson on TV when I was a young child, discussing Vietnam. . ', 'The past leader of Malaysia, Tun Doktor Mahathir.', 'interview', 'My stepdad being upset over RFK being killed,', 'Medical camp of TDP on NTR annual day.', "I was sitting in the waiting room of a VA Hospital. I must have been 5 or younger. I just remember some politician talking about China. I'm not sure, but I think it was communist conversion. 
Really vague memory.", 'not interested', 'hema', 'Nothing', 'Manmohan singh as finance minister', 'english', 'nil', 'n/a', 'i dont', 'Chandra babu speaking in assembly', '0', '1978', 'AAP speech', 'I remember the italian President, Oscar Luigi Scalfaro', '9', 'adal bihari vaajpaayi is prime minister', '7', 'jayalalitha', 'nothing', 'Seeing IKE on tv as a small boy', 'martin ', 'nothing', 'America, Work', 'None', "I don't remember who it was exactly, but the politician who killed himself live on TV.", 'Central election party AAP', 'vajpai', 'Eating bread', "Don't care", 'liberal ', 'nice', 'reagon', '48', 'High school', '1994 elections', 'CKINTON', 'The first president', "Don't have any not really into watching or reading anything political.", 'None.', 'when i was 14', 'washington', 'Bill getting his "surprise" ', 'today and yesterday\r\n', 'whitehouse', "Don't know", 'kalainger', 'infuencial', 'na', "Clint's big scandal", 'na', 'john', 'zulfiqar bhutto', 'Our leader giving lecture instead of doing work.', 'Atal bihari vajpayee speech at parliament', 'Raegan Breaking down wall.', 'When I was in 2nd grade.', 'center', 'JEYA LALITHA ', 'our C.M is introducing lot of welfare to people', 'BJP winning in India', 'mani', 'manmohanshing', 'No idea', 'Hitler', 'karunanithi', 'regarding political instability', 'California regarding the financial cresis', 'watching the political debates in tenth grade', 'Kalaignar many speeches in political', 'miss.j. jeyalalitha provided laptop to school and college students', 'the president', 'my sister', 'Chandra Babu naidu winning the electoins', 'nope', 'waste. I hate democracy', 'an Italian politician', "congress leader's murder", 'arvind kejriwal', 'Attal Bihari vajpeyi visited the Tsunami affected areas.']
def score_for_user(user_id, initial_fraud_probability=.1):
    """Track the probability that a user is fraudulent, answer by answer.

    Starting from ``initial_fraud_probability``, the user's answers are
    visited in ``question_id`` order. For each one the current fraud and
    non-fraud probabilities are multiplied by the likelihood of the observed
    duration under the matching per-question density (module-level
    ``fraudy_timings`` / ``legit_timings``) and then renormalised to sum to 1.

    Args:
        user_id: Value to match against ``answers.user_id``.
        initial_fraud_probability: Prior probability of fraud before any
            answer is considered.

    Returns:
        A list with one dict per answer, in question order, holding
        ``question_id``, ``duration``, ``answer`` and the running ``fraud_p``
        and ``nonfraud_p`` after that answer.
    """
    fraud_probability = initial_fraud_probability
    nonfraud_probability = 1 - initial_fraud_probability
    # `DataFrame.sort(columns=...)` no longer exists in pandas; `sort_values`
    # is its replacement.
    the_data = answers.loc[answers.user_id == user_id,
                           ['question_id', 'answer', 'duration']].sort_values('question_id')
    partial_results = []
    for _, row in the_data.iterrows():
        (question_id, answer, duration) = row
        fraud_joint = fraud_probability * fraudy_timings[question_id].evaluate(duration)[0]
        nonfraud_joint = nonfraud_probability * legit_timings[question_id].evaluate(duration)[0]
        normalizer = nonfraud_joint + fraud_joint
        # If both densities evaluate to zero the answer carries no evidence.
        # Keep the previous probabilities rather than dividing by zero and
        # turning every later value into NaN.
        if normalizer > 0:
            fraud_probability = fraud_joint / normalizer
            nonfraud_probability = nonfraud_joint / normalizer
        partial_results.append({'question_id': question_id,
                                'duration': duration,
                                'answer': answer,
                                'fraud_p': fraud_probability,
                                'nonfraud_p': nonfraud_probability})
    return partial_results
# Show the running fraud / non-fraud probabilities for a single user id.
score_for_user(87397087779)
[{'answer': 'DEFAULT', 'duration': 17.01549005508423, 'fraud_p': 0.078448550659764929, 'nonfraud_p': 0.92155144934023514, 'question_id': 1}, {'answer': 'know', 'duration': 16.06438899040222, 'fraud_p': 0.078327530808187787, 'nonfraud_p': 0.92167246919181234, 'question_id': 2}, {'answer': 'yes', 'duration': 7.856693983078003, 'fraud_p': 0.1601986332351793, 'nonfraud_p': 0.83980136676482064, 'question_id': 3}, {'answer': 'no', 'duration': 17.965824127197266, 'fraud_p': 0.18695987381608123, 'nonfraud_p': 0.81304012618391885, 'question_id': 4}, {'answer': 'ethiopia', 'duration': 11.613417863845825, 'fraud_p': 0.14424050342554573, 'nonfraud_p': 0.85575949657445438, 'question_id': 5}, {'answer': 'immigration:1 transportation:1 healthcare:3 education:3 warfare:2', 'duration': 37.87310600280762, 'fraud_p': 0.13353764904151283, 'nonfraud_p': 0.86646235095848712, 'question_id': 6}, {'answer': 'radish:2 lettuce:3 eggplant:6 tomato:6 aubergine:2 kiwi:8', 'duration': 38.65355896949768, 'fraud_p': 0.082402883940254051, 'nonfraud_p': 0.91759711605974603, 'question_id': 7}, {'answer': 'tiger', 'duration': 17.080639123916626, 'fraud_p': 0.060961446981524434, 'nonfraud_p': 0.93903855301847561, 'question_id': 8}, {'answer': '2 bad kids', 'duration': 16.7669038772583, 'fraud_p': 0.082194120577559093, 'nonfraud_p': 0.91780587942244085, 'question_id': 9}, {'answer': 'no', 'duration': 14.701430082321167, 'fraud_p': 0.091859498736522979, 'nonfraud_p': 0.90814050126347701, 'question_id': 10}, {'answer': 'Nabokov:1 Obama:4 Fidel Castro:1 Your favorite TV host:3 Babe Ruth:1', 'duration': 32.44860100746155, 'fraud_p': 0.062823282273206923, 'nonfraud_p': 0.93717671772679312, 'question_id': 11}]
# How the quickest answers (under 10 seconds) are distributed once rounded to
# whole seconds.
answers.loc[answers.duration < 10, 'duration'].round().value_counts()
8 322 7 317 6 305 9 277 5 268 4 172 10 129 3 78 1 45 2 37 0 4 dtype: int64
# Score every user still present in `answers`, then plot the distribution of
# each user's final fraud probability (the value after their last answer).
all_scores = [score_for_user(user) for user in answers.user_id.unique()]
plt.hist([user_scores[-1]['fraud_p'] for user_scores in all_scores])
plt.xlabel('Final probability of fraud')
plt.ylabel('Number of people')
<matplotlib.text.Text at 0x7f2b58c9d810>
#Looking at some example
from pprint import pprint
def get_instance(collection, lower, upper, function):
    """Pick a random element whose score lies in ``[lower, upper]``.

    Args:
        collection: Iterable of candidate items.
        lower: Inclusive lower bound on ``function(item)``.
        upper: Inclusive upper bound on ``function(item)``.
        function: Callable that maps an item to a comparable score.

    Returns:
        One uniformly chosen matching item, or None when nothing matches.
    """
    # Import the stdlib module locally. No `import random` is visible in this
    # file, and under `%pylab` the bare name `random` refers to `numpy.random`,
    # whose `randint(low, high)` excludes `high`. With it,
    # `randint(0, len(g) - 1)` can never return the last candidate and raises
    # when there is exactly one.
    import random

    g = [c for c in collection if lower <= function(c) <= upper]
    if not g:
        return None
    return random.choice(g)
def get_fraud_p(c):
    """Return the 'fraud_p' value stored in the last entry of *c*."""
    return c[-1]['fraud_p']


print('Good surveys')
# Short labels for the 11 survey questions, keyed by question id.
short_questions = {1: "First name",
                   2: "People with your name honest?",
                   3: "Earliest political memory?",
                   4: "Men or women need more exercise?",
                   5: "What country do you live in?",
                   6: "Allocating money to different departments",
                   7: "How sad would you be if various plants went away?",
                   8: "What animal would you not want to leave with a sheep?",
                   9: "10 kids, 1 evil kid, 0 kids, or 2 bad kids?",
                   10: "Do you have any idea what the word 'Telluride' means?",
                   11: "Who would your parents like?"}


def to_table(answers):
    """Render one user's scored answers as an HTML table.

    Args:
        answers: Iterable of dicts with ``question_id``, ``answer``,
            ``duration`` and ``fraud_p`` keys, as produced by
            ``score_for_user``.

    Returns:
        A string holding a ``<table>`` with a header row followed by one row
        per answer: short question label, answer text, duration to three
        decimals and fraud probability to five decimals.
    """
    from html import escape

    # A proper header row with one <th> per column. The original used <th> as
    # if it were a row and listed three headings for four columns.
    header = ('<tr><th>Question</th><th>Answer</th>'
              '<th>Duration</th><th>Fraud Probability</th></tr>')
    # Answers are free text typed by survey respondents, so escape them before
    # embedding them in markup.
    rows = ['<tr><td>{}</td><td>{}</td><td>{:0.3f}</td><td>{:0.5f}</td></tr>'
            .format(short_questions[r['question_id']], escape(str(r['answer'])),
                    r['duration'], r['fraud_p'])
            for r in answers]
    return ('<table>\n'
            '{}\n'
            '\n{}\n'
            '</table>'.format(header, '\n'.join(rows)))
# Print one randomly chosen low-fraud survey (final probability 0 to 0.1) and
# one high-fraud survey (0.9 to 1) as HTML tables.
for heading, low, high in (('Good table\n', .0, .1), ('Bad table\n', .9, 1)):
    print(heading, to_table(get_instance(all_scores, low, high, get_fraud_p)))
Good surveys Good table <table> <th><td>Answer</td><td>Duration</td><td>Fraud Probability</td></th> <tr><td>First name</td><td>Satia</td><td>6.209</td><td>0.11934</td></tr> <tr><td>People with your name honest?</td><td>As much as anyone with any other name.</td><td>22.490</td><td>0.09134</td></tr> <tr><td>Earliest political memory?</td><td>Nixon being impeached.</td><td>27.930</td><td>0.07606</td></tr> <tr><td>Men or women need more exercise?</td><td>I think they need the same amount.</td><td>20.970</td><td>0.07745</td></tr> <tr><td>What country do you live in?</td><td>US</td><td>5.400</td><td>0.07367</td></tr> <tr><td>Allocating money to different departments</td><td>immigration:2 healthcare:3 education:3 warfare:0 transportation:2</td><td>31.218</td><td>0.07538</td></tr> <tr><td>How saw would you be if various plants went away?</td><td>radish:3 lettuce:10 eggplant:6 tomato:10 aubergine:6 kiwi:3</td><td>27.000</td><td>0.07947</td></tr> <tr><td>What animal would you not want to leave with a sheep?</td><td>A wolf</td><td>13.068</td><td>0.09462</td></tr> <tr><td>10 kids, 1 evil kid, 0 kids, or 2 bad kids?</td><td>I already have 3 children but I definitely don't want anymore.</td><td>22.825</td><td>0.09188</td></tr> <tr><td>Do you have any idea what the word 'Telluride' means?</td><td>It's something doing with science and a metal, I think, if I remember correctly.</td><td>39.526</td><td>0.03235</td></tr> <tr><td>Who would your parents like?</td><td>Nabokov:2 Obama:4 Fidel Castro:1 Your favorite TV host:1 Babe Ruth:1</td><td>26.100</td><td>0.02641</td></tr> </table> Bad table <table> <th><td>Answer</td><td>Duration</td><td>Fraud Probability</td></th> <tr><td>First name</td><td>Natalie</td><td>6.862</td><td>0.11502</td></tr> <tr><td>People with your name honest?</td><td>Yes, they are.</td><td>8.962</td><td>0.17751</td></tr> <tr><td>Earliest political memory?</td><td>President Ronald Reagan speaking on television.</td><td>20.161</td><td>0.19511</td></tr> <tr><td>Men or 
women need more exercise?</td><td>Men over 50 need about the same amount of exercise as women over 45.</td><td>17.454</td><td>0.23138</td></tr> <tr><td>What country do you live in?</td><td>USA</td><td>4.092</td><td>0.25967</td></tr> <tr><td>Allocating money to different departments</td><td>immigration:1 healthcare:3 education:5 warfare:0 transportation:2</td><td>13.936</td><td>0.41175</td></tr> <tr><td>How saw would you be if various plants went away?</td><td>radish:9 lettuce:10 eggplant:6 tomato:7 aubergine:6 kiwi:5</td><td>18.782</td><td>0.51566</td></tr> <tr><td>What animal would you not want to leave with a sheep?</td><td>A wolf</td><td>5.541</td><td>0.68619</td></tr> <tr><td>10 kids, 1 evil kid, 0 kids, or 2 bad kids?</td><td>0 kids</td><td>5.290</td><td>0.80988</td></tr> <tr><td>Do you have any idea what the word 'Telluride' means?</td><td>No, never heard of it</td><td>6.407</td><td>0.88554</td></tr> <tr><td>Who would your parents like?</td><td>Nabokov:1 Obama:1 Fidel Castro:1 Your favorite TV host:2 Babe Ruth:3</td><td>15.959</td><td>0.92446</td></tr> </table>
# Export both fitted densities, sampled on a fixed grid, as JSON.
pdf_data = {'range': [0, 120],
            'step_size': .25}
pdf_values = {}
# Sample points: from range[0] up to (not including) range[1] in step_size
# increments.
x = np.arange(pdf_data['range'][0], pdf_data['range'][1], pdf_data['step_size'])
for i in range(1, 12):
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    # `.tolist()` converts the NumPy arrays to plain Python floats for JSON.
    pdf_values[i] = {'legit': legit_kde.evaluate(x).tolist(),
                     'fraudy': fraud_kde.evaluate(x).tolist()}
pdf_data['values'] = pdf_values
# The context manager flushes and closes the file; the original left the
# handle open, so the output was only complete once the object was collected.
with open('fraud_model_pdf.json', 'wt') as pdf_file:
    json.dump(pdf_data, pdf_file)