import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
%precision 2
pd.set_option('display.precision', 3)
import ndl,sim
from zt import ztnbinom
Set up the compute cluster and initialize its environment. (Make sure the cluster has the right versions of all the files, especially sim.py!)
# Connect to the IPython.parallel controller registered under the 'home' profile.
from IPython.parallel import Client
rc = Client(profile='home')
# Direct view over the engines; block=True makes calls made through the view
# wait for their results instead of returning an async handle.
dview = rc.direct_view()
dview.block = True
# Load-balanced view (the one passed to sim.experiment further down), also blocking.
lview = rc.load_balanced_view()
lview.block = True
# Last expression in the cell: shows the ids of the engines the client knows about.
rc.ids
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
%%px
# This cell is executed on the cluster engines (via the %%px magic), not locally.
# Put the project directory first on the import path so it is searched first
# when `sim` is imported on each engine.
import sys
sys.path = ['/home1/malouf/learning'] + sys.path
import sim
#from sim import Simulation
Create data that's distributed like the corpus counts in Ramscar et al.'s PNAS paper (see the Input Modeling notebook for details).
def cues(N):
    """Draw one random trial: a cue list and its coded outcome.

    A cardinality ``card`` is sampled with ``ztnbinom.rvs(3, .6)``.  The cue
    list is the integers ``0 .. card-1`` followed by the string
    ``'exactly<card>'``.

    NOTE(review): ``N`` is accepted but never used; it is kept so any
    existing caller keeps working.
    NOTE(review): ``codeFunc`` is not defined in the visible part of this
    notebook; it must be in scope before this function is called.
    NOTE(review): the table-building cells use cues ``1 .. i`` whereas this
    function uses ``0 .. card-1`` -- confirm which numbering is intended.

    Returns:
        A two-element list ``[feats, codeFunc(card)]``.
    """
    card = ztnbinom.rvs(3, .6)
    # list(...) keeps the concatenation valid where range() is not a list
    # (Python 3); on Python 2 it is just an equivalent copy.
    feats = list(range(card)) + ['exactly%d' % card]
    return [feats, codeFunc(card)]
# Draw 10000 samples with ztnbinom.rvs(3, .6) and tally them:
# data[k-1] ends up holding the number of draws equal to k.
# range() (rather than xrange) keeps this cell runnable on Python 2 and 3.
ns = [ztnbinom.rvs(3, .6) for i in range(10000)]
data = np.zeros(max(ns))
for i in ns:
    # Assumes every draw is >= 1; a draw of 0 would be counted in the last
    # bin through the negative index data[-1].
    data[i - 1] += 1
data
array([ 3.31e+03, 2.57e+03, 1.77e+03, 1.08e+03, 6.19e+02, 3.13e+02, 1.65e+02, 7.60e+01, 5.30e+01, 2.00e+01, 7.00e+00, 3.00e+00, 2.00e+00, 1.00e+00, 1.00e+00])
# Turn the tally into a table indexed by number (1 .. len(data)).
data = pd.DataFrame(data, columns=['Frequency'], index=range(1, len(data) + 1))
# Cues for number i: every integer 1 .. i plus an 'exactly<i>' marker.
# list(...) keeps the concatenation valid on Python 3 as well as Python 2.
data['Cues'] = [list(range(1, i + 1)) + ['exactly%d' % i] for i in data.index]
# Keep the number itself as an ordinary column alongside the index.
data['Number'] = data.index
data
Frequency | Cues | Number | |
---|---|---|---|
1 | 3313 | [1, exactly1] | 1 |
2 | 2570 | [1, 2, exactly2] | 2 |
3 | 1773 | [1, 2, 3, exactly3] | 3 |
4 | 1084 | [1, 2, 3, 4, exactly4] | 4 |
5 | 619 | [1, 2, 3, 4, 5, exactly5] | 5 |
6 | 313 | [1, 2, 3, 4, 5, 6, exactly6] | 6 |
7 | 165 | [1, 2, 3, 4, 5, 6, 7, exactly7] | 7 |
8 | 76 | [1, 2, 3, 4, 5, 6, 7, 8, exactly8] | 8 |
9 | 53 | [1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] | 9 |
10 | 20 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10] | 10 |
11 | 7 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11] | 11 |
12 | 3 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl... | 12 |
13 | 2 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex... | 13 |
14 | 1 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 14 |
15 | 1 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 15 |
15 rows × 3 columns
%%time
# Run the experiment on the frequency table, handing over the load-balanced
# view (presumably so sim.experiment can farm work out to the engines).
# P=200 is passed through as-is; its meaning is defined in sim.py, not here.
r = sim.experiment(data, P=200, view=lview)
CPU times: user 43.2 s, sys: 12.6 s, total: 55.8 s Wall time: 3min 56s
# Hand the experiment results to sim.all_results; as the last expression in the
# cell, whatever it returns is what the notebook displays.
sim.all_results(r)
Now add a background feature (basically an intercept): a constant `background` cue prepended to every row's cue list.
# Prepend a constant 'background' cue to every row's cue list.
# NOTE: this prepends unconditionally, so re-running the cell adds another copy.
# The loop variable is deliberately not named `cues`: on Python 2 a list
# comprehension variable leaks into the enclosing namespace, which would
# overwrite the cues() function defined earlier in the notebook.
data['Cues'] = [['background'] + cue_list for cue_list in data['Cues']]
data
Frequency | Cues | Number | Outcomes | |
---|---|---|---|---|
1 | 3313 | [background, 1, exactly1] | 1 | notdu |
2 | 2570 | [background, 1, 2, exactly2] | 2 | du |
3 | 1773 | [background, 1, 2, 3, exactly3] | 3 | notdu |
4 | 1084 | [background, 1, 2, 3, 4, exactly4] | 4 | notdu |
5 | 619 | [background, 1, 2, 3, 4, 5, exactly5] | 5 | notdu |
6 | 313 | [background, 1, 2, 3, 4, 5, 6, exactly6] | 6 | notdu |
7 | 165 | [background, 1, 2, 3, 4, 5, 6, 7, exactly7] | 7 | notdu |
8 | 76 | [background, 1, 2, 3, 4, 5, 6, 7, 8, exactly8] | 8 | notdu |
9 | 53 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] | 9 | notdu |
10 | 20 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ex... | 10 | notdu |
11 | 7 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... | 11 | notdu |
12 | 3 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... | 12 | notdu |
13 | 2 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... | 13 | notdu |
14 | 1 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... | 14 | notdu |
15 | 1 | [background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11... | 15 | notdu |
15 rows × 4 columns
%%time
# Same call as the first run, now on the table whose cue lists start with
# the 'background' cue.
r2 = sim.experiment(data, P=200, view=lview)
# Pass the results to sim.all_results (its output format is defined in sim.py).
sim.all_results(r2)
CPU times: user 47.1 s, sys: 11.5 s, total: 58.7 s Wall time: 4min 12s
# Second table: same construction as `data`, but sampling with
# ztnbinom.rvs(3, .45) instead of (3, .6).
# range()/list(range()) keep this cell runnable on Python 2 and 3.
ns = [ztnbinom.rvs(3, .45) for i in range(10000)]
data2 = np.zeros(max(ns))
for i in ns:
    # Assumes every draw is >= 1; a draw of 0 would be counted in the last
    # bin through the negative index data2[-1].
    data2[i - 1] += 1
data2 = pd.DataFrame(data2, columns=['Frequency'], index=range(1, len(data2) + 1))
# Cues for number i: every integer 1 .. i plus an 'exactly<i>' marker.
data2['Cues'] = [list(range(1, i + 1)) + ['exactly%d' % i] for i in data2.index]
data2['Number'] = data2.index
data2
Frequency | Cues | Number | |
---|---|---|---|
1 | 1653 | [1, exactly1] | 1 |
2 | 1877 | [1, 2, exactly2] | 2 |
3 | 1637 | [1, 2, 3, exactly3] | 3 |
4 | 1363 | [1, 2, 3, 4, exactly4] | 4 |
5 | 1035 | [1, 2, 3, 4, 5, exactly5] | 5 |
6 | 793 | [1, 2, 3, 4, 5, 6, exactly6] | 6 |
7 | 559 | [1, 2, 3, 4, 5, 6, 7, exactly7] | 7 |
8 | 361 | [1, 2, 3, 4, 5, 6, 7, 8, exactly8] | 8 |
9 | 265 | [1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9] | 9 |
10 | 149 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10] | 10 |
11 | 108 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11] | 11 |
12 | 79 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl... | 12 |
13 | 47 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex... | 13 |
14 | 36 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 14 |
15 | 14 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 15 |
16 | 10 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 16 |
17 | 5 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 17 |
18 | 5 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 18 |
19 | 0 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 19 |
20 | 3 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 20 |
21 | 1 | [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | 21 |
21 rows × 3 columns
%%time
r3 = sim.experiment(data, P=200, view=lview)
sim.all_results(r3)
CPU times: user 43 s, sys: 12.6 s, total: 55.5 s Wall time: 3min 59s