In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reload imported modules before each cell runs, so edits to the local
# modules used below (ndl, sim, zt) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook.
%matplotlib inline

# Display settings only: how many digits floats are shown with in plain
# output (2) and in pandas tables (3).
%precision 2
pd.set_option('display.precision', 3)

import ndl,sim
from zt import ztnbinom

Set up the compute cluster and initialize its environment. (Make sure every engine has the right versions of the files, especially sim.py!)

In [14]:

# NOTE(review): IPython.parallel was split out into the separate
# `ipyparallel` package in later IPython releases -- confirm which one the
# target environment provides.
from IPython.parallel import Client

# Connect to the cluster running under the IPython profile named 'home'.
rc = Client(profile='home')
# Direct view over the engines. It is not referenced again in the visible
# cells.
dview = rc.direct_view()
# Presumably makes calls through this view wait for their results --
# confirm against the IPython.parallel documentation.
dview.block = True
# Load-balanced view; this is what gets passed to sim.experiment() below.
lview = rc.load_balanced_view()
lview.block = True
# Show the ids of the connected engines as a sanity check.
rc.ids

Out[14]:

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [15]:

%%px

import sys
sys.path = ['/home1/malouf/learning'] + sys.path
import sim
#from sim import Simulation

Create data that's distributed like the corpus counts in Ramscar et al.'s PNAS paper (see the Input Modeling notebook for details).

In [4]:

def cues(N):
    """Sample one set size and build its cue list plus coded outcome.

    ``N`` is accepted but never read by the body.

    A cardinality is drawn with ``ztnbinom.rvs(3, .6)``.  The cue list is
    ``range(cardinality)`` followed by the label ``'exactly<cardinality>'``.

    Returns a two-element list ``[cue_list, codeFunc(cardinality)]``.

    NOTE(review): ``codeFunc`` is not defined in any visible cell, so on a
    fresh kernel this presumably raises NameError -- confirm where it is
    meant to come from.
    NOTE(review): the numeric cues here start at 0, whereas the frequency
    tables built in later cells number them from 1 -- confirm which is
    intended.
    """
    cardinality = ztnbinom.rvs(3, .6)
    cue_list = range(cardinality) + ['exactly%d' % cardinality]
    outcome = codeFunc(cardinality)
    return [cue_list, outcome]

In [5]:

# Draw 10000 set sizes from ztnbinom.rvs(3, .6) and tabulate them:
# data[k-1] ends up holding the number of draws equal to k.
ns = [ztnbinom.rvs(3, .6) for trial in xrange(10000)]
data = np.zeros(max(ns))
# Unbuffered in-place add, so repeated indices are each counted.
np.add.at(data, np.asarray(ns) - 1, 1)
data

Out[5]:

array([  3.31e+03,   2.57e+03,   1.77e+03,   1.08e+03,   6.19e+02,
         3.13e+02,   1.65e+02,   7.60e+01,   5.30e+01,   2.00e+01,
         7.00e+00,   3.00e+00,   2.00e+00,   1.00e+00,   1.00e+00])

In [18]:

# Turn the count array into a table indexed by set size (1..len(data)).
data = pd.DataFrame(
    data,
    columns=['Frequency'],
    index=range(1, len(data) + 1),
)
# Cues for set size n: the numbers 1..n followed by the label 'exactly<n>'.
data['Cues'] = [
    range(1, size + 1) + ['exactly%d' % size]
    for size in data.index
]
# Keep the set size as an ordinary column as well as the index.
data['Number'] = data.index
data

Out[18]:

	Frequency	Cues	Number
1	3313	[1, exactly1]	1
2	2570	[1, 2, exactly2]	2
3	1773	[1, 2, 3, exactly3]	3
4	1084	[1, 2, 3, 4, exactly4]	4
5	619	[1, 2, 3, 4, 5, exactly5]	5
6	313	[1, 2, 3, 4, 5, 6, exactly6]	6
7	165	[1, 2, 3, 4, 5, 6, 7, exactly7]	7
8	76	[1, 2, 3, 4, 5, 6, 7, 8, exactly8]	8
9	53	[1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9]	9
10	20	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10]	10
11	7	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11]	11
12	3	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl...	12
13	2	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex...	13
14	1	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	14
15	1	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	15

15 rows × 3 columns

In [19]:

%%time
# Run the simulation on the frequency table, dispatching work through the
# load-balanced cluster view; the result is kept in `r` for the next cell.
# NOTE(review): a later table display shows an 'Outcomes' column that no
# visible cell creates; presumably sim.experiment (or a deleted cell) adds it
# to `data` -- confirm.
r = sim.experiment(data, P=200, view=lview)

CPU times: user 43.2 s, sys: 12.6 s, total: 55.8 s
Wall time: 3min 56s

In [20]:

# Summarise the first experiment's results (no output was saved for this
# cell).
sim.all_results(r)

Now add a `background` cue to every row's cue list. Because it is present for every set size, it acts basically as an intercept.

In [8]:

# Prepend the always-present 'background' cue to every row's cue list.
# Rows that already start with 'background' are left unchanged, so re-running
# this cell does not stack the cue a second time.
# The loop variable is deliberately not called `cues`: in Python 2 a list
# comprehension leaks its variable into the notebook namespace, which would
# overwrite the cues() function defined in an earlier cell.
data['Cues'] = [
    cue_list if cue_list[:1] == ['background'] else ['background'] + cue_list
    for cue_list in data['Cues']
]
data

Out[8]:

	Frequency	Cues	Number	Outcomes
1	3313	[background, 1, exactly1]	1	notdu
2	2570	[background, 1, 2, exactly2]	2	du
3	1773	[background, 1, 2, 3, exactly3]	3	notdu
4	1084	[background, 1, 2, 3, 4, exactly4]	4	notdu
5	619	[background, 1, 2, 3, 4, 5, exactly5]	5	notdu
6	313	[background, 1, 2, 3, 4, 5, 6, exactly6]	6	notdu
7	165	[background, 1, 2, 3, 4, 5, 6, 7, exactly7]	7	notdu
8	76	[background, 1, 2, 3, 4, 5, 6, 7, 8, exactly8]	8	notdu
9	53	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9]	9	notdu
10	20	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ex...	10	notdu
11	7	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...	11	notdu
12	3	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...	12	notdu
13	2	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...	13	notdu
14	1	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...	14	notdu
15	1	[background, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...	15	notdu

15 rows × 4 columns

In [9]:

%%time
# Repeat the experiment now that every row's cue list includes 'background',
# and summarise the results in the same cell.
r2 = sim.experiment(data, P=200, view=lview)
sim.all_results(r2)

CPU times: user 47.1 s, sys: 11.5 s, total: 58.7 s
Wall time: 4min 12s

In [16]:

# Second data set: same construction as `data`, but the set sizes are drawn
# with ztnbinom.rvs(3, .45) instead of (3, .6).
ns = [ztnbinom.rvs(3, .45) for trial in xrange(10000)]
data2 = np.zeros(max(ns))
# data2[k-1] = number of draws equal to k (unbuffered add counts repeats).
np.add.at(data2, np.asarray(ns) - 1, 1)
data2 = pd.DataFrame(
    data2,
    columns=['Frequency'],
    index=range(1, len(data2) + 1),
)
# Cues for set size n: the numbers 1..n followed by the label 'exactly<n>'.
data2['Cues'] = [
    range(1, size + 1) + ['exactly%d' % size]
    for size in data2.index
]
data2['Number'] = data2.index
data2

Out[16]:

	Frequency	Cues	Number
1	1653	[1, exactly1]	1
2	1877	[1, 2, exactly2]	2
3	1637	[1, 2, 3, exactly3]	3
4	1363	[1, 2, 3, 4, exactly4]	4
5	1035	[1, 2, 3, 4, 5, exactly5]	5
6	793	[1, 2, 3, 4, 5, 6, exactly6]	6
7	559	[1, 2, 3, 4, 5, 6, 7, exactly7]	7
8	361	[1, 2, 3, 4, 5, 6, 7, 8, exactly8]	8
9	265	[1, 2, 3, 4, 5, 6, 7, 8, 9, exactly9]	9
10	149	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, exactly10]	10
11	108	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, exactly11]	11
12	79	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, exactl...	12
13	47	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ex...	13
14	36	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	14
15	14	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	15
16	10	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	16
17	5	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	17
18	5	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	18
19	0	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	19
20	3	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	20
21	1	[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...	21

21 rows × 3 columns

In [17]:

%%time
r3 = sim.experiment(data, P=200, view=lview)
sim.all_results(r3)

CPU times: user 43 s, sys: 12.6 s, total: 55.5 s
Wall time: 3min 59s

In [ ]: