In [1]:
import pandas as pd
from datetime import datetime
import scipy.stats as sts
In [2]:
def parse(yr, mon):
    date = datetime(year=int(yr), day=2, month=int(mon))
    return date
In [3]:
url='http://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices'

data = pd.read_table(url, sep=' ', header=0, skiprows=0, parse_dates = [['YR', 'MON']], skipinitialspace=True, index_col=0, date_parser=parse)                     
print data.describe
print '\n'
print data.index
<bound method DataFrame.describe of <class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 377 entries, 1982-01-02 00:00:00 to 2013-05-02 00:00:00
Data columns (total 9 columns):
NINO1+2        377  non-null values
ANOM           377  non-null values
NINO3          377  non-null values
ANOM.1         377  non-null values
NINO4          377  non-null values
ANOM.2         377  non-null values
NINO3.4        377  non-null values
ANOM.3         377  non-null values
Unnamed: 10    0  non-null values
dtypes: float64(9)>


<class 'pandas.tseries.index.DatetimeIndex'>
[1982-01-02 00:00:00, ..., 2013-05-02 00:00:00]
Length: 377, Freq: None, Timezone: None

Group by Year

In [4]:
grouped = data.groupby(lambda x: x.year)
grouped
Out[4]:
<pandas.core.groupby.DataFrameGroupBy at 0x1095e4e10>

Calculate std mean anomaly

(it's wrong i need to remove the 2013 from the calculation because in 2013 i don't have all the 12 months)

In [5]:
zscore = lambda x: (x - x.mean()) / x.std()
transformed = grouped.transform(zscore)
print transformed['ANOM.3']
YR_MON
1982-01-02   -0.986922
1982-02-02   -1.179216
1982-03-02   -1.179216
1982-04-02   -0.885119
1982-05-02   -0.376105
1982-06-02    0.087664
1982-07-02   -0.161188
1982-08-02    0.098975
1982-09-02    0.415695
1982-10-02    1.049134
1982-11-02    1.286674
1982-12-02    1.829622
1983-01-02    1.715072
1983-02-02    1.428598
1983-03-02    0.976272
...
2012-03-02   -0.999284
2012-04-02   -0.663736
2012-05-02   -0.063283
2012-06-02    0.572491
2012-07-02    0.961020
2012-08-02    1.314227
2012-09-02    0.925699
2012-10-02    0.537170
2012-11-02    0.660793
2012-12-02   -0.169245
2013-01-02   -1.001483
2013-02-02   -0.924445
2013-03-02    0.462223
2013-04-02    1.386668
2013-05-02    0.077037
Name: ANOM.3, Length: 377, dtype: float64

This is what i'm trying to achieve

In [6]:
npdata = np.genfromtxt(url, skip_header=1)
unique_enso_year = [int(value) for value in set(npdata[:, 0])]
nin34 = np.zeros(len(unique_enso_year))
for ind, year in enumerate(unique_enso_year):
    indexes = np.flatnonzero(npdata[:, 0]==year)
    if len(indexes) == 12:
        nin34[ind] = np.mean(npdata[indexes, 9])
    else:
        nin34[ind] = np.nan

nin34x = (nin34 - sts.nanmean(nin34)) / sts.nanstd(nin34)
nin34y = np.vstack((unique_enso_year, nin34x))
nin34y
Out[6]:
array([[  1.98200000e+03,   1.98300000e+03,   1.98400000e+03,
          1.98500000e+03,   1.98600000e+03,   1.98700000e+03,
          1.98800000e+03,   1.98900000e+03,   1.99000000e+03,
          1.99100000e+03,   1.99200000e+03,   1.99300000e+03,
          1.99400000e+03,   1.99500000e+03,   1.99600000e+03,
          1.99700000e+03,   1.99800000e+03,   1.99900000e+03,
          2.00000000e+03,   2.00100000e+03,   2.00200000e+03,
          2.00300000e+03,   2.00400000e+03,   2.00500000e+03,
          2.00600000e+03,   2.00700000e+03,   2.00800000e+03,
          2.00900000e+03,   2.01000000e+03,   2.01100000e+03,
          2.01200000e+03,   2.01300000e+03],
       [  1.52721525e+00,   7.79877334e-01,  -9.70046905e-01,
         -1.01299736e+00,   1.93296839e-01,   1.97880860e+00,
         -1.60325932e+00,  -1.17375478e+00,   2.44837384e-01,
          9.67632179e-01,   9.77449426e-01,   5.68806528e-01,
          6.17892762e-01,  -2.70568072e-01,  -6.84119592e-01,
          1.85732017e+00,  -1.90802942e-01,  -1.71861197e+00,
         -1.28788027e+00,  -3.49106046e-01,   1.10630079e+00,
          4.66952593e-01,   5.85986710e-01,   1.96978306e-01,
          2.73061969e-01,  -7.51613164e-01,  -1.06085644e+00,
          5.73715152e-01,  -6.42396294e-01,  -1.20075220e+00,
          6.33370760e-04,              nan]])

The same result with pandas is given by :

In [7]:
attempt_cheating = (grouped.mean()['ANOM.3'][:-1] - sts.nanmean(grouped.mean()['ANOM.3'][:-1])) / sts.nanstd(grouped.mean()['ANOM.3'][:-1])
In [8]:
attempt_cheating - nin34x[:-1]
Out[8]:
1982    0
1983    0
1984    0
1985    0
1986    0
1987    0
1988    0
1989    0
1990    0
1991    0
1992    0
1993    0
1994    0
1995    0
1996    0
1997    0
1998    0
1999    0
2000    0
2001    0
2002    0
2003    0
2004    0
2005    0
2006    0
2007    0
2008    0
2009    0
2010    0
2011    0
2012    0
Name: ANOM.3, dtype: float64

This query get rid of the 2013 (removing the values from the group with less than 12 elements (months))

but it loose the time-stamp indexes .. so i can't do enymore the group by year :(

In [9]:
grouped2 = data.groupby(lambda x: x.year).apply(lambda sdf: sdf if len(sdf) > 11 else None).reset_index(drop=True)
print grouped2
print '\n'
print grouped2.index
<class 'pandas.core.frame.DataFrame'>
Int64Index: 372 entries, 0 to 371
Data columns (total 9 columns):
NINO1+2        372  non-null values
ANOM           372  non-null values
NINO3          372  non-null values
ANOM.1         372  non-null values
NINO4          372  non-null values
ANOM.2         372  non-null values
NINO3.4        372  non-null values
ANOM.3         372  non-null values
Unnamed: 10    0  non-null values
dtypes: float64(9)


Int64Index([0  , 1  , 2  , 3  , 4  , 5  , 6  , 7  , 8  , 9  , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371], dtype=int64)

In [10]:
(grouped.mean()['ANOM.3'][:-1] - sts.nanmean(grouped.mean()['ANOM.3'][:-1])) / sts.nanstd(grouped.mean()['ANOM.3'][:-1])
Out[10]:
1982    1.527215
1983    0.779877
1984   -0.970047
1985   -1.012997
1986    0.193297
1987    1.978809
1988   -1.603259
1989   -1.173755
1990    0.244837
1991    0.967632
1992    0.977449
1993    0.568807
1994    0.617893
1995   -0.270568
1996   -0.684120
1997    1.857320
1998   -0.190803
1999   -1.718612
2000   -1.287880
2001   -0.349106
2002    1.106301
2003    0.466953
2004    0.585987
2005    0.196978
2006    0.273062
2007   -0.751613
2008   -1.060856
2009    0.573715
2010   -0.642396
2011   -1.200752
2012    0.000633
Name: ANOM.3, dtype: float64
In [10]:
gist = !jist -p test.ipynb
nbviewer = gist[0].replace('https://gist.github.com/','http://nbviewer.ipython.org/')
print nbviewer
In [10]: