In [1]:

```
import pandas as pd
import sys
```

In [2]:

```
print 'Python version ' + sys.version
print 'Pandas version: ' + pd.__version__
```

In [3]:

```
# Create a dataframe with dates as your index
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL']
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range('1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame(data, index=idx, columns=['Revenue'])
df1['State'] = States
# Create a second dataframe
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range('1/1/2013', periods=10, freq='MS')
df2 = pd.DataFrame(data2, index=idx2, columns=['Revenue'])
df2['State'] = States
```

In [4]:

```
# Combine dataframes
df = pd.concat([df1,df2])
df
```

Out[4]:

Note: Average and Standard Deviation are only valid for gaussian distributions.

In [5]:

```
# Method 1
# make a copy of original df
newdf = df.copy()
newdf['x-Mean'] = abs(newdf['Revenue'] - newdf['Revenue'].mean())
newdf['1.96*std'] = 1.96*newdf['Revenue'].std()
newdf['Outlier'] = abs(newdf['Revenue'] - newdf['Revenue'].mean()) > 1.96*newdf['Revenue'].std()
newdf
```

Out[5]:

In [6]:

```
# Method 2
# Group by item
# make a copy of original df
newdf = df.copy()
State = newdf.groupby('State')
newdf['Outlier'] = State.transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = State.transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = State.transform( lambda x: 1.96*x.std() )
newdf
```

Out[6]:

In [7]:

```
# Method 2
# Group by multiple items
# make a copy of original df
newdf = df.copy()
StateMonth = newdf.groupby(['State', lambda x: x.month])
newdf['Outlier'] = StateMonth.transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = StateMonth.transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = StateMonth.transform( lambda x: 1.96*x.std() )
newdf
```

Out[7]:

In [8]:

```
# Method 3
# Group by item
# make a copy of original df
newdf = df.copy()
State = newdf.groupby('State')
def s(group):
group['x-Mean'] = abs(group['Revenue'] - group['Revenue'].mean())
group['1.96*std'] = 1.96*group['Revenue'].std()
group['Outlier'] = abs(group['Revenue'] - group['Revenue'].mean()) > 1.96*group['Revenue'].std()
return group
Newdf2 = State.apply(s)
Newdf2
```

Out[8]:

In [9]:

```
# Method 3
# Group by multiple items
# make a copy of original df
newdf = df.copy()
StateMonth = newdf.groupby(['State', lambda x: x.month])
def s(group):
group['x-Mean'] = abs(group['Revenue'] - group['Revenue'].mean())
group['1.96*std'] = 1.96*group['Revenue'].std()
group['Outlier'] = abs(group['Revenue'] - group['Revenue'].mean()) > 1.96*group['Revenue'].std()
return group
Newdf2 = StateMonth.apply(s)
Newdf2
```

Out[9]:

Assumign a non gaussian distribution (if you plot it, it will not look like a normal distribution)

In [10]:

```
# make a copy of original df
newdf = df.copy()
State = newdf.groupby('State')
newdf['Lower'] = State['Revenue'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
newdf['Upper'] = State['Revenue'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
newdf['Outlier'] = (newdf['Revenue'] < newdf['Lower']) | (newdf['Revenue'] > newdf['Upper'])
newdf
```

Out[10]:

**Author:** David Rojas LLC