%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# The following is optional: it sets the plotting style
import seaborn; seaborn.set()

# A plain Python list used for the indexing and slicing examples below.
L = [1, 2, 3, 4, 5]

# Zero-based indexing: the first element is at index 0
print(L[0], L[1])

# Negative indices count from the end: -1 is the last element
print(L[-1], L[-2])

# Slicing: the start index is included, the stop index is excluded
L[0:3]

# A start of 0 can be left out
L[:3]

# Slicing with a step size: every second element from index 0 up to 5
L[0:5:2]

# Reversing with a negative step size
L[::-1]

# Lists can hold values of different types
L2 = [1, 'two', 3.14]

# Adding two lists concatenates them into a new list
L + L2

import math

# Make a large list of theta values (one million, spaced 0.01 apart)
theta = [0.01 * i for i in range(1000000)]
# Take the sine of each value with a Python-level loop over the list
sin_theta = [math.sin(t) for t in theta]
sin_theta[:10]

# Time the pure-Python version (IPython magic: only valid under IPython/Jupyter)
%timeit [math.sin(t) for t in theta]

import numpy as np

# The same theta values as a NumPy array (np.arange(1E6) yields 0.0 .. 999999.0)
theta = 0.01 * np.arange(1E6)

# np.sin is applied to the whole array in one call, with no Python-level loop
sin_theta = np.sin(theta)
sin_theta[:10]

# Time the vectorized version (IPython magic: only valid under IPython/Jupyter)
%timeit np.sin(theta)

# from a list
np.array([1, 2, 3, 4])

# range of numbers, like Python's range(), but non-integer steps are allowed
np.arange(0, 10, 0.5)

# a fixed number of evenly spaced values between two limits (both included)
np.linspace(0, 10, 5)

# array of zeros
np.zeros(10)

# array of ones
np.ones(10)

# array of random values, uniformly distributed over [0, 1)
np.random.rand(10)

# define some arrays: x holds the integers 0..4, y holds 5 random floats in [0, 1)
x = np.arange(5)
y = np.random.random(5)

# addition - add 1 to each element
x + 1

# multiplication - multiply each element by 2
y * 2

# two arrays: everything is element-wise
x / y

# exponentiation: e raised to each element of x
np.exp(x)

# trigonometric functions, applied element by element
np.sin(x)

# combining operations: the whole expression is still evaluated element-wise
np.cos(x) + np.sin(2 * np.pi * (x - y))

# Show the array that the indexing examples below operate on
x

# Individual elements by zero-based position
x[0], x[1]

# Slice: the first three elements
x[:3]

# Every second element
x[::2]

# All elements in reverse order
x[::-1]

# A 4x5 two-dimensional array holding the integers 0..19 in row order
M = np.arange(20).reshape(4, 5)
M

# Single element: row 1, column 2
M[1, 2]

# Sub-array: the first two rows and the first two columns
M[:2, :2]

# All rows, columns 1 and 2
M[:, 1:3]

M

# A comparison produces a boolean array of the same shape (a mask)
M < 8

# Assigning through a mask changes only the elements where the mask is True
M[M < 8] = 0
M

# In-place operators work through a mask as well: double every entry equal to 12
M[M == 12] *= 2
M

# Replace every even entry (including the zeros set above) with 999
M[M % 2 == 0] = 999
M

# A Series is a one-dimensional array of values paired with an index
s = pd.Series([0.1, 0.2, 0.3, 0.4])

# No index was given, so pandas supplies a default integer index
s.index

# Look up the value stored under index label 0
s[0]

# The index can also be given explicitly, here as string labels
s2 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
s2

# Look up a value by its label
s2['c']

# A Series can be built from a dict: keys become the index, values the data
pop_dict = {'California': 38332521,
            'Texas': 26448193,
            'New York': 19651127,
            'Florida': 19552860,
            'Illinois': 12882135}
populations = pd.Series(pop_dict)
populations

# Dict-style lookup by label
populations['California']

# Slicing by label: unlike positional slices, the end label is included
populations['California':'Illinois']

# Build a small DataFrame of per-state statistics from a dict of columns.
data = {'state': ['California', 'Texas', 'New York', 'Florida', 'Illinois'],
        'population': [38332521, 26448193, 19651127, 19552860, 12882135],
        'area':[423967, 695662, 141297, 170312, 149995]}
states = pd.DataFrame(data)
states

# Use the state name as the row index so rows can be looked up by name
states = states.set_index('state')
states

# Column access by name returns a Series
states['area']

# Row access by index label
states.loc['California']

# Derived column: population per unit of area, computed element-wise
states['density'] = states['population'] / states['area']
states

# Boolean mask: keep only the rows whose density exceeds 100
states[states['density'] > 100]

# The three densest states. Sorting by a column's values is done with
# sort_values(); sort_index() orders by the index labels, and its old
# `by=` keyword is no longer accepted by pandas.
states.sort_values(by='density', ascending=False).head(3)

# Summary statistics for every numeric column
states.describe()

# One-time data setup, left commented out. Lines starting with "!" are
# IPython shell escapes and only work under IPython/Jupyter.
# Download the baby-names archive:
# !curl -O http://www.ssa.gov/oact/babynames/names.zip

# Unpack the archive under data/names/:
# !mkdir -p data/names
# !mv names.zip data/names/
# !cd data/names/ && unzip names.zip

# List the unpacked files
!ls data/names

# Peek at the first lines of one file to see its layout
!head data/names/yob1880.txt

# First attempt: with no column names given, read_csv uses the first line of
# the file as the header row.
names1880 = pd.read_csv('data/names/yob1880.txt')
names1880.head()

# Second attempt: supply the column names explicitly, so every line of the
# file is read as data.
names1880 = pd.read_csv('data/names/yob1880.txt',
                        names=['name', 'gender', 'births'])
names1880.head()

# Split the rows by the value of the gender column using boolean masks
males = names1880[names1880.gender == 'M']
females = names1880[names1880.gender == 'F']

# Total births recorded for each gender
males.births.sum(), females.births.sum()

# Group the 1880 records by gender. The groupby object is lazy: nothing is
# computed until an aggregation is requested.
grouped = names1880.groupby('gender')
grouped

# Sum per gender. numeric_only=True restricts the aggregation to numeric
# columns; without it the string `name` column is aggregated as well
# (string concatenation for sum).
grouped.sum(numeric_only=True)

# Number of rows in each group
grouped.size()

# Mean per gender. numeric_only=True is required here: averaging the string
# `name` column raises TypeError on recent pandas versions.
grouped.mean(numeric_only=True)

# Summary statistics per gender
grouped.describe()

def load_year(year, data_dir='data/names'):
    """Load one year of baby-name records into a DataFrame.

    Parameters
    ----------
    year : int
        Year to load; the records are read from ``yob<year>.txt``.
    data_dir : str, optional
        Directory that holds the ``yob<year>.txt`` files. Defaults to
        ``'data/names'``, the location used throughout this file.

    Returns
    -------
    pandas.DataFrame
        The columns ``name``, ``gender`` and ``births`` read from the file,
        plus a ``year`` column set to ``year`` on every row.
    """
    path = '{0}/yob{1}.txt'.format(data_dir, year)
    # The column names are supplied explicitly so that the first line of the
    # file is read as data rather than as a header.
    data = pd.read_csv(path, names=['name', 'gender', 'births'])
    # Record the year on every row so frames for several years can be stacked.
    data['year'] = year
    return data

# Stack the per-year frames for 1880 through 2013 into one DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise every
# year's frame would contribute its own 0-based labels and the combined
# index would contain duplicates.
names = pd.concat([load_year(year) for year in range(1880, 2014)],
                  ignore_index=True)
names.head()

# Total number of births recorded in each year
births = names.groupby('year').births.sum()
births.head()

births.plot();

# Number of rows recorded in each year
names.groupby('year').births.count().plot();

def add_frequency(group):
    """Return a copy of ``group`` with an added ``birth_freq`` column.

    ``birth_freq`` is each row's ``births`` divided by the sum of ``births``
    over the whole of ``group``, so the column sums to 1 within the group.

    The frame passed in is not modified. ``groupby(...).apply`` hands this
    function the group it is iterating over, and pandas advises against
    mutating that object inside the applied function.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``group`` plus ``birth_freq``.
    """
    return group.assign(birth_freq=group.births / group.births.sum())

# Add the per-(year, gender) birth frequency to every row.
# group_keys=False keeps the result on the original row index instead of
# prepending the group keys as extra index levels; `year` and `gender`
# stay ordinary columns only.
names = (names.groupby(['year', 'gender'], group_keys=False)
              .apply(add_frequency))
names.head()

# Split by gender. The gender column is coded 'M' / 'F', so the filter for
# women must match 'F'; filtering on 'W' selects no rows.
men = names[names.gender == 'M']
women = names[names.gender == 'F']

# Total births per year (rows) and gender (columns).
# The aggregation is named by the string 'sum': passing the Python builtin
# `sum` is deprecated in recent pandas versions.
births = names.pivot_table('births',
                           index='year', columns='gender',
                           aggfunc='sum')
births.head()

births.plot(title='Total Births');

# Spellings that are treated as the same name in this analysis
names_to_check = ['Allison', 'Alison']

# filter on just the names we're interested in
births = names[names.name.isin(names_to_check)]

# pivot table to get year vs. gender.
# aggfunc='sum' adds the births of both spellings together. The default
# aggregation (mean) would average them, which understates a gender's count
# in any year where both spellings occur for it.
births = births.pivot_table('births', index='year', columns='gender',
                            aggfunc='sum')

# fill all NaNs with zeros (a year/gender combination with no records)
births = births.fillna(0)

# normalize each row so the gender columns sum to 1 within a year
births = births.div(births.sum(axis=1), axis=0)

births.plot(title='Fraction of babies named Allison');

# 5-year moving average. The top-level pd.rolling_mean() function no longer
# exists in pandas; the rolling-window method is its replacement.
births.rolling(5).mean().plot(title="Allisons: 5-year moving average");