%matplotlib inline import matplotlib.pyplot as plt import numpy as np import pandas as pd # Following is optional: set plotting styles import seaborn; seaborn.set() L = [1, 2, 3, 4, 5] # Zero-based Indexing print(L[0], L[1]) # Indexing from the end print(L[-1], L[-2]) # Slicing L[0:3] # The 0 can be left-out L[:3] # Slicing by a step size L[0:5:2] # Reversing with a negative step size L[::-1] # Lists of multiple types L2 = [1, 'two', 3.14] # Adding lists together will append them: L + L2 import math # make a large list of theta values theta = [0.01 * i for i in range(1000000)] sin_theta = [math.sin(t) for t in theta] sin_theta[:10] %timeit [math.sin(t) for t in theta] import numpy as np theta = 0.01 * np.arange(1E6) sin_theta = np.sin(theta) sin_theta[:10] %timeit np.sin(theta) # from a list np.array([1, 2, 3, 4]) # range of numbers, like Python's range() np.arange(0, 10, 0.5) # range of numbers between two limits np.linspace(0, 10, 5) # array of zeros np.zeros(10) # array of ones np.ones(10) # array of random values np.random.rand(10) # define some arrays x = np.arange(5) y = np.random.random(5) # addition – add 1 to each x + 1 # multiplication – multiply each by 2 y * 2 # two arrays: everything is element-wise x / y # exponentiation np.exp(x) # trigonometric functions np.sin(x) # combining operations np.cos(x) + np.sin(2 * np.pi * (x - y)) x x[0], x[1] x[:3] x[::2] x[::-1] M = np.arange(20).reshape(4, 5) M M[1, 2] M[:2, :2] M[:, 1:3] M M < 8 M[M < 8] = 0 M M[M == 12] *= 2 M M[M % 2 == 0] = 999 M s = pd.Series([0.1, 0.2, 0.3, 0.4]) s.index s[0] s2 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd']) s2 s2['c'] pop_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135} populations = pd.Series(pop_dict) populations populations['California'] populations['California':'Illinois'] data = {'state': ['California', 'Texas', 'New York', 'Florida', 'Illinois'], 'population': [38332521, 26448193, 19651127, 19552860, 12882135], 'area':[423967, 695662, 141297, 170312, 149995]} states = pd.DataFrame(data) states states = states.set_index('state') states states['area'] states.loc['California'] states['density'] = states['population'] / states['area'] states states[states['density'] > 100] states.sort_index(by='density', ascending=False)[:3] states.describe() # !curl -O http://www.ssa.gov/oact/babynames/names.zip # !mkdir -p data/names # !mv names.zip data/names/ # !cd data/names/ && unzip names.zip !ls data/names !head data/names/yob1880.txt names1880 = pd.read_csv('data/names/yob1880.txt') names1880.head() names1880 = pd.read_csv('data/names/yob1880.txt', names=['name', 'gender', 'births']) names1880.head() males = names1880[names1880.gender == 'M'] females = names1880[names1880.gender == 'F'] males.births.sum(), females.births.sum() grouped = names1880.groupby('gender') grouped grouped.sum() grouped.size() grouped.mean() grouped.describe() def load_year(year): data = pd.read_csv('data/names/yob{0}.txt'.format(year), names=['name', 'gender', 'births']) data['year'] = year return data names = pd.concat([load_year(year) for year in range(1880, 2014)]) names.head() births = names.groupby('year').births.sum() births.head() births.plot(); names.groupby('year').births.count().plot(); def add_frequency(group): group['birth_freq'] = group.births / group.births.sum() return group names = names.groupby(['year', 'gender']).apply(add_frequency) names.head() men = names[names.gender == 'M'] women = names[names.gender == 'W'] births = names.pivot_table('births', index='year', columns='gender', aggfunc=sum) births.head() births.plot(title='Total Births'); names_to_check = ['Allison', 'Alison'] # filter on just the names we're interested in births = names[names.name.isin(names_to_check)] # pivot table to get year vs. gender births = births.pivot_table('births', index='year', columns='gender') # fill all NaNs with zeros births = births.fillna(0) # normalize along columns births = births.div(births.sum(1), axis=0) births.plot(title='Fraction of babies named Allison'); pd.rolling_mean(births, 5).plot(title="Allisons: 5-year moving average");