%pylab inline from pandas import * # Subplotting example a = randn(50).cumsum() b = randn(50).cumsum() c = randn(50).cumsum() df=DataFrame({'a':a,'b':b,'c':c}) df.plot(subplots=True) df.plot() # Adding options #Generate a random graph a = randn(50).cumsum() df = DataFrame(a) # convert it to a dataframe df.plot(title='Random Plot',kind='line') # Pandas data frames have a built in plot function with defaults: df = DataFrame(np.random.randn(10, 4).cumsum(0), columns=['A', 'B', 'C', 'D'], index=np.arange(0, 100, 10)) df df.plot(title='Plot automatically finds Indexes and Places Labels') titanic = read_csv("data/titanic.csv") titanic SurvivedByClass=titanic.groupby('pclass').survived.sum() SurvivedByClass SurvivedByClass.plot(kind='bar',title='Titanic Survivors by Class') # Combine multiple groups and plot titanic.groupby(['sex','pclass']).survived.sum().plot(kind='barh',title='Titanic Survivors by Class and Gender') # Plot CrossTabs death_counts = crosstab([titanic.pclass, titanic.sex], titanic.survived.astype(bool)) death_counts.plot(kind='bar', stacked=True, title='Passenger Outcomes by Gender and Class', color=['white','blue'], grid=False) # Histogram of a particular column titanic.fare.hist(grid=False,bins=25,range=(0,100)) # Density plot -- Estimate the 'true' distribution titanic.fare.dropna().plot(kind='kde', xlim=(0,100)) titanic.fare.hist(bins=25, normed=True, color='lightseagreen') titanic.fare.plot(kind='kde', xlim=(0,600), style='r--') titanic.boxplot(column='fare',by='pclass',grid=False) # Combine pandas plots by running them together -- if pandas thinks they can be overlapped, it will titanic.fare.hist(bins=25, normed=True, color='lightseagreen') titanic.fare.plot(kind='kde', xlim=(0,100), style='r--') titanic.boxplot(column='fare',by='pclass',grid=False) import matplotlib.pyplot as plt x = np.arange(50) y = np.random.randn(50) plt.scatter(x,y) %pylab inline #Single Plot a = randn(50).cumsum() plt.plot(a, color='red') fig = figure() # Two rows, one column, first plot ax1 = fig.add_subplot(2,1,1) ax1.plot(a, color='red') #Two rows, one column, second plot ax2 = fig.add_subplot(2,1,2) ax2.scatter(np.arange(50), randn(50)) # Exercise: Try 1 row, two columns # Exercise: Try 1 row, one column fig, ax = plt.subplots(2,3) ax[0,1].plot(randn(50), color='green', linestyle='-') ax[1,2].scatter(np.arange(50), randn(50), color='red') ax[1,0].hist(randn(1000)) plt.show() #Similar to print() # Exercise: Swap the top row and bottom row plots # Excecise: Change the line style to dotted and add circles as markers # Exercise: change the bins to 100 for the histogram fig = plt.figure(); ax = fig.add_subplot(1, 1, 1) ax.plot(randn(1000).cumsum(), 'k', label='one') ax.plot(randn(1000).cumsum(), 'k--', label='two') ax.plot(randn(1000).cumsum(), 'k.', label='three') ax.set_title('Three random lines') ax.legend(loc='best') plt.show() #Exercise: Change the color and line style of each line using the minimum amount of characters # Save the file plt.savefig('ThreeRandomLines.pdf') %pylab inline import pandas as pd ### ScatterPlots from Data Frames # Combine pandas with matplotlib for scatterplots baseball = pd.read_csv("data/baseball.csv") #Offensive Statistics from MLB -- 2008. Glossary: http://www.baseballprospectus.com/glossary/index.php?context=all&category=true baseball.head() fig = plt.figure() ax = fig.add_subplot(1,1,1) # one row, one column, first plot ax.set_title("Hits vs. At Bats") ax.set_xlabel("At Bats") ax.set_ylabel("Hits") ax.scatter(baseball.ab, baseball.h) fig = plt.figure() ax = fig.add_subplot(1,1,1) # one row, one column, first plot ax.set_title("At Bats vs. Hits. Size = Home Runs") ax.set_xlabel("At Bats") ax.set_ylabel("Hits") plt.scatter(baseball.ab, baseball.h, s=baseball.hr*10, alpha=0.5) xlim(0, 700); ylim(0, 200) baseball xkcd() #plt.scatter(baseball.ab, baseball.h, c=baseball.hr, s=40, cmap='hot') fig = plt.figure() ax = fig.add_subplot(1,1,1) # one row, one column, first plot ax.set_title("At Bats vs. Hits, Size and Color = Home Runs") ax.set_xlabel("At Bats") ax.set_ylabel("Hits") plt.scatter(baseball.ab, baseball.h, c=baseball.hr,cmap='hot') xlim(0, 700); ylim(0, 200) #With Titanic Data import seaborn as sns sns.set(style="darkgrid") df = sns.load_dataset("titanic") pal = dict(male="#6495ED", female="#F08080") g = sns.lmplot("age", "survived", col="sex", hue="sex", data=df, palette=pal, y_jitter=.02, logistic=True) g.set(xlim=(0, 80), ylim=(-.05, 1.05)) # Seaborn with Anscombe Data import seaborn as sns sns.set(style="ticks") df = sns.load_dataset("anscombe") sns.lmplot("x", "y", col="dataset", hue="dataset", data=df, col_wrap=2, ci=None, palette="muted", size=4, scatter_kws={"s": 50, "alpha": 1})