import pandas as pd

# pandas cheat sheet, written notebook-style: the bare expressions below
# (e.g. nba_df.head()) only display their result in an interactive session.
nba_df = pd.read_csv("NBA-Census-10.14.2013.csv")

# Look at the first few parts of the dataframe
nba_df.head()
# ....or
nba_df[:10]

# Find out how many people are in each category
# If you're dealing with numerical data, use .describe()
nba_df["POS"].value_counts()

# Get all of the people who match a certain characteristic
nba_df[nba_df["POS"] == "F"].head()

# Get all of the people who match several characteristics at once
# (each condition needs its own parentheses when combined with &)
nba_df[(nba_df["POS"] == "F") & (nba_df["HS Only"] == "No")].head()

# Get all of the people who match one of any X characteristics
nba_df[(nba_df["POS"] == "F") | (nba_df["POS"] == "G")].head()

# Retrieve what's nan/null/etc
nba_df[pd.isnull(nba_df["Race"])].head()

# Retrieve what's NOT nan/null/etc
nba_df[~pd.isnull(nba_df["Race"])].head()
# or this
nba_df[pd.notnull(nba_df["Race"])].head()

# Retrieve everyone who is not a guard
nba_df[~(nba_df["POS"] == "G")].head()

# Get numerical data on a column
# If you're dealing with labels or groups, use .value_counts()
nba_df["Age"].describe()

# Get numerical data on grouped data
nba_df.groupby("POS")["Age"].describe()

# Remove columns that you HATE with .drop
# Need to save it as a new (or the same) variable
nba_df = nba_df.drop(["City"], axis=1)
nba_df.columns

# Calculate a new column from an existing column
nba_df["Ht (Cm.)"] = nba_df["Ht (In.)"] * 2.54
nba_df[:2]

# String manipulation on an entire column
# Need to use .str to treat it as a string
nba_df["Name"].str.lower()


# Do more intense manipulation with .apply + an external function
# You will always forget to do axis=1, so remember it!
# Just treat row like a dictionary, it goes one at a time
def do_i_like_them(row):
    """Return True when the row's "Age" value is 31 or more, else False.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``, which calls it
    once per row; ``row`` only needs to support ``row["Age"]``.
    """
    # bool() keeps the result a plain Python bool, as the original
    # if/else did, even when the comparison yields a numpy boolean.
    return bool(row["Age"] >= 31)


nba_df["Liked"] = nba_df.apply(do_i_like_them, axis=1)
nba_df["Liked"].value_counts()

# OPEN QUESTION: HOW DO YOU ADD A ROW TO A DATAFRAME!!!!!!!
# One answer: build a one-row dataframe and concatenate it, e.g.
#   new_row = pd.DataFrame([{"Name": "Somebody", "Age": 25}])
#   nba_df = pd.concat([nba_df, new_row], ignore_index=True)
# Get one ROW of a dataframe by position
# (.ix no longer exists in pandas: use .iloc for positions, .loc for labels)
nba_df.iloc[0]

# For loops with dataframes
# Can't do for row in nba_df, gotta use iterrows()
for index, row in nba_df.iterrows():
    # print() with a single argument behaves the same on Python 2 and 3
    print("{}: {}".format(index, row["Name"]))

# Grouping by as many as you want
# Be sure to put the groupby stuff in square brackets
nba_df.groupby(["POS", "Race"])["Age"].describe()

# Histograms
# Shows you the spread of one numerical value
nba_df["Age"].hist()
# Cathy says there should always be 30 bins
nba_df["Age"].hist(bins=30)
# Cathy is never wrong

# Scatterplots show you the relationship of two numerical values
# If you have a line they're related, otherwise nopers
nba_df.plot(x="Ht (In.)", y="WT", kind="scatter")

# You can also scatterplot like this
# (the original called plt.scatter, but nothing in this file imports
# matplotlib.pyplot as plt, so the pandas accessor is used instead)
nba_df.plot.scatter(x="Ht (In.)", y="WT")

# Bar Chart
pos_size = nba_df.groupby("POS").size()
print(pos_size)
pos_size.plot(kind="bar", title="Position")