import pandas as pd

# Load the NBA census CSV into a dataframe
nba_df = pd.read_csv("NBA-Census-10.14.2013.csv")

# Look at the first few rows of the dataframe
nba_df.head()

# ....or slice out the first 10 rows
nba_df[:10]

# Find out how many people are in each category
# If you're dealing with numerical data, use .describe()
nba_df["POS"].value_counts()

# Get all of the people who match a certain characteristic
nba_df[nba_df["POS"] == "F"].head()

# Get all of the people who match ALL of several characteristics
# Join the conditions with & and wrap each one in parentheses
nba_df[(nba_df["POS"] == "F") & (nba_df["HS Only"] == "No") ].head()

# Get all of the people who match one of any X characteristics
# Join the conditions with | and wrap each one in parentheses
nba_df[(nba_df["POS"] == "F") | (nba_df["POS"] == "G") ].head()

# Retrieve what's nan/null/etc
nba_df[pd.isnull(nba_df["Race"])].head()

# Retrieve what's NOT nan/null/etc
# The ~ flips every True/False in the mask
nba_df[~pd.isnull(nba_df["Race"])].head()

# or this
nba_df[pd.notnull(nba_df["Race"])].head()

# Retrieve everyone who is not a guard
nba_df[~(nba_df["POS"] == "G")].head()

# Get numerical data on a column
# If you're dealing with labels or groups, use .value_counts()
nba_df["Age"].describe()

# Get numerical data on grouped data
nba_df.groupby("POS")["Age"].describe()

# Remove columns that you HATE with .drop
# .drop returns a new dataframe, so you need to save it
# as a new (or the same) variable
nba_df = nba_df.drop(["City"], axis=1)
nba_df.columns

# Calculate a new column from an existing column
# (2.54 centimeters per inch)
nba_df["Ht (Cm.)"] = nba_df["Ht (In.)"] * 2.54
nba_df[:2]

# String manipulation on an entire column
# Need to use .str to treat it as a string
nba_df["Name"].str.lower()

# Do more intense manipulation with .apply + an external function
# You will always forget to do axis=1, so remember it!
# Just treat row like a dictionary, it goes one at a time
def do_i_like_them(row):
    """Return True when the row's "Age" value is 31 or greater, else False.

    Meant to be handed to ``DataFrame.apply(..., axis=1)``, which calls it
    once per row. ``row`` only needs to support ``row["Age"]``.
    """
    # bool() guarantees a plain Python bool even when the comparison
    # yields a NumPy boolean.
    return bool(row["Age"] >= 31)

# Run the function over every row (axis=1) and store the answers in a new column
nba_df["Liked"] = nba_df.apply(func=do_i_like_them, axis=1)
# Tally how many rows came back True vs. False
nba_df.Liked.value_counts()

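# OPEN QUESTION: HOW DO YOU ADD A ROW TO A DATAFRAME!!!!!!!
# One answer: build a one-row dataframe and glue it on with pd.concat, e.g.
# nba_df = pd.concat([nba_df, pd.DataFrame([{"Name": "Someone"}])], ignore_index=True)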

# Get one row of a dataframe
# .ix was deprecated and later removed from pandas, so use
# .iloc (by position) or .loc (by index label) instead
nba_df.iloc[0]

# For loops with dataframes
# Can't do for row in nba_df (that loops over the column names),
# gotta use iterrows(), which hands back (index, row) pairs
for index, row in nba_df.iterrows():
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(str(index) + ": " + row["Name"])

# Grouping by as many as you want
# Be sure to put the groupby columns in a list (square brackets)
nba_df.groupby(["POS", "Race"])["Age"].describe()

# Histograms
# Shows you the spread of one numerical value
nba_df["Age"].hist()

# Cathy says there should always be 30 bins
nba_df["Age"].hist(bins=30)
# Cathy is never wrong

# Scatterplots show you the relationship of two numerical values
# If the points fall along a line they're related, otherwise nopers
nba_df.plot("Ht (In.)","WT", kind='scatter')

# You can also scatterplot like this
# NOTE(review): plt is not defined anywhere in the visible code -- presumably
# matplotlib.pyplot (e.g. from a notebook's pylab mode); confirm it is in scope
plt.scatter(nba_df["Ht (In.)"], nba_df["WT"])

# Bar Chart
# Count the rows for each position, then draw one bar per position
pos_size = nba_df.groupby("POS").size()
# print(...) with a single argument behaves the same on Python 2 and 3
print(pos_size)
pos_size.plot(kind='bar', title="Position")