#!/usr/bin/env python
# coding: utf-8

# # A Tutorial for Data Science in Python
# 
# Python is an amazing language, and here we present a comprehensive tutorial on using it for Data Science.
# 
# 
# ### Markdown Tip within Jupyter
# I can also write this text within Jupyter by changing Cell type to Markdown in dropdown. That's what I just did.
# In Markdown, changing the font size is easy: prefix the line with #, ## or ### (the more # characters, the smaller the font). For a non-numbered list, prefix each item with a -
# 

# ## Installation
# Installation is done using pip. Here we show how to install the Pandas package from the Jupyter Notebook itself. I use the --upgrade flag to upgrade it, and I install Bokeh the same way. Pandas is the Python library for Data Analysis and Bokeh helps make interactive data analysis available. Note the %pip magic - it runs pip for the environment of the running kernel, so the packages land where this notebook can import them, without sudo and without leaving the comfort of my Jupyter Notebook. (Any other Terminal command can still be run from a cell by prefixing it with a ! sign.)
# 
# 

# In[2]:


# %pip runs pip within the current kernel; `sudo pip` would instead modify the
# system Python, which may not be the interpreter this notebook is using.
get_ipython().run_line_magic('pip', 'install --upgrade pandas')


# In[3]:


# easy_install is deprecated in setuptools, so Bokeh is installed with pip too.
get_ipython().run_line_magic('pip', 'install bokeh')


# ## Loading a Python Package
# You can load a Python Package in any of the following ways
#  -  import PACKAGE
#  -  import PACKAGE as PK
#  -  from PACKAGE import FUN
#  
#  You can then invoke the function using
#  
#  PACKAGE.FUN , PK.FUN and FUN respectively
#  
#  

# In[4]:


from datetime import datetime

# Remember when this run of the notebook began; the bare name on the last
# line is the expression the cell shows as its output.
Starttime = datetime.now()
Starttime


# In[6]:


import pandas as pd


# ## Import Data
# Let's import some datasets. We will use the datasets bundled with the R language, from https://vincentarelbundock.github.io/Rdatasets/datasets.html

# In[9]:


# Read the ggplot2 "diamonds" CSV straight from the Rdatasets site into a DataFrame.
diamonds = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv"
)


# ## Data Inspection

# In[14]:


# A single-line comment starts with #.
# The names of the variables are given by `columns`; in R we would use the
# command names(object).
# Note also that R uses the FUNCTION(OBJECTNAME) syntax while Python uses
# OBJECTNAME.FUNCTION.
diamonds.columns


# In[41]:


# Number of rows in the DataFrame
len(diamonds)


# In[46]:


len(diamonds) * 0.0001


# In[47]:


round(len(diamonds) * 0.0001)


# In[15]:


'''Lets get some information on the object.
In R we would get this by str command (for structure). 
In Python str turns  the object to string
This was a multiple line comment using three single quote marks
'''
diamonds.info()


# In[8]:


# Check the first 10 rows of the dataset
diamonds.head(n=10)


# -  To refer to a particular row in Python I can use its index.
# -  In R I refer to the object in the i-th row and j-th column by OBJECTNAME[i,j]
# -  In R I refer to a column by name with OBJECTNAME$ColumnName
# -  Note that in Python the index starts at 0, while in R it starts at 1.
# 

# In[36]:


# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
# A label slice includes both end points, so this shows rows 20 through 30.
diamonds.loc[20:30]


# In[88]:


# To refer to a particular column I use its name
# I can also chain the commands
diamonds.loc[20:25].cut


# In[34]:


# The same selection with the column picked by its name as a string
diamonds.loc[20:25]["color"]


# ## Random Sample

# In[87]:


import numpy as np


# In[90]:


# Draw roughly 0.01% of the row labels at random.
# int() keeps the sample size an integer even where round() returns a float
# (Python 2), and replace=False stops the same row from being drawn twice.
rows = np.random.choice(
    diamonds.index.values,
    size=int(round(0.0001 * len(diamonds))),
    replace=False,
)
print(rows)


# In[91]:


# `.ix` has been removed from pandas; look the sampled labels up with `.loc`.
diamonds.loc[rows]


# In[92]:


## Missing Values
# Drop every row that has a missing value in any column.
diamonds = diamonds.dropna(axis=0, how='any')


# ## Summaries
# We now do summaries for numerical and categorical data.

# In[18]:


# Summary statistics of the DataFrame
diamonds.describe()


# In[30]:


# Summary statistics of a single column
diamonds.price.describe()


# In[56]:


# Numerical correlations. Only the numeric columns are handed to corr():
# recent pandas raises on text columns instead of silently skipping them.
diamonds.select_dtypes(include="number").corr()


# In[58]:


# Boolean table marking the pairs whose correlation exceeds 0.5
diamonds.select_dtypes(include="number").corr() > 0.5


# In[60]:


diamonds['cut'].unique()  # To get unique values


# In[59]:


diamonds['clarity'].unique()


# In[50]:


# Frequency of each value. The top-level pd.value_counts() is deprecated in
# favour of the Series method used here.
diamonds['cut'].value_counts()


# In[51]:


diamonds['color'].value_counts()


# In[52]:


# Two-way frequency table of cut against color
pd.crosstab(diamonds.cut, diamonds.color)


# In[64]:


# margins expects a boolean; the string 'TRUE' only worked because any
# non-empty string is truthy.
pd.crosstab(diamonds.cut, diamonds.color, margins=True)


# In[80]:


pd.crosstab(diamonds.cut, diamonds.color, margins=True)


# In[61]:


# The top-level pd.groupby() function has been removed from pandas; the
# DataFrame method takes the same grouping key.
cutgroup = diamonds.groupby(diamonds.cut)


# In[25]:


cutgroup


# In[28]:


# Median price within each cut
cutgroup.price.median()


# In[67]:


# reset_index() turns the group labels back into an ordinary column
cutgroup.price.median().reset_index()


# In[77]:


d = cutgroup.price.median().reset_index()
d.transpose()


# In[71]:


# Group on two keys at once: cut and color
diamonds.groupby(by=["cut", "color"])


# In[72]:


# Median price for each cut/color combination, as a flat table
diamonds.groupby(by=["cut", "color"])["price"].median().reset_index()


# In[78]:


e = diamonds.groupby(by=["cut", "color"])["price"].median().reset_index()
# Reshape so that cuts become the rows and colors become the columns
e.pivot(values="price", index="cut", columns="color")


# In[81]:


f = e.pivot(values="price", index="cut", columns="color")


# In[83]:


# Boolean table marking the cells whose median price is above 4000
f > 4000


# ## Data Visualization

# In[123]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# The pandas option display.mpl_style no longer exists and assigning to it
# raises an error, so the ggplot look is requested from matplotlib alone.
plt.style.use('ggplot')


# In[101]:


# %pip runs pip within the current kernel; `sudo pip` would instead modify the
# system Python, which may not be the interpreter this notebook is using.
get_ipython().run_line_magic('pip', 'install --upgrade seaborn')


# In[116]:


# Default pandas plot of the price column
diamonds["price"].plot()


# In[119]:


# Histogram drawn directly with matplotlib
plt.hist(diamonds["price"])


# In[122]:


# The same histogram through the pandas plotting interface, on a new figure
plt.figure()
diamonds["price"].plot(kind="hist", stacked=True, bins=20)


# In[105]:


# Box plot drawn directly with matplotlib
plt.boxplot(diamonds["price"])


# In[125]:


# The same box plot through the pandas plotting interface, on a new figure
plt.figure()
diamonds["price"].plot(kind="box")


# In[131]:


# Hexagonal-bin plot of carat against price
diamonds.plot(x="price", y="carat", kind="hexbin", gridsize=8)


# In[132]:


# NOTE(review): this star import is presumably what supplies ggplot, aes and
# geom_point used in the cells below - confirm before narrowing it to explicit names.
from ggplot import *


# In[133]:


# Map price to x and carat to y, colour by clarity, then add a point layer.
p = ggplot(aes(x='price', y='carat',color="clarity"), data=diamonds)
p + geom_point()


# In[134]:


# Same plot, this time coloured by cut.
p = ggplot(aes(x='price', y='carat',color="cut"), data=diamonds)
p + geom_point()


# ## Modeling
# Let's do some basic Regression Modeling

# In[93]:


import statsmodels.formula.api as sm


# In[94]:


# Fit an ordinary least squares model of price on carat and color.
result = sm.ols(
    data=diamonds,
    formula="price ~ carat + color",
).fit()


# In[97]:


# Full summary of the fitted model
result.summary()


# In[96]:


# Just the fitted parameters
result.params


# In[ ]: