#!/usr/bin/env python
# coding: utf-8

# NOTE: this file is a Jupyter notebook exported as a script. It calls
# get_ipython(), so it only runs inside an IPython/Jupyter session, and it
# downloads the diamonds dataset over the network. Bare expressions (for
# example `diamonds.head(10)`) display output in a notebook cell and do
# nothing when the file is executed as a plain script.

# # A Tutorial for Data Science in Python
#
# Python is an amazing language and here we show a comprehensive tutorial in it for usage in Data Science.
#
#
# ### Markdown Tip within Jupyter
# I can also write this text within Jupyter by changing Cell type to Markdown in dropdown. That's what I just did.
# For markdown, changing the size of the font is easy by prefixing with #, or ##, or ### (the more #, the smaller
# the font), while for a non-numbered list prefix each item with a -
#
# ## Installation
# Installation is done using pip or easy_install (from setuptools). Here we show how to install the Pandas package
# from the Jupyter Notebook itself. I use the --upgrade flag to upgrade it, and I install Bokeh using easy_install.
# Pandas is the Python library for Data Analysis and Bokeh helps make interactive data analysis available.
# Note the ! sign before the sudo command - it helps me use the Terminal without leaving the comfort of my
# Jupyter Notebook
#
#

# In[2]:


# NOTE(review): `sudo pip` installs into the system Python, which may not be
# the interpreter this kernel runs on - confirm this is what you want.
get_ipython().system(' sudo pip install pandas --upgrade')


# In[3]:


# NOTE(review): easy_install is deprecated by setuptools; `pip install bokeh`
# is the likely replacement - confirm before changing the command.
get_ipython().system(' sudo easy_install bokeh')


# ## Loading a Python Package
# You can load a Python Package using the following ways
# - import PACKAGE
# - import PACKAGE as PK
# - from PACKAGE import FUN
#
# You can then invoke the function using
#
# PACKAGE.FUN , PK.FUN and FUN respectively
#
#

# In[4]:


from datetime import datetime

Starttime = datetime.now()
Starttime


# In[6]:


import pandas as pd


# ## Import Data
# Let's import some datasets. We will use Datasets bundled with R language from
# https://vincentarelbundock.github.io/Rdatasets/datasets.html

# In[9]:


diamonds = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv")


# ## Data Inspection

# In[14]:


diamonds.columns  # Single Line Comment starts with #
# name of variables is given by columns. In R we would use the command names(object)
# Note also R uses the FUNCTION(OBJECTNAME) syntax while Python uses OBJECTNAME.FUNCTION


# In[41]:


len(diamonds)  # gives the number of rows


# In[46]:


0.0001 * len(diamonds)


# In[47]:


round(0.0001 * len(diamonds))


# In[15]:


'''Lets get some information on the object. In R we would get this by str command (for structure).
In Python str turns the object to string
This was a multiple line comment using three single quote marks
'''
diamonds.info()


# In[8]:


diamonds.head(10)  # we check the first 10 rows in the dataset


# - to refer to particular row in Python I can use index.
# - In R I refer to the object in i th row and jth column by OBJECTNAME[i,j]
# - In R I refer to the column name by OBJECTNAME$ColumnName
# - Note in Python Index starts with 0 while in R it starts with 1.
#

# In[36]:


# .loc slices by index label and includes both end points, which is what the
# removed .ix accessor did on this default integer index.
diamonds.loc[20:30]


# In[88]:


# To refer to a particular column I use its name
# I can also chain the commands
diamonds.loc[20:25].cut


# In[34]:


diamonds.loc[20:25]["color"]


# ## Random Sample

# In[87]:


import numpy as np


# In[90]:


# The sample size must be an int; round() returns a float on Python 2.
sample_size = int(round(0.0001 * len(diamonds)))
# replace=False so the same row cannot be drawn twice.
rows = np.random.choice(diamonds.index.values, sample_size, replace=False)
print(rows)


# In[91]:


diamonds.loc[rows]


# In[92]:


## Missing Values
diamonds = diamonds.dropna(how='any')


# ## Summaries
# We now do summaries for numerical and categorical data.

# In[18]:


diamonds.describe()


# In[30]:


diamonds.price.describe()


# In[56]:


# Numerical Correlations. The text columns (cut, color, clarity) are dropped
# first because correlation is only defined for numeric columns.
diamonds.select_dtypes(include="number").corr()


# In[58]:


diamonds.select_dtypes(include="number").corr() > 0.5


# In[60]:


diamonds['cut'].unique()  # To get unique values


# In[59]:


diamonds['clarity'].unique()


# In[50]:


diamonds['cut'].value_counts()


# In[51]:


diamonds['color'].value_counts()


# In[52]:


pd.crosstab(diamonds.cut, diamonds.color)


# In[64]:


# margins expects a boolean; True adds the row/column totals.
pd.crosstab(diamonds.cut, diamonds.color, margins=True)


# In[80]:


pd.crosstab(diamonds.cut, diamonds.color, margins=True)


# In[61]:


cutgroup = diamonds.groupby('cut')


# In[25]:


cutgroup


# In[28]:


cutgroup.price.median()


# In[67]:


cutgroup.price.median().reset_index()


# In[77]:


d = cutgroup.price.median().reset_index()
d.transpose()


# In[71]:


diamonds.groupby(['cut', "color"])


# In[72]:


diamonds.groupby(['cut', "color"]).price.median().reset_index()


# In[78]:


e = diamonds.groupby(['cut', "color"]).price.median().reset_index()
e.pivot(index='cut', columns='color', values='price')


# In[81]:


f = e.pivot(index='cut', columns='color', values='price')


# In[83]:


f > 4000


# ## Data Visualization

# In[123]:


import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
# The old pandas option display.mpl_style is no longer set here; the
# matplotlib style sheet below selects the plot style.
plt.style.use('ggplot')


# In[101]:


get_ipython().system('sudo pip install seaborn --upgrade')


# In[116]:


diamonds['price'].plot()


# In[119]:


plt.hist(diamonds.price)


# In[122]:


plt.figure()
diamonds['price'].plot(kind='hist', stacked=True, bins=20)


# In[105]:


plt.boxplot(diamonds.price)


# In[125]:


plt.figure()
diamonds['price'].plot(kind='box')


# In[131]:


diamonds.plot(kind='hexbin', x='price', y='carat', gridsize=8)


# In[132]:


# Import only the names used below instead of a wildcard import.
# NOTE(review): confirm the installed `ggplot` package still imports against
# the installed pandas version.
from ggplot import aes, geom_point, ggplot


# In[133]:


p = ggplot(aes(x='price', y='carat', color="clarity"), data=diamonds)
p + geom_point()


# In[134]:


p = ggplot(aes(x='price', y='carat', color="cut"), data=diamonds)
p + geom_point()


# ## Modeling
# Let's do some basic Regression Modeling

# In[93]:


import statsmodels.formula.api as sm


# In[94]:


result = sm.ols(formula="price ~ carat + color", data=diamonds).fit()


# In[97]:


result.summary()


# In[96]:


result.params


# In[ ]: