#!/usr/bin/env python
# coding: utf-8
"""Exploratory analysis of the diamonds dataset (exported from a notebook).

Loads ``diamonds.csv`` from the Desktop directory, prints basic structure
information, draws a small random sample of rows, computes grouped summary
statistics by ``cut``, and draws price-vs-carat scatter plots with ggplot.

Bare expressions such as ``diamonds.head()`` only display output when the
cells are run interactively (IPython / Jupyter).
"""

# In[1]:

import pandas as pd  # importing packages
import os

# In[2]:

os.getcwd()  # current working directory

# In[3]:

os.chdir('/home/ajay/Downloads')

# In[4]:

os.getcwd()

# In[5]:

a = os.getcwd()
os.listdir(a)

# In[8]:

os.chdir('/home/ajay/Desktop')
os.getcwd()

# In[9]:

a = os.getcwd()
os.listdir(a)

# In[105]:

# Note: header=0 (the default) takes the first row as the header;
# pass header=None if the file has no header row.
diamonds = pd.read_csv("diamonds.csv")

# In[106]:

diamonds.info()

# In[36]:

diamonds.head()

# In[37]:

diamonds.tail(10)

# In[38]:

diamonds.columns

# In[92]:

b = len(diamonds)  # this is the total population size
print(b)

# In[93]:

import numpy as np

# In[98]:

# np.random.choice needs an integer sample size, so the 0.01% fraction is
# truncated explicitly (this is 0 when the frame has fewer than 10,000 rows).
# Sampling is with replacement (the np.random.choice default).
sample_size = int(0.0001 * b)
rows = np.random.choice(diamonds.index.values, sample_size)
print(rows)
# `rows` holds index labels, so label-based .loc is the right indexer
# (.ix was removed from pandas).
sampled_df = diamonds.loc[rows]

# In[99]:

sampled_df

# In[108]:

diamonds.describe()

# In[109]:

cut = diamonds.groupby("cut")

# In[110]:

cut.count()

# In[114]:

# Restrict mean/median/corr to numeric columns: the frame also holds string
# columns (cut, color, clarity are used as categories below), and recent
# pandas raises instead of silently dropping them.
numeric_cols = list(diamonds.select_dtypes(include="number").columns)
cut[numeric_cols].mean()

# In[115]:

cut[numeric_cols].median()

# In[117]:

pd.crosstab(diamonds.cut, diamonds.color)

# In[121]:

diamonds[numeric_cols].corr()

# In[164]:

import matplotlib as mt
# This line makes sure plots are rendered inside the notebook. The comment
# must stay outside the magic's argument string. Requires an IPython shell.
get_ipython().run_line_magic('matplotlib', 'inline')

# In[166]:

# Star import kept as in the original notebook; it supplies ggplot, aes,
# geom_point and facet_grid used below.
from ggplot import *

# In[ ]:


# In[169]:

p = ggplot(aes(x='price', y='carat'), data=diamonds)
p

# In[171]:

p + geom_point()

# In[172]:

p + geom_point() + facet_grid('cut')

# In[173]:

p = ggplot(aes(x='price', y='carat', color="cut"), data=diamonds)
p + geom_point()

# In[174]:

p = ggplot(aes(x='price', y='carat', color="clarity"), data=diamonds)
p + geom_point()

# In[ ]: