#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd #importing packages
import os as os


# In[2]:


# Show the directory the session starts in.
os.getcwd()


# In[3]:


# Switch to the Downloads folder.
os.chdir('/home/ajay/Downloads')


# In[4]:


# Check that the working directory changed.
os.getcwd()


# In[5]:


# List the entries of the current working directory.
a = os.getcwd()
os.listdir(a)


# In[8]:


# Switch to the Desktop folder and check the result.
os.chdir('/home/ajay/Desktop')
os.getcwd()


# In[9]:


# List the entries of the new working directory.
a = os.getcwd()
os.listdir(a)


# In[105]:


# Load the dataset from the current working directory. read_csv uses
# header=0 by default (the first row supplies the column names); pass
# header=None for a file without a header row.
diamonds = pd.read_csv("diamonds.csv")


# In[106]:


# Column names, dtypes and non-null counts.
diamonds.info()


# In[36]:


# First five rows (the default for head).
diamonds.head(n=5)


# In[37]:


# Last ten rows.
diamonds.tail(n=10)


# In[38]:


# Column labels.
diamonds.columns


# In[92]:


# Number of rows, i.e. the total population size used for sampling below.
b = diamonds.shape[0]
print(b)


# In[93]:


import numpy as np


# In[98]:


# Draw a random sample of 0.01% of the rows.
# np.random.choice requires an integer sample count, so the fractional
# product is truncated with int(). It samples with replacement by default,
# so the same row label may be drawn more than once.
sample_size = int(0.0001 * b)
rows = np.random.choice(diamonds.index.values, sample_size)
print(rows)
# .loc selects by index label; it replaces the .ix indexer, which was
# removed from pandas.
sampled_df = diamonds.loc[rows]


# In[99]:


sampled_df


# In[108]:


# Summary statistics as produced by describe() with its default settings.
diamonds.describe()


# In[109]:


# Group the rows by the value of the "cut" column.
cut = diamonds.groupby(by="cut")


# In[110]:


# Per-group count of non-null values in every other column.
cut.count()


# In[114]:


# Per-cut mean of the numeric columns. numeric_only=True is passed
# explicitly because pandas >= 2.0 no longer drops non-numeric columns
# silently and raises a TypeError instead.
# NOTE(review): assumes the frame has non-numeric columns besides "cut"
# (e.g. color, clarity) -- confirm against the CSV.
cut.mean(numeric_only=True)


# In[115]:


# Per-cut median of the numeric columns (same reasoning as for the mean).
cut.median(numeric_only=True)


# In[117]:


# Frequency table of cut against color.
pd.crosstab(diamonds.cut, diamonds.color)


# In[121]:


# Pairwise correlation of the numeric columns. Selecting them up front keeps
# this working on pandas >= 2.0 (where corr() no longer skips non-numeric
# columns) as well as on older releases without a numeric_only argument.
diamonds.select_dtypes(include=["number"]).corr()


# In[164]:


import matplotlib as mt
# Render plots inline in the notebook. The magic's argument must be exactly
# 'inline'; the explanatory note belongs in a Python comment, not in the
# argument string.
# get_ipython only exists inside an IPython/Jupyter session, so the lookup is
# guarded to let the exported script also run under a plain interpreter.
try:
    _shell = get_ipython()
except NameError:
    _shell = None
if _shell is not None:
    _shell.run_line_magic('matplotlib', 'inline')


# In[166]:


from ggplot import *


# In[ ]:


# In[169]:


# Base plot object: price on the x axis, carat on the y axis, no layers yet.
price_vs_carat = aes(x='price', y='carat')
p = ggplot(price_vs_carat, data=diamonds)
p


# In[171]:


# Add a point layer to get a scatter plot.
scatter = geom_point()
p + scatter


# In[172]:


# Scatter plot faceted by cut.
p + geom_point() + facet_grid('cut')


# In[173]:


# Same scatter plot with the points coloured by cut.
by_cut = aes(x='price', y='carat', color="cut")
p = ggplot(by_cut, data=diamonds)
p + geom_point()


# In[174]:


# Same scatter plot with the points coloured by clarity.
by_clarity = aes(x='price', y='carat', color="clarity")
p = ggplot(by_clarity, data=diamonds)
p + geom_point()


# In[ ]: