cd C:\Users\tk\Desktop %matplotlib inline import numpy as np import matplotlib.pyplot as plt import pandas as pd import brewer2mpl from matplotlib import rcParams #colorbrewer2 Dark2 qualitative color table dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7) dark2_colors = dark2_cmap.mpl_colors rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'white' rcParams['patch.facecolor'] = dark2_colors[0] rcParams['font.family'] = 'StixGeneral' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() pd.set_option('display.width', 500) pd.set_option('display.max_columns', 100) olive_oil = pd.read_csv('olive.csv') olive_oil.head(5) olive_oil.shape olive_oil.rename(columns = {olive_oil.columns[0]:'area_Idili'}, inplace = True) olive_oil.head(5) pd.DataFrame(olive_oil.columns) unique_in_region = olive_oil.region.unique() # We will find how many unique entries are there in region column. 
# Collect the distinct values that occur in the area column.
unique_in_area = olive_oil.area.unique()

# Single-argument print(...) calls behave identically on Python 2 and 3,
# unlike the Python 2-only print statement.
print(unique_in_region)
print(unique_in_area)
pd.crosstab(olive_oil.area, olive_oil.region)
olive_oil.head(5)

# Keep only the text after the last '.' in each area_Idili entry.
olive_oil['area_Idili'] = olive_oil['area_Idili'].map(
    lambda label: label.split('.')[-1])
olive_oil.head()

# How the split function works
x = '1.northapulia'
y = x.split('.')
print(y)
z = x.split('.')[-1]  # -1 returns the last element of the list
z

# You can access a subset of the columns of a data frame
# (http://bit.ly/1sPHf1u).
olive_oil[['palmitic', 'palmitoleic']].head(5)
olive_oil['palmitic']

# The label and the type are concatenated without a separator: the Python 2
# print statement wrote no space after an item ending in '\t'.
print(" the type of olive_oil[['palmitic']]: \t"
      + str(type(olive_oil[['palmitic']])))
print(" the type of olive_oil['palmitic']: \t"
      + str(type(olive_oil['palmitic'])))
olive_oil.palmitic  # this is a convenient way to access a specific column

list_of_acids = ['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic',
                 'linolenic', 'arachidic', 'eicosenoic']

# Scale every acid column by 1/100 (vectorized, no per-column lambda needed).
df = olive_oil[list_of_acids] / 100.0
df.head(5)
olive_oil[list_of_acids] = df  # replace the acid values in olive_oil
olive_oil.head(5)

plt.hist(olive_oil.palmitic)

# 2x2 grid: line plot, dot plot, scatter plot and histogram.
fig, axes = plt.subplots(figsize=(10, 10), nrows=2, ncols=2)
axes[0][0].plot(olive_oil.palmitic, olive_oil.linolenic)
axes[0][1].plot(olive_oil.palmitic, olive_oil.linolenic, '.')
axes[1][0].scatter(olive_oil.palmitic, olive_oil.linolenic)
axes[1][1].hist(olive_oil.palmitic)
fig.tight_layout()

# Per-region summary statistics.
# NOTE(review): olive_oil still holds the string column area_Idili here;
# recent pandas versions may refuse to reduce it instead of dropping it
# silently -- confirm against the pandas version in use.
region_groupby = olive_oil.groupby('region')
grp_reg = region_groupby.describe()
grp_reg.head(20)
olstd = olive_oil.groupby('region').std()
olstd
# The built-in group-by mean replaces aggregate(np.mean).
olmean = region_groupby.mean()
olmean.head()

# Suffix the acid columns so std and mean columns can live side by side.
renamedict_std = {acid: acid + "_std" for acid in list_of_acids}
renamedict_mean = {acid: acid + "_mean" for acid in list_of_acids}
olstd.rename(columns=renamedict_std, inplace=True)
olmean.rename(columns=renamedict_mean, inplace=True)
olstd.head()

# Join the palmitic mean and std per region into one frame.
olpalmiticmean = olmean[['palmitic_mean']]
olpalmiticstd = olstd[['palmitic_std']]
newolbyregion = olpalmiticmean.join(olpalmiticstd)
newolbyregion

# Boolean mask of rows whose eicosenoic value is below 0.05.
eico = (olive_oil.eicosenoic < 0.05)
eico

# Small frame with missing values to demonstrate dropna().
new_data = pd.DataFrame({
    'Bigdata': [12, 34, 99, 45, 13],
    'Examiner': [0.9, 0.8, 0.7, 0.6, None],
    'Data science': ['L', 'M', None, 'c', 'a'],
})
new_data
new_data.dropna()

# Fill missing values with the column mean.
data = pd.DataFrame([1., None, 3.5, None, 7])
data
mean = data.mean()
data.fillna(mean)