| Iris Setosa | Iris Versicolor | Iris Virginica |
|---|---|---|
| ![]() | ![]() | ![]() |
import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
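# Fetch the Iris CSV directly from the UCI Machine Learning Repository and load it into a pandas DataFrame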
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
raw_csv = urllib2.urlopen(path)
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
all_names = feature_names + ('class',)
df = pd.read_csv(raw_csv, names=all_names)
df
df.describe()
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
df_group = df.groupby('class')['class']
print df_group.count()
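# Split the data into one DataFrame per species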
Iris_Se_Sub_Df = df[df['class'] == iris_names[0]]
Iris_Ve_Sub_Df = df[df['class'] == iris_names[1]]
Iris_Vi_Sub_Df = df[df['class'] == iris_names[2]]
print
print Iris_Se_Sub_Df
unit_str = ' (cm)'
# Feature pairs to plot against each other, with the y-axis limits for each panel
pairs = [(0, 1, 1.5, 5.0), (0, 2, 0.0, 9.0), (0, 3, -0.5, 3.5),
         (1, 2, 0.0, 9.0), (1, 3, 0.0, 3.5), (2, 3, 0.0, 3.5)]
options = {}
for i, (x, y, ymin, ymax) in enumerate(pairs):
    options[i] = {
        'data_x': feature_names[x],
        'data_y': feature_names[y],
        'label_x': feature_names[x] + unit_str,
        'label_y': feature_names[y] + unit_str,
        'ylim_min': ymin,
        'ylim_max': ymax
    }
ax = []
fig = plt.figure(figsize=(17, 12))
for i in range(6):
    ax.append(fig.add_subplot(2, 3, i + 1))
# One scatter panel per feature pair, coloured by species
for i in range(6):
    se = ax[i].scatter(Iris_Se_Sub_Df[options[i]['data_x']], Iris_Se_Sub_Df[options[i]['data_y']], color='red')
    ve = ax[i].scatter(Iris_Ve_Sub_Df[options[i]['data_x']], Iris_Ve_Sub_Df[options[i]['data_y']], color='blue')
    vi = ax[i].scatter(Iris_Vi_Sub_Df[options[i]['data_x']], Iris_Vi_Sub_Df[options[i]['data_y']], color='green')
    ax[i].set_xlabel(options[i]['label_x'])
    ax[i].set_ylabel(options[i]['label_y'])
    ax[i].set_ylim([options[i]['ylim_min'], options[i]['ylim_max']])
    ax[i].legend((se, ve, vi), iris_names)
df2 = df.iloc[:, 0:4]  # feature columns only (drop the class label)
df2[0:5]
from pandas.tools.plotting import scatter_matrix
_ = scatter_matrix(df2, figsize=(9,9), diagonal='kde')
# Per-feature mean and standard deviation for each species; the dict is named
# feature_stats so it does not shadow scipy.stats imported above
feature_stats = {}
for i in range(4):
    feature_stats[i] = {}
    feature_stats[i]['mean'] = (Iris_Se_Sub_Df[feature_names[i]].mean(),
                                Iris_Ve_Sub_Df[feature_names[i]].mean(),
                                Iris_Vi_Sub_Df[feature_names[i]].mean())
    feature_stats[i]['std'] = (Iris_Se_Sub_Df[feature_names[i]].std(),
                               Iris_Ve_Sub_Df[feature_names[i]].std(),
                               Iris_Vi_Sub_Df[feature_names[i]].std())
ind = Series([0.5, 1.5, 2.5])
width = 0.5
fig = plt.figure(figsize=(20, 5))
ay = []
for i in range(4):
    ay.append(fig.add_subplot(1, 4, i + 1))
# Bar chart of the per-species mean of each feature, with the standard deviation as error bars
for i in range(4):
    ay[i].bar(ind, feature_stats[i]['mean'], width, color='magenta', yerr=feature_stats[i]['std'])
    ay[i].set_xlim([0, 3.5])
    ay[i].set_ylabel('Mean of ' + feature_names[i])
    ay[i].set_xticks(ind + width / 2)
    ay[i].set_xticklabels(iris_names)
_ = df2.boxplot()
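# The same Iris data set also ships with scikit-learn as a Bunch object (data, target, and metadata)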
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
print type(iris)
iris.keys()
iris.target_names
iris.feature_names
print iris.DESCR
iris.data[0:5]
iris.target[0:5]
iris.data[50:55]
iris.target[50:55]
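The `tree` module imported above is not exercised in the cells shown here. As a minimal sketch of what it is typically used for (an illustration, not necessarily how this notebook proceeds), a `DecisionTreeClassifier` can be fit on `iris.data` and `iris.target`:

from sklearn import tree
# Illustrative sketch only: fit a decision tree on all 150 samples and
# compare its predictions with the true labels for one sample of each species
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
print clf.predict(iris.data[[0, 50, 100]])
print iris.target[[0, 50, 100]]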
import findspark
findspark.init()
from pyspark import SparkContext, SparkFiles, SQLContext
if 'sc' not in locals():
    sc = SparkContext()
import urllib
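# Download the same CSV to a local file so Spark can read it with sc.textFile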
_ = urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.data")
data_file = "./iris.data"
raw_data = sc.textFile(data_file)
# Default level of parallelism (number of partitions Spark uses by default)
print sc.defaultParallelism
print raw_data.count()
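# iris.data ends with a blank line, so re-read it and filter out empty records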
raw_data = sc.textFile(data_file).filter(lambda x: x != '')
print raw_data.count()
print raw_data.take(5)
def parse_raw_data(line):
    # Keep the four numeric features and drop the class label
    line_split = line.split(",")[0:4]
    return np.array([float(x) for x in line_split])
vector_data = raw_data.map(parse_raw_data)
print vector_data.take(5)
from pyspark.mllib.stat import Statistics
from math import sqrt
# Compute column summary statistics.
summary = Statistics.colStats(vector_data)
print "Statistics:"
for i in range(4):
    print " Mean - {}: {}".format(feature_names[i], round(summary.mean()[i], 3))
    print " St. Dev - {}: {}".format(feature_names[i], round(sqrt(summary.variance()[i]), 3))
    print " Max value - {}: {}".format(feature_names[i], round(summary.max()[i], 3))
    print " Min value - {}: {}".format(feature_names[i], round(summary.min()[i], 3))
    print " Number of non-zero values - {}: {}".format(feature_names[i], summary.numNonzeros()[i])
    print
from pyspark.mllib.stat import Statistics
correlation_matrix = Statistics.corr(vector_data, method="spearman")
print type(correlation_matrix)
print pd.DataFrame(correlation_matrix)
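For readability, the same correlation matrix can also be printed with the feature names as row and column labels (a presentation tweak that reuses the `feature_names` tuple defined earlier):

print pd.DataFrame(correlation_matrix, index=feature_names, columns=feature_names)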