using RDatasets
using Gadfly
using DataFrames
# Load the "iris" table from the RDatasets "datasets" collection
# (echoed below as a 150x5 DataFrame).
iris = dataset("datasets", "iris")
150x5 DataFrame: SepalLength SepalWidth PetalLength PetalWidth Species [1,] 5.1 3.5 1.4 0.2 "setosa" [2,] 4.9 3.0 1.4 0.2 "setosa" [3,] 4.7 3.2 1.3 0.2 "setosa" [4,] 4.6 3.1 1.5 0.2 "setosa" [5,] 5.0 3.6 1.4 0.2 "setosa" [6,] 5.4 3.9 1.7 0.4 "setosa" [7,] 4.6 3.4 1.4 0.3 "setosa" [8,] 5.0 3.4 1.5 0.2 "setosa" [9,] 4.4 2.9 1.4 0.2 "setosa" [10,] 4.9 3.1 1.5 0.1 "setosa" [11,] 5.4 3.7 1.5 0.2 "setosa" [12,] 4.8 3.4 1.6 0.2 "setosa" [13,] 4.8 3.0 1.4 0.1 "setosa" [14,] 4.3 3.0 1.1 0.1 "setosa" [15,] 5.8 4.0 1.2 0.2 "setosa" [16,] 5.7 4.4 1.5 0.4 "setosa" [17,] 5.4 3.9 1.3 0.4 "setosa" [18,] 5.1 3.5 1.4 0.3 "setosa" [19,] 5.7 3.8 1.7 0.3 "setosa" [20,] 5.1 3.8 1.5 0.3 "setosa" : [131,] 7.4 2.8 6.1 1.9 "virginica" [132,] 7.9 3.8 6.4 2.0 "virginica" [133,] 6.4 2.8 5.6 2.2 "virginica" [134,] 6.3 2.8 5.1 1.5 "virginica" [135,] 6.1 2.6 5.6 1.4 "virginica" [136,] 7.7 3.0 6.1 2.3 "virginica" [137,] 6.3 3.4 5.6 2.4 "virginica" [138,] 6.4 3.1 5.5 1.8 "virginica" [139,] 6.0 3.0 4.8 1.8 "virginica" [140,] 6.9 3.1 5.4 2.1 "virginica" [141,] 6.7 3.1 5.6 2.4 "virginica" [142,] 6.9 3.1 5.1 2.3 "virginica" [143,] 5.8 2.7 5.1 1.9 "virginica" [144,] 6.8 3.2 5.9 2.3 "virginica" [145,] 6.7 3.3 5.7 2.5 "virginica" [146,] 6.7 3.0 5.2 2.3 "virginica" [147,] 6.3 2.5 5.0 1.9 "virginica" [148,] 6.5 3.0 5.2 2.0 "virginica" [149,] 6.2 3.4 5.4 2.3 "virginica" [150,] 5.9 3.0 5.1 1.8 "virginica"
# Check the current working directory.
pwd()
"C:\\Users\\dell"
# Switch to the Desktop folder so files there can be opened by relative name.
cd("C:\\Users\\dell\\Desktop")
# Confirm that the directory change took effect.
pwd()
"C:\\Users\\dell\\Desktop"
# List the contents of the current directory.
readdir()
5-element Array{String,1}: "10154161_10154003554090471_1325653799668649187_n.jpg" "adult.data.txt" "desktop.ini" "IJulia.ipynb" "julia - Shortcut.lnk"
# Read the data file without a header row; per the echo below the columns
# receive the generated names x1 ... x15.
df=readtable("adult.data.txt",header=false)
DataFrame with 32561 rows, 15 columns Columns: x1 32561 non-null values x2 32561 non-null values x3 32561 non-null values x4 32561 non-null values x5 32561 non-null values x6 32561 non-null values x7 32561 non-null values x8 32561 non-null values x9 32561 non-null values x10 32561 non-null values x11 32561 non-null values x12 32561 non-null values x13 32561 non-null values x14 32561 non-null values x15 32561 non-null values
# Dimensions of the table as (rows, columns).
size(df)
(32561,15)
# Preview the first rows of the table (6 rows per the echo below).
head(df)
DataFrame with 6 rows, 15 columns Columns: x1 6 non-null values x2 6 non-null values x3 6 non-null values x4 6 non-null values x5 6 non-null values x6 6 non-null values x7 6 non-null values x8 6 non-null values x9 6 non-null values x10 6 non-null values x11 6 non-null values x12 6 non-null values x13 6 non-null values x14 6 non-null values x15 6 non-null values
# Histogram of column x1, coloured by the values of column x15.
plot(df,x="x1" ,color="x15",Geom.histogram)
# Per-column summary of the whole table (quantiles for numeric columns,
# length/type/unique counts for string columns, as echoed below).
describe(df)
x1 Min 17.0 1st Qu. 28.0 Median 37.0 Mean 38.58164675532078 3rd Qu. 48.0 Max 90.0 NAs 0 NA% 0.0% x2 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 9 x3 Min 12285.0 1st Qu. 117827.0 Median 178356.0 Mean 189778.36651208502 3rd Qu. 237051.0 Max 1.484705e6 NAs 0 NA% 0.0% x4 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 16 x5 Min 1.0 1st Qu. 9.0 Median 10.0 Mean 10.0806793403151 3rd Qu. 12.0 Max 16.0 NAs 0 NA% 0.0% x6 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 7 x7 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 15 x8 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 6 x9 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 5 x10 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 2 x11 Min 0.0 1st Qu. 0.0 Median 0.0 Mean 1077.6488437087312 3rd Qu. 0.0 Max 99999.0 NAs 0 NA% 0.0% x12 Min 0.0 1st Qu. 0.0 Median 0.0 Mean 87.303829734959 3rd Qu. 0.0 Max 4356.0 NAs 0 NA% 0.0% x13 Min 1.0 1st Qu. 40.0 Median 40.0 Mean 40.437455852092995 3rd Qu. 45.0 Max 99.0 NAs 0 NA% 0.0% x14 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 42 x15 Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 2
# Plot SepalLength against SepalWidth, coloured by Species; no Geom is
# given, so Gadfly's default geometry is used.
plot(iris,x="SepalLength",y="SepalWidth",color="Species")
describe(df[:x10]) # index with a column symbol to refer to one part of a DataFrame
Length 32561 Type UTF8String NAs 0 NA% 0.0% Unique 2
# Same summary for the numeric column x1.
describe(df[:x1])
Min 17.0 1st Qu. 28.0 Median 37.0 Mean 38.58164675532078 3rd Qu. 48.0 Max 90.0 NAs 0 NA% 0.0%
# A plain tuple of integers; mean works on it directly.
test1=(23,56,78,89)
(23,56,78,89)
mean(test1)
61.5
# A plain tuple that contains a missing value (NA).
test3=(NA,34,67,89)
(NA,34,67,89)
# This call fails: removeNA has no method for a tuple (see the error echoed below).
mean(removeNA(test3))
no method removeNA((NAtype,Int64,Int64,Int64),) at In[23]:1
using DataArrays
# Build a DataArray holding four integers and one NA.
test2=@data([23,56,78,89,NA]) # only DataArrays can hold missing values (NA)
5-element DataArray{Int64,1}: 23 56 78 89 NA
# The mean of data that contains an NA is itself NA.
mean(test2)
NA
# Remove the NA entries first to obtain a numeric mean.
mean(removeNA(test2))
61.5
# Note: removeNA was replaced by dropna in Julia 0.3.
?removeNA # look up the help entry for removeNA
removeNA (generic function with 3 methods)
# Attempt a Gadfly box plot of column x1; this call fails with the
# "maximum: argument is empty" error echoed below.
# NOTE(review): possibly because no x aesthetic is supplied — confirm against the Gadfly docs.
plot(df,y="x1",Geom.boxplot)
maximum: argument is empty in maximum at abstractarray.jl:1595
using PyPlot
# Draw the box plot of column x1 with PyPlot instead; the echo below is the
# dictionary of matplotlib line objects that boxplot returns.
boxplot(df[:x1])
{"fliers"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F0FD0>,PyObject <matplotlib.lines.Line2D object at 0x00000000254F2550>},"whiskers"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254EB198>,PyObject <matplotlib.lines.Line2D object at 0x00000000254EB3C8>},"boxes"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F04E0>},"medians"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F0A58>},"caps"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254EB9B0>,PyObject <matplotlib.lines.Line2D object at 0x00000000254EBF28>}}
# Check the working directory again.
# NOTE(review): the echo below shows the home directory even though cd(...) to the
# Desktop appears earlier in this transcript — presumably the cells were run out of
# order or in a fresh session; confirm.
pwd()
"C:\\Users\\dell"