In [6]:
using RDatasets
using Gadfly
using DataFrames
In [7]:
iris = dataset("datasets", "iris")
Out[7]:
150x5 DataFrame:
          SepalLength SepalWidth PetalLength PetalWidth     Species
[1,]              5.1        3.5         1.4        0.2    "setosa"
[2,]              4.9        3.0         1.4        0.2    "setosa"
[3,]              4.7        3.2         1.3        0.2    "setosa"
[4,]              4.6        3.1         1.5        0.2    "setosa"
[5,]              5.0        3.6         1.4        0.2    "setosa"
[6,]              5.4        3.9         1.7        0.4    "setosa"
[7,]              4.6        3.4         1.4        0.3    "setosa"
[8,]              5.0        3.4         1.5        0.2    "setosa"
[9,]              4.4        2.9         1.4        0.2    "setosa"
[10,]             4.9        3.1         1.5        0.1    "setosa"
[11,]             5.4        3.7         1.5        0.2    "setosa"
[12,]             4.8        3.4         1.6        0.2    "setosa"
[13,]             4.8        3.0         1.4        0.1    "setosa"
[14,]             4.3        3.0         1.1        0.1    "setosa"
[15,]             5.8        4.0         1.2        0.2    "setosa"
[16,]             5.7        4.4         1.5        0.4    "setosa"
[17,]             5.4        3.9         1.3        0.4    "setosa"
[18,]             5.1        3.5         1.4        0.3    "setosa"
[19,]             5.7        3.8         1.7        0.3    "setosa"
[20,]             5.1        3.8         1.5        0.3    "setosa"
  :
[131,]            7.4        2.8         6.1        1.9 "virginica"
[132,]            7.9        3.8         6.4        2.0 "virginica"
[133,]            6.4        2.8         5.6        2.2 "virginica"
[134,]            6.3        2.8         5.1        1.5 "virginica"
[135,]            6.1        2.6         5.6        1.4 "virginica"
[136,]            7.7        3.0         6.1        2.3 "virginica"
[137,]            6.3        3.4         5.6        2.4 "virginica"
[138,]            6.4        3.1         5.5        1.8 "virginica"
[139,]            6.0        3.0         4.8        1.8 "virginica"
[140,]            6.9        3.1         5.4        2.1 "virginica"
[141,]            6.7        3.1         5.6        2.4 "virginica"
[142,]            6.9        3.1         5.1        2.3 "virginica"
[143,]            5.8        2.7         5.1        1.9 "virginica"
[144,]            6.8        3.2         5.9        2.3 "virginica"
[145,]            6.7        3.3         5.7        2.5 "virginica"
[146,]            6.7        3.0         5.2        2.3 "virginica"
[147,]            6.3        2.5         5.0        1.9 "virginica"
[148,]            6.5        3.0         5.2        2.0 "virginica"
[149,]            6.2        3.4         5.4        2.3 "virginica"
[150,]            5.9        3.0         5.1        1.8 "virginica"

In [8]:
pwd()
Out[8]:
"C:\\Users\\dell"
In [9]:
cd("C:\\Users\\dell\\Desktop")
In [10]:
pwd()
Out[10]:
"C:\\Users\\dell\\Desktop"
In [11]:
readdir()
Out[11]:
5-element Array{String,1}:
 "10154161_10154003554090471_1325653799668649187_n.jpg"
 "adult.data.txt"                                      
 "desktop.ini"                                         
 "IJulia.ipynb"                                        
 "julia - Shortcut.lnk"                                
In [12]:
df=readtable("adult.data.txt",header=false)
Out[12]:
DataFrame with 32561 rows, 15 columns
Columns:

x1       32561 non-null values    
x2       32561 non-null values    
x3       32561 non-null values    
x4       32561 non-null values    
x5       32561 non-null values    
x6       32561 non-null values    
x7       32561 non-null values    
x8       32561 non-null values    
x9       32561 non-null values    
x10      32561 non-null values    
x11      32561 non-null values    
x12      32561 non-null values    
x13      32561 non-null values    
x14      32561 non-null values    
x15      32561 non-null values    


In [13]:
size(df)
Out[13]:
(32561,15)
In [14]:
head(df)
Out[14]:
DataFrame with 6 rows, 15 columns
Columns:

x1       6 non-null values    
x2       6 non-null values    
x3       6 non-null values    
x4       6 non-null values    
x5       6 non-null values    
x6       6 non-null values    
x7       6 non-null values    
x8       6 non-null values    
x9       6 non-null values    
x10      6 non-null values    
x11      6 non-null values    
x12      6 non-null values    
x13      6 non-null values    
x14      6 non-null values    
x15      6 non-null values    


In [15]:
plot(df,x="x1" ,color="x15",Geom.histogram)
Out[15]:
In [16]:
describe(df)
x1
Min      17.0
1st Qu.  28.0
Median   37.0
Mean     38.58164675532078
3rd Qu.  48.0
Max      90.0
NAs      0
NA%      0.0%

x2
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  9

x3
Min      12285.0
1st Qu.  117827.0
Median   178356.0
Mean     189778.36651208502
3rd Qu.  237051.0
Max      1.484705e6
NAs      0
NA%      0.0%

x4
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  16

x5
Min      1.0
1st Qu.  9.0
Median   10.0
Mean     10.0806793403151
3rd Qu.  12.0
Max      16.0
NAs      0
NA%      0.0%

x6
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  7

x7
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  15

x8
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  6

x9
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  5

x10
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  2

x11
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     1077.6488437087312
3rd Qu.  0.0
Max      99999.0
NAs      0
NA%      0.0%

x12
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     87.303829734959
3rd Qu.  0.0
Max      4356.0
NAs      0
NA%      0.0%

x13
Min      1.0
1st Qu.  40.0
Median   40.0
Mean     40.437455852092995
3rd Qu.  45.0
Max      99.0
NAs      0
NA%      0.0%

x14
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  42

x15
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  2


In [17]:
plot(iris,x="SepalLength",y="SepalWidth",color="Species")
Out[17]:
In [18]:
describe(df[:x10]) # Index with a column name (df[:x10]) to refer to a single column of a DataFrame
Length  32561
Type    UTF8String
NAs     0
NA%     0.0%
Unique  2

In [19]:
describe(df[:x1])
Min      17.0
1st Qu.  28.0
Median   37.0
Mean     38.58164675532078
3rd Qu.  48.0
Max      90.0
NAs      0
NA%      0.0%

In [20]:
test1=(23,56,78,89)
Out[20]:
(23,56,78,89)
In [21]:
mean(test1)
Out[21]:
61.5
In [22]:
test3=(NA,34,67,89)
Out[22]:
(NA,34,67,89)
In [23]:
mean(removeNA(test3))


no method removeNA((NAtype,Int64,Int64,Int64),)
at In[23]:1


In [24]:
using DataArrays
test2=@data([23,56,78,89,NA]) # Only DataArrays can hold missing values (NA); @data builds one
Out[24]:
5-element DataArray{Int64,1}:
 23  
 56  
 78  
 89  
   NA
In [25]:
mean(test2)
Out[25]:
NA
In [26]:
mean(removeNA(test2))
Out[26]:
61.5
In [27]:
# Note: removeNA has been renamed to dropna as of Julia 0.3.
In [28]:
?removeNA # Prefix a name with ? to display its help
removeNA (generic function with 3 methods)

In [29]:
plot(df,y="x1",Geom.boxplot)


maximum: argument is empty
 in maximum at abstractarray.jl:1595
In [30]:
using PyPlot
In [31]:
boxplot(df[:x1])
Out[31]:
{"fliers"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F0FD0>,PyObject <matplotlib.lines.Line2D object at 0x00000000254F2550>},"whiskers"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254EB198>,PyObject <matplotlib.lines.Line2D object at 0x00000000254EB3C8>},"boxes"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F04E0>},"medians"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254F0A58>},"caps"=>{PyObject <matplotlib.lines.Line2D object at 0x00000000254EB9B0>,PyObject <matplotlib.lines.Line2D object at 0x00000000254EBF28>}}
In [5]:
pwd()
Out[5]:
"C:\\Users\\dell"
In []: