Notebook

In [2]:

using DataFrames
# Monitor ids 1..332 as a concrete vector. `collect` is used instead of `[1:332]`
# because wrapping a range in brackets only expands it on old Julia releases;
# on later ones it builds a one-element vector that holds the range itself.
ID = collect(1:332)
# Element count derived from byte sizes (total bytes / bytes per element).
# The division yields a floating-point value, 332.0.
sizeof(ID)/sizeof(ID[1])

Out[2]:

332.0

In [2]:

# Correlate column 2 with column 3 for each numbered CSV file in `directory`.
#
# For every id in `ids` the file "<directory>/<id zero-padded to 3 digits>.csv"
# is loaded with `readtable`. Rows where column 2 or column 3 is NA are dropped.
# If the number of remaining rows is strictly greater than `threshold`, the
# correlation of the two columns is appended to the result.
#
# Arguments:
#   directory - folder containing the CSV files
#   threshold - a file contributes only when it has MORE than this many
#               complete rows (default 0)
#   ids       - file numbers to process (default 1:332, the range that was
#               previously hard-coded)
#
# Returns a Vector{Float64} holding one correlation per qualifying file, in
# the order of `ids`.
function corr(directory, threshold = 0, ids = 1:332)
    pcorr = Float64[]
    for i in ids
        df = readtable(@sprintf("%s/%03d.csv", directory, i))
        v2 = Float64[]
        v3 = Float64[]
        for j in 1:size(df, 1)
            # `&&` short-circuits: column 3 is only inspected when column 2 is present.
            if !isna(df[j, 2]) && !isna(df[j, 3])
                push!(v2, df[j, 2])
                push!(v3, df[j, 3])
            end
        end
        # v2 and v3 always have the same length, so checking one is enough.
        if length(v2) > threshold
            push!(pcorr, cor(v2, v3))
        end
    end
    return pcorr
end

# Correlations for the "specdata" directory using the default threshold of 0.
pcorr = corr("specdata")
# Time a second call; the call above has already run corr once, so compilation
# of the function is not part of this measurement.
@time corr("specdata") ;

elapsed time: 1.676303646 seconds (511259896 bytes allocated)

In [2]:

    # Same computation as `corr`, but the files are loaded with `readcsv`
    # instead of going through a DataFrame.
    #
    # For every id in `ids` the file "<directory>/<id zero-padded to 3 digits>.csv"
    # is read with its first line treated as a header. Rows whose column 2 or
    # column 3 holds the string "NA" are dropped. If the number of remaining
    # rows is strictly greater than `threshold`, the correlation of the two
    # columns is appended to the result.
    #
    # Arguments:
    #   directory - folder containing the CSV files
    #   threshold - a file contributes only when it has MORE than this many
    #               complete rows (default 0)
    #   ids       - file numbers to process (default 1:332, the range that was
    #               previously hard-coded)
    #
    # Returns a Vector{Float64} holding one correlation per qualifying file,
    # in the order of `ids`.
    function corr2(directory, threshold = 0, ids = 1:332)
        pcorr = Float64[]
        for i in ids
            # readcsv with has_header=true returns (data, header); only the
            # data matrix is needed, so pick it out once.
            data = readcsv(@sprintf("%s/%03d.csv", directory, i), has_header=true)[1]
            v2 = Float64[]
            v3 = Float64[]
            for j in 1:size(data, 1)
                # `&&` short-circuits: column 3 is only inspected when column 2 is present.
                if data[j, 2] != "NA" && data[j, 3] != "NA"
                    push!(v2, float(data[j, 2]))
                    push!(v3, float(data[j, 3]))
                end
            end
            # v2 and v3 always have the same length, so checking one is enough.
            if length(v2) > threshold
                push!(pcorr, cor(v2, v3))
            end
        end
        return pcorr
    end
    
    # Correlations restricted to files with more than 400 complete rows.
    pcorr = corr2("specdata",400)
    # The timed call uses the default threshold (0), not 400.
    @time corr2("specdata") ;

elapsed time: 1.157388875 seconds (374348556 bytes allocated)

In [4]:

    # Same computation as `corr`, but each file is read as raw lines and the
    # comma-separated fields are extracted by hand.
    #
    # For every id in `ids` the file "<directory>/<id zero-padded to 3 digits>.csv"
    # is read with `readdlm`, with its first line treated as a header. Each
    # data line is split on ','. The 2nd and 3rd fields are kept when neither
    # is the string "NA". If the number of kept rows is strictly greater than
    # `threshold`, the correlation of the two fields is appended to the result.
    #
    # Fields are located by position in the comma-separated list, not by a
    # fixed character offset, so the width of the first field and any columns
    # after the 3rd do not affect parsing.
    #
    # Arguments:
    #   directory - folder containing the CSV files
    #   threshold - a file contributes only when it has MORE than this many
    #               complete rows (default 0)
    #   ids       - file numbers to process (default 1:332, the range that was
    #               previously hard-coded)
    #
    # Returns a Vector{Float64} holding one correlation per qualifying file,
    # in the order of `ids`.
    function corr3(directory, threshold = 0, ids = 1:332)
        pcorr = Float64[]
        for i in ids
            # No delimiter is passed to readdlm, so a comma-separated line is
            # expected to arrive as a single string in the first column.
            rows = readdlm(@sprintf("%s/%03d.csv", directory, i), has_header=true)[1]
            v2 = Float64[]
            v3 = Float64[]
            for j in 1:size(rows, 1)
                # Example line: "2004-06-24",NA,NA,1
                fields = split(rows[j, 1], ',')
                # A line without at least three fields cannot supply both values.
                length(fields) >= 3 || continue
                if fields[2] != "NA" && fields[3] != "NA"
                    push!(v2, float(fields[2]))
                    push!(v3, float(fields[3]))
                end
            end
            # v2 and v3 always have the same length, so checking one is enough.
            if length(v2) > threshold
                push!(pcorr, cor(v2, v3))
            end
        end
        return pcorr
    end
    
    # Correlations restricted to files with more than 400 complete rows.
    pcorr = corr3("specdata",400)
    # The timed call uses the default threshold (0), not 400.
    @time corr3("specdata") ;

elapsed time: 1.137150839 seconds (479843724 bytes allocated)

In [17]: