# NOTE(review): this is a flattened IPython-notebook script. "%cd", "!ls",
# "!cat", "!head" and "%%time" are IPython magics / shell escapes, and the
# bare "print x" statements are Python-2 syntax — it is not a plain
# importable module. Line breaks below reconstruct the original cells;
# code tokens are unchanged.
import pandas as pd
import numpy as np
import sklearn as sk
# report the library versions in use (Python-2 print statements)
print 'pandas version: ',pd.__version__
print 'numpy version:',np.__version__
print 'sklearn version:',sk.__version__
# move into the weather data directory and peek at the raw files
%cd ~/data/weather
!ls -l
!cat data-source.txt
!head -2 SAMPLE_TMAX.csv
# a Typical single line in the data file
# columns are: station id, measurement type, year, then one reading per
# day-of-year 1..365 (range(1,366) is a list here — Python 2)
header=['station','measurement','year']+range(1,366)
# D=pandas.DataFrame(columns=header)
Data = pd.read_csv('SAMPLE_TMAX.csv',header=None,names=header)
# slice out the 365 daily-reading columns (.ix label slicing — deprecated API)
G=Data.ix[:,1:365]
# readings are tenths of a degree C; anything outside [-400, 500] (i.e.
# [-40C, 50C]) is marked missing. "nan" comes from the pylab namespace.
G[G<-400]=nan
G[G>500]=nan
G=G/10
Data.ix[:,1:365]=G
G=G.transpose()
Data.head()
# flatten all day-by-row readings into one long series for a histogram
tmp=G.ix[:,:].unstack()
print shape(tmp), type(tmp)
tmp.hist(bins=50);
title('overall distribution of temperatures')
print tmp.min(),tmp.max()
from datetime import date
# calendar dates for day-of-year 1..365 (ordinals of year 1) — x axis labels
dates=[date.fromordinal(i) for i in range(1,366)]

def YearlyPlots(T,ttl='',size=(10,7)):
    # Plot each column of T as one curve against day-of-year.
    #   T:    array-like whose FIRST dimension must be exactly 365
    #   ttl:  figure title
    #   size: figure size in inches
    # Raises ValueError when shape(T)[0] != 365.
    # Relies on pylab globals (plot, ylabel, grid, title) and module-level `dates`.
    fig=plt.figure(1,figsize=size,dpi=300)
    #fig, ax = plt.subplots(1)
    if shape(T)[0] != 365:
        raise ValueError("First dimension of T should be 365. Shape(T)="+str(shape(T)))
    plot(dates,T);
    # rotate and align the tick labels so they look better
    fig.autofmt_xdate()
    ylabel('temperature')
    grid()
    title(ttl)

YearlyPlots(Data.ix[20:30,1:365].transpose(),ttl='A sample of yearly plots')
# boolean mask of rows whose station id contains the Sydney prefix ASN00066
sydneyStations=['ASN00066' in station for station in Data['station']]
print Data[sydneyStations]['station'].values
#for station in sydneyStations:
#    print station,sum(Data['station']==station)
tmp=Data[sydneyStations].transpose()
YearlyPlots(tmp.ix[1:365,:],ttl='Sydney Stations')
#tmp.ix[:,tmp.columns[7]]
#Data[sydneyStations][['station','year']]
# a simple scale function to normalize the data-frame row-by-row
from numpy import mean, std

def scale_temps(Din):
    # Return a copy of Din restricted to station/year/daily columns, with
    # per-row 'Mean' and 'Std' columns appended (computed over the daily
    # readings, NaN-propagating numpy mean/std). The readings themselves
    # are written back unscaled — see the note below.
    matrix=Din.iloc[:,3:]
    Dout=Din.loc[:,['station','year']+range(1,366)]
    Mean=mean(matrix, axis=1).values
    Dout['Mean']=Mean
    Std= std(matrix, axis=1).values
    Dout['Std']=Std
    # Decided not to normalize each year to have mean zero and std 1
    # tmp = pd.DataFrame((matrix.values - Mean[:,np.newaxis])/Std[:,newaxis],columns=range(1,366))
    # print tmp.head()
    Dout.loc[:,1:365]=matrix.values
    return Dout

Dout=scale_temps(Data)
#reorder the columns
Dout=Dout[['station','year','Mean','Std']+range(1,366)]
Dout.head()
# mean temperature per day-of-year across all rows
Mean=mean(Dout.ix[:,1:365], axis=0)
YearlyPlots(Mean,ttl='The global mean temperature for each day of the year')
#import numpy, scipy.sparse
#from sparsesvd import sparsesvd
#smat = scipy.sparse.csc_matrix(Data.loc[:,1:365]) # convert to sparse CSC format
#ut, s, vt = sparsesvd(smat, 10) # do SVD, asking for 10 factors
#print shape(ut),shape(s),shape(vt)
# count missing readings per row; histogram shows how incomplete rows are
nan_per_row=sum(isnan(Dout.ix[:,1:365]),axis=1)
nan_per_row.hist(bins=100)
sum(nan_per_row>50)
# demonstrating the use of the cell magic %%time, which measures the run-time of the cell.
# M: days (365) x rows, with columns containing any NaN dropped
M=Dout.loc[:,1:365].transpose()
M=M.dropna(axis=1)
(columns,rows)=shape(M)
Mean=mean(M, axis=1).values
print (columns,rows), shape(Mean)
C=np.zeros([columns,columns]) # Sum
N=np.zeros([columns,columns]) # Counter of non-nan entries
%%time
# accumulate a NaN-aware 365x365 covariance: for each row, add the outer
# product of its mean-centered readings only at positions where the value
# is observed, and count contributions per position in N
for i in range(rows):
    if i % 1000==0:
        print i
    row=M.iloc[:,i]-Mean;
    outer=np.outer(row,row)
    valid=isnan(outer)==False
    C[valid]=C[valid]+outer[valid] # update C with the valid location in outer
    N[valid]=N[valid]+1
valid_outer=np.multiply(1-isnan(N),N>0)
cov=np.divide(C,N)
shape(cov)
# eigen-decomposition of the covariance via SVD (cov is symmetric PSD-ish)
U,D,V=np.linalg.svd(cov)
shape(U),shape(D),shape(V)
# cumulative fraction of variance explained by the leading components
plot(cumsum(D[:])/sum(D))
xlim([0,365])
grid()
k=6 # number of components to show.
YearlyPlots((U[:,:k]),ttl='The most significant eigen-vectors')
legend(range(0,k));
# U1,D1,V1=np.linalg.svd(matrix[:100,:])
# running the svd on the whole matrix seems to crash because of NaN entries
k=50
Eig=np.matrix(U[:,:k])
# eigenvectors from SVD should already be unit-norm — this prints a sanity check
print [np.linalg.norm(U[:,i]) for i in range(k)]
# project each row (mean-centered, with NaNs zeroed) onto the top-k eigenvectors
matrix=np.matrix(Dout.ix[:,1:365])-Mean
matrix[isnan(matrix)]=0
print shape(Eig),shape(matrix)
Prod=matrix*Eig;
print shape(Prod)
# insert projection coefficients as columns V0..V49; iterating in reverse
# so that V0 ends up left-most at column position 4
for i in range(k-1,-1,-1):
    Ser=pd.Series(np.array(Prod)[:,i],index=Dout.index)
    Dout.insert(4,'V'+str(i),Ser)
Dout.head()
!ls
!cat ghcnd-readme.txt # uncomment to read the readme file.
# ---- excerpt from ghcnd-readme.txt: fixed-width layout of ghcnd-stations.txt ----
# (this table was raw, un-commented text in the code stream; kept here as a comment)
# ------------------------------
# Variable    Columns   Type
# ------------------------------
# ID           1-11     Character
# LATITUDE    13-20     Real
# LONGITUDE   22-30     Real
# ELEVATION   32-37     Real
# STATE       39-40     Character
# NAME        42-71     Character
# GSNFLAG     73-75     Character
# HCNFLAG     77-79     Character
# WMOID       81-85     Character
# ------------------------------
# Make all lines be of length 90 to solve a problem with read_fwf.
# Use "with" so both files are closed even if a read/write raises
# (the original left the reader unclosed and leaked the writer on error).
with open('ghcnd-stations.txt','r') as src, open('ghcnd-stations_buffered.txt','w') as out:
    for line in src:
        line=line.rstrip()
        out.write(line+' '*(90-len(line))+'\n')
# A grid to define the character locations of the fields:
# AE000041196  25.3330   55.5170   34.0    SHARJAH INTER. AIRP               GSN     41196
# 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
# 0         1         2         3         4         5         6         7         8         9
colspecs = [(0, 11), (11, 21), (21, 31), (31, 38),(39,41),(41,72),(72,76),(76,80),(80,86)]
stations = pd.read_fwf('ghcnd-stations_buffered.txt', colspecs=colspecs, header=None, index_col=0, names=['latitude','longitude','elevation','state','name','GSNFLAG','HCNFLAG','WMOID'])
#stations['elevation'][stations['elevation']==-999.9]=0 # decided not to remove -999.9 because this confuses hist
stations.head()
# join per-row analysis frame with station metadata on the station id
Djoined=Dout.join(stations,on='station')
Djoined.columns[-10:]
Djoined['AbsLatitude']=abs(Djoined['latitude'].values)
Djoined.ix[:5,['station',u'longitude','latitude',u'AbsLatitude','Mean','Std','V0','V1','V2']]
# covariance / correlation between geography and the PCA coefficients
Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].cov()
Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].corr()
# Choosing significance threshold so that none of the correlations between the Vi-s are significant.
# Flag which correlations exceed the 0.2 threshold (chosen above so that
# none of the Vi-to-Vj correlations come out significant).
abs(Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].corr())>0.2
# NOTE(review): pandas.tools was removed in modern pandas; this import only
# works on the old pandas version this notebook was written for.
from pandas.tools.plotting import scatter_matrix
df = Djoined.ix[:,['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']]
scatter_matrix(df, alpha=0.03, figsize=(20, 20), diagonal='kde');
X='latitude'
Djoined.ix[:,X].hist(bins=100);
# Taken from http://pandasplotting.blogspot.com/
import matplotlib.pyplot as plt
from pandas.tools import rplot
#plt.figure()
# scatter + 2D kernel density of station mean temperature vs latitude
X='latitude';Y='Mean'
dfTmp=Djoined[[X,Y]].dropna()
dfTmp=dfTmp.iloc[:,:]
p = rplot.RPlot(dfTmp,x=X,y=Y)
p.add(rplot.GeomPoint())
p.add(rplot.GeomDensity2D())
p.render();
# To see the source of a method use ??
rplot.GeomDensity2D??
X='latitude';Y='Mean'
scatter(Djoined.loc[:,X],Djoined.loc[:,Y],alpha=0.05)
xlabel(X)
ylabel(Y)
#checking for an anomaly in the elevations of stations
Djoined[['station','elevation']][Djoined['elevation']<-500].head()
!grep ASN00010865 ghcnd-stations.txt
# pull station coordinates for map plotting
lons=stations.ix[:,'longitude'].values
lats=stations.ix[:,'latitude'].values
station_names=stations.index.values
ll=len(lons)
lonmin=-180;lonmax=180;latsmin=-80;latsmax=80;
# NOTE(review): the fragment from here through "break" is garbled by text
# extraction — "(lonslatsmin)*(lats=rows*columns:" fuses what was presumably
# a lat/lon bounding-box mask (e.g. (lons<lonmax)*(lats>latsmin)*(lats<latsmax)),
# the map-plotting cell(s), the computation of Recon, and the header of
# "def plot_reconstructions(selection,rows=...,columns=...)". The lines after it
# look like that function's body (plot reconstruction vs raw data per row).
# Do not run as-is; recover the lost cells from the original notebook.
select=(lons>lonmin) * (lonslatsmin)*(lats=rows*columns: break
plot(Recon[:,i])
plot(Djoined.ix[i,1:365]);
title(Djoined.ix[i,'station']+' / '+str(Djoined.ix[i,'year']))
xlim([0,365])
plot_reconstructions(range(50,100),rows=7,columns=3)
# distribution of the first PCA coefficient across all rows
hist(Djoined.ix[:,'V0'],bins=100);
# rows from stations south of latitude -10 (southern hemisphere sample)
selection= [i for i in range(shape(Djoined)[0]) if Djoined.ix[i,'latitude']<-10]
plot_reconstructions(selection,rows=7,columns=3)
shape(selection)