In this notebook we analyze a sample of the data downloaded from http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/. The main file is ghcnd_all.tar.gz, which is about 2.4 GB compressed and around 20 GB uncompressed.
The data contains about 1 million station-year recordings. That is too much to analyze on a single-core machine, so we start by taking a sample of 20,000 recordings of the maximal daily temperatures for a period of 365 days starting on January 1st (the last day of leap years is discarded).
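The sampling itself was done upstream of this notebook; a minimal sketch of the idea (the flattened input file ALL_TMAX.csv and the 2% rate are assumptions for illustration, not the actual preprocessing script):

import random
# hypothetical: keep ~20,000 of ~1M station-year rows, one row per line
sample_lines=[]
for line in open('ALL_TMAX.csv'):   # assumed flattened input, one station-year per line
    if random.random() < 0.02:      # ~2% of ~1M rows is ~20,000
        sample_lines.append(line)
open('SAMPLE_TMAX.csv','w').writelines(sample_lines)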
import pandas as pd
import numpy as np
import sklearn as sk
print 'pandas version: ',pd.__version__
print 'numpy version:',np.__version__
print 'sklearn version:',sk.__version__
pandas version: 0.13.1 numpy version: 1.8.0 sklearn version: 0.14.1
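A note on names: many cells below use bare pylab names (nan, shape, plot, title, hist, scatter, ...) without importing them. This works when the notebook is run in pylab mode; a minimal setup (my assumption about how the notebook was launched) is:

%pylab inline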
Switch to the data directory and check its contents
%cd ~/data/weather
!ls -l
/home/ubuntu/UCSD_BigData/data/weather
total 40280
-rw-r--r-- 1 ubuntu ubuntu      218 Mar 14 03:31 data-source.txt
-rw-r--r-- 1 ubuntu ubuntu    22422 Mar 14 03:31 ghcnd-readme.txt
-rw-rw-r-- 1 ubuntu ubuntu  7760844 Apr  8 18:52 ghcnd-stations_buffered.txt
-rw-r--r-- 1 ubuntu ubuntu  7334424 Mar 14 03:31 ghcnd-stations.txt
-rw-r--r-- 1 ubuntu ubuntu      270 Mar 14 03:31 ghcnd-version.txt
-rw-r--r-- 1 ubuntu ubuntu 26114979 Apr  7 21:24 SAMPLE_TMAX.csv
!cat data-source.txt
I got the data from this ftp site: ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ Opening the ftp site in a browser is helpful: you can easily browse the directory structure and look at txt and pdf files. Yoav Freund
!head -2 SAMPLE_TMAX.csv
# A typical single line in the data file: station ID, element, year, then 365 daily values in tenths of a degree C (empty fields = missing)
USC00507570,TMAX,2005,67,44,61,17,-17,-22,-39,-44,-44,-78,-156,-156,-33,28,39,22,-89,-139,-156,44,61,50,-39,17,67,61,-61,-61,11,33,-72,-100,-150,-178,-150,-33,39,28,22,11,-150,-156,-150,50,44,28,39,50,56,39,67,50,39,56,50,56,44,22,6,17,17,61,83,67,67,89,78,61,61,83,56,44,67,89,67,89,100,83,56,17,22,50,39,28,-6,28,-6,-28,-50,-44,-33,-22,-22,-22,17,67,67,78,94,89,72,56,89,111,94,83,56,11,28,56,89,133,133,128,178,167,194,211,189,178,161,172,144,133,128,117,183,200,211,211,178,172,133,150,128,133,150,150,178,211,200,133,172,144,161,156,139,150,133,144,161,150,161,200,183,228,222,183,144,150,178,167,183,239,239,267,244,222,222,117,189,233,194,206,261,228,211,222,261,228,228,200,194,183,233,244,217,267,250,217,161,200,200,194,250,261,222,194,183,161,206,228,228,222,206,200,167,183,194,172,200,189,167,183,194,200,206,217,206,244,267,256,278,294,278,256,228,228,189,206,211,211,161,156,144,156,161,156,167,139,122,144,139,156,150,144,128,144,128,150,150,133,144,172,156,106,161,161,122,122,133,117,133,117,133,150,117,89,122,106,117,100,106,89,67,122,111,106,89,106,89,44,50,61,28,33,44,89,61,72,61,89,100,94,22,0,17,6,6,11,0,-22,-28,-44,-50,-33,-39,-89,-67,-22,-56,-100,-100,-56,-83,-56,-117,33,33,56,56,56,50,-17,-33,-100,-144,-178,-167,-161,-172,-67,-44,-28,-61,-106,-156,-133,33,67,78,67,72,22,6,-78,11,61,78,78,11,67,89,44,-28,-39,-33,-17,39,61,50,61,61,44,61 NOE00135018,TMAX,1959,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,72,123,95,121,94,50,69,95,55,52,12,65,52,110,129,131,170,166,69,84,93,82,110,111,92,95,95,148,102,135,142,120,100,100,106,120,142,134,172,180,201,190,242,223,245,225,142,125,205,166,165,155,167,213,201,190,156,170,186,172,165,151,189,196,191,187,205,189,160,160,166,214,231,230,228,218,149,189,180,174,175,196,215,252,245,257,264,286,261,208,226,222,205,202,205,236,215,220,225,220,243,234,221,181,206,196,220,212,240,250,266,248,255,280,260,260,273,265,252,230,220,235,250,240,239,235,220,265,252,223,234,242,255,253,215,246,228,250,234,236,261,230,230,218,244,232,260,232,202,182,166,161,196,215,202,205,220,206,200,218,215,212,225,250,231,199,170,206,190,159,210,185,177,162,150,164,171,142,90,111,106,149,128,117,125,126,116,132,114,109,83,115,81,89,125,105,112,149,149,112,65,100,119,140,90,79,136,120,91,100,114,103,71,81,81,50,105,96,55,50,24,28,84,90,72,70,63,72,86,72,48,-5,-25,9,28,69,90,101,90,82,61,40,4,80,60,59,56,35,28,-1,-30,-46,-46,-40,-40,-35,-35,-50,18,26,44,58,65,65,80,58,31,45,42,32,16,20,22,39,61
header=['station','measurement','year']+range(1,366)
# D=pandas.DataFrame(columns=header)
Data = pd.read_csv('SAMPLE_TMAX.csv',header=None,names=header)
G=Data.ix[:,1:365]   # the 365 daily-value columns
G[G<-400]=nan        # values are in tenths of deg C; below -40C is treated as missing
G[G>500]=nan         # above 50C is treated as missing
G=G/10               # convert tenths of deg C to deg C
Data.ix[:,1:365]=G
G=G.transpose()
Data.head()
IOError: File SAMPLE_TMAX.csv does not exist
(pandas.read_csv traceback truncated; this stale output apparently comes from a run in which the %cd into the data directory had not been executed, so the file was not found.)
tmp=G.ix[:,:].unstack()
print shape(tmp), type(tmp)
tmp.hist(bins=50);
title('overall distribution of temperatures')
print tmp.min(),tmp.max()
(7300000,) <class 'pandas.core.series.Series'> -40.0 50.0
from datetime import date
dates=[date.fromordinal(i) for i in range(1,366)]
def YearlyPlots(T,ttl='',size=(10,7)):
    fig=plt.figure(1,figsize=size,dpi=300)
    #fig, ax = plt.subplots(1)
    if shape(T)[0] != 365:
        raise ValueError("First dimension of T should be 365. Shape(T)="+str(shape(T)))
    plot(dates,T);
    # rotate and align the tick labels so they look better
    fig.autofmt_xdate()
    ylabel('temperature')
    grid()
    title(ttl)
YearlyPlots(Data.ix[20:30,1:365].transpose(),ttl='A sample of yearly plots')
sydneyStations=['ASN00066' in station for station in Data['station']]  # Sydney-area station IDs all contain ASN00066
print Data[sydneyStations]['station'].values
#for station in sydneyStations:
# print station,sum(Data['station']==station)
tmp=Data[sydneyStations].transpose()
YearlyPlots(tmp.ix[1:365,:],ttl='Sydney Stations')
#tmp.ix[:,tmp.columns[7]]
#Data[sydneyStations][['station','year']]
['ASN00066124' 'ASN00066059' 'ASN00066194' 'ASN00066131' 'ASN00066195' 'ASN00066062']
We also compute the mean and the standard deviation of each row. In this case we are not dividing by the standard deviation: each year keeps its original scale, and the normalization is left commented out in the code below.
# a simple scale function to compute row-by-row statistics of the data-frame
# (it computes each row's mean and std but does not normalize the rows)
from numpy import mean, std
def scale_temps(Din):
    matrix=Din.iloc[:,3:]
    Dout=Din.loc[:,['station','year']+range(1,366)]
    Mean=mean(matrix, axis=1).values
    Dout['Mean']=Mean
    Std= std(matrix, axis=1).values
    Dout['Std']=Std
    # Decided not to normalize each year to have mean zero and std 1
    # tmp = pd.DataFrame((matrix.values - Mean[:,np.newaxis])/Std[:,np.newaxis],columns=range(1,366))
    # print tmp.head()
    Dout.loc[:,1:365]=matrix.values
    return Dout
Dout=scale_temps(Data)
#reorder the columns
Dout=Dout[['station','year','Mean','Std']+range(1,366)]
Dout.head()
  | station | year | Mean | Std | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | ...
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | USC00507570 | 2005 | 8.531507 | 10.439819 | 6.7 | 4.4 | 6.1 | 1.7 | -1.7 | -2.2 | -3.9 | -4.4 | -4.4 | -7.8 | -15.6 | -15.6 | -3.3 | 2.8 | 3.9 | 2.2 | ... |
1 | NOE00135018 | 1959 | 14.494545 | 7.974400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... |
2 | KZ000036546 | 1982 | 7.220513 | 14.897310 | NaN | NaN | -13.9 | NaN | NaN | -4.5 | NaN | NaN | -2.8 | NaN | -5.5 | NaN | -6.3 | NaN | NaN | NaN | ... |
3 | USC00054664 | 1964 | 18.576860 | 10.790115 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... |
4 | CUW00011706 | 1981 | 31.321370 | 1.588618 | 30.0 | 28.3 | 30.0 | 30.0 | 28.3 | 28.9 | 28.9 | 31.1 | 30.0 | 29.4 | 30.6 | 25.6 | 23.3 | 28.3 | 28.9 | 30.0 | ... |
5 rows × 369 columns
Mean=mean(Dout.ix[:,1:365], axis=0)
YearlyPlots(Mean,ttl='The global mean temperature for each day of the year')
Aside: one could use a sparse SVD solver directly, e.g. https://pypi.python.org/pypi/sparsesvd/ (left commented out below).
#import numpy, scipy.sparse
#from sparsesvd import sparsesvd
#smat = scipy.sparse.csc_matrix(Data.loc[:,1:365]) # convert to sparse CSC format
#ut, s, vt = sparsesvd(smat, 10) # do SVD, asking for 10 factors
#print shape(ut),shape(s),shape(vt)
We find the distribution of missing values and decide how to deal with them. From the analysis below we see that most rows have some missing values. We therefore choose to compute the averages in a way that tolerates missing entries, rather than discard rows with many missing values.
nan_per_row=sum(isnan(Dout.ix[:,1:365]),axis=1)
nan_per_row.hist(bins=100)
sum(nan_per_row>50)
3783
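As an aside (my addition, not in the original flow): pandas statistics skip NaN entries by default, so a per-day average over all 20,000 rows that tolerates missing values is a one-liner:

# per-day mean over all rows; pandas ignores NaN entries by default
day_means = Dout.loc[:,1:365].mean(axis=0)
print day_means.head()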
We compute the empirical covariance matrix in a way that tolerates NaN values.
In the code below I remove all rows that have a NaN in them. If you remove the M.dropna(axis=1) line then all rows are used. Can you get better results without removing the rows?
# demonstrating the use of the cell magic %%time, which measures the run-time of the cell.
M=Dout.loc[:,1:365].transpose()   # M is 365 days x station-years
M=M.dropna(axis=1)                # drop the station-years (columns of M) that contain any NaN
(columns,rows)=shape(M)           # columns=365 days, rows=number of surviving station-years
Mean=mean(M, axis=1).values
print (columns,rows), shape(Mean)
C=np.zeros([columns,columns]) # Sum
N=np.zeros([columns,columns]) # Counter of non-nan entries
(365, 8866) (365,)
%%time
for i in range(rows):
    if i % 1000==0:
        print i
    row=M.iloc[:,i]-Mean;
    outer=np.outer(row,row)
    valid=isnan(outer)==False
    C[valid]=C[valid]+outer[valid] # update C at the valid locations in outer
    N[valid]=N[valid]+1
valid_outer=np.multiply(1-isnan(N),N>0)
cov=np.divide(C,N)
0 1000 2000 3000 4000 5000 6000 7000 8000 CPU times: user 1min, sys: 68 ms, total: 1min Wall time: 1min 1s
shape(cov)
(365, 365)
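As a cross-check (my addition): pandas can compute a pairwise NaN-tolerant covariance directly, using for each (i,j) entry the rows where both day i and day j are observed. Over all 20,000 rows it should roughly, but not exactly, agree with the loop above, which used only the 8,866 complete columns:

# pairwise-complete covariance over the day columns
cov_pd = Dout.loc[:,1:365].cov()
print cov_pd.shape   # expect (365, 365)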
U,D,V=np.linalg.svd(cov)
shape(U),shape(D),shape(V)
((365, 365), (365,), (365, 365))
plot(cumsum(D[:])/sum(D))
xlim([0,365])
grid()
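To read numbers off this curve (a quick check I added; the k values are arbitrary choices):

# cumulative fraction of the variance explained by the top k eigenvalues
for k in (6,50):
    print k, sum(D[:k])/sum(D)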
k=6 # number of components to show.
YearlyPlots((U[:,:k]),ttl='The most significant eigen-vectors')
legend(range(0,k));
# U1,D1,V1=np.linalg.svd(matrix[:100,:]) # running the svd on the whole matrix seems to crash because of NaN entries
k=50
Eig=np.matrix(U[:,:k])
print [np.linalg.norm(U[:,i]) for i in range(k)]
matrix=np.matrix(Dout.ix[:,1:365])-Mean
matrix[isnan(matrix)]=0
print shape(Eig),shape(matrix)
Prod=matrix*Eig;
print shape(Prod)
[0.99999999999999978, 0.99999999999999989, 1.0000000000000007, 1.0000000000000002, 1.0, 1.0, 1.0000000000000004, 1.0000000000000004, 0.99999999999999989, 0.99999999999999989, 1.0000000000000004, 1.0000000000000002, 0.99999999999999989, 1.0000000000000002, 1.0000000000000007, 1.0000000000000004, 1.0000000000000002, 0.99999999999999978, 1.0000000000000002, 1.0000000000000007, 1.0000000000000004, 1.0, 1.0, 1.0000000000000002, 0.99999999999999967, 0.99999999999999989, 1.0000000000000004, 1.0, 0.99999999999999978, 0.99999999999999967, 1.0, 0.99999999999999967, 1.0000000000000002, 1.0, 0.99999999999999989, 0.99999999999999967, 1.0000000000000002, 1.0000000000000002, 0.99999999999999956, 1.0000000000000002, 1.0, 1.0000000000000004, 1.0, 1.0000000000000002, 1.0000000000000004, 1.0, 1.0, 1.0, 1.0000000000000009, 1.0] (365, 50) (20000, 365) (20000, 50)
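Prod now holds, for each station-year, its coefficients on the top-k eigenvectors. As a sanity check (my addition, anticipating plot_reconstructions below), projecting back gives the rank-k reconstruction of the data:

# the same expression plot_reconstructions uses below; columns are station-years
Recon_check = Eig*Prod.transpose()+Mean[:,np.newaxis]
print shape(Recon_check)   # expect (365, 20000)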
Insert coefficients for k top eigenvectors into the dataframe Dout
for i in range(k-1,-1,-1):
    Ser=pd.Series(np.array(Prod)[:,i],index=Dout.index)
    Dout.insert(4,'V'+str(i),Ser)
Dout.head()
  | station | year | Mean | Std | V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | ...
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | USC00507570 | 2005 | 8.531507 | 10.439819 | 142.481893 | -52.954486 | 33.906654 | -1.390382 | -26.534812 | 32.464267 | 12.699405 | -30.462052 | 23.303143 | 22.385008 | 6.416568 | 20.211781 | 3.516132 | 4.649300 | 9.126348 | -4.326726 | ... |
1 | NOE00135018 | 1959 | 14.494545 | 7.974400 | 57.593976 | -51.171107 | 21.458607 | 18.718011 | 0.623031 | 7.390733 | -1.055171 | -13.843784 | 3.658847 | -5.918600 | 10.977109 | -1.364357 | 15.043134 | 6.147500 | -0.573275 | 2.896675 | ... |
2 | KZ000036546 | 1982 | 7.220513 | 14.897310 | 56.464932 | 27.415070 | 7.181212 | -4.198383 | 2.694539 | -8.357568 | 19.226334 | 9.057198 | -9.617446 | -5.577071 | -5.925625 | 5.029528 | -2.550279 | -7.811351 | 6.240000 | 3.789583 | ... |
3 | USC00054664 | 1964 | 18.576860 | 10.790115 | 23.514882 | 11.358559 | 23.872717 | 20.950578 | 6.612579 | -3.938206 | -2.763765 | -2.273217 | 17.054517 | -19.952890 | -9.211754 | 3.558375 | 4.806072 | -4.934253 | 5.880796 | -6.761959 | ... |
4 | CUW00011706 | 1981 | 31.321370 | 1.588618 | -314.692267 | -25.150375 | -9.395788 | -5.844722 | -4.984370 | 5.335082 | 4.429545 | 1.476513 | 0.428265 | -1.390674 | -0.117768 | -0.340112 | 5.185880 | 2.805358 | -2.527921 | -1.160484 | ... |
5 rows × 419 columns
Loading the station longitude/latitude and merging it into the Table
!ls
data-source.txt ghcnd-stations_buffered.txt ghcnd-version.txt ghcnd-readme.txt ghcnd-stations.txt SAMPLE_TMAX.csv
!cat ghcnd-readme.txt # long output: the full GHCN-Daily readme
README FILE FOR DAILY GLOBAL HISTORICAL CLIMATOLOGY NETWORK (GHCN-DAILY) Version 3.00 -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- I. DOWNLOAD QUICK START Start by downloading "ghcnd-stations.txt," which has metadata for all stations. Then download one of the following TAR files: - "ghcnd-all.tar.gz" if you want all of GHCN-Daily, OR - "ghcnd-gsn.tar.gz" if you only want the GCOS Surface Network (GSN), OR - "ghcnd-hcn.tar.gz" if you only want the U.S. Historical Climatology Network (U.S. HCN). Then uncompress and untar the contents of the tar file, e.g., by using the following Linux command: tar xzvf ghcnd_xxx.tar.gz Where "xxx" stands for "all", "hcn", or "gsn" as applicable. The files will be extracted into a subdirectory under the directory where the command is issued. ALTERNATIVELY, if you only need data for one station: - Find the station's name in "ghcnd-stations.txt" and note its station identification code (e.g., PHOENIX AP (Airport) is "USW00023183"); and - Download the data file (i.e., ".dly" file) that corresponds to this code (e.g., "USW00023183.dly" has the data for PHOENIX AP). Note that the ".dly" file is located in the "all" subdirectory. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- II. CONTENTS OF ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily all: Directory with ".dly" files for all of GHCN-Daily gsn: Directory with ".dly" files for the GCOS Surface Network (GSN) hcn: Directory with ".dly" files for U.S. HCN by_year: Directory with GHCN Daily files parsed into yearly subsets with observation times where available. See the /by_year/readme.txt and /by_year/ghcn-daily-by_year-format.rtf files for further information grid: Directory with the GHCN-Daily gridded dataset known as HadGHCND papers: Directory with pdf versions of journal articles relevant to the GHCN-Daily dataset figures: Directory containing figures that summarize the inventory of GHCN-Daily station records ghcnd-all.tar.gz: TAR file of the GZIP-compressed files in the "all" directory ghcnd-gsn.tar.gz: TAR file of the GZIP-compressed "gsn" directory ghcnd-hcn.tar.gz: TAR file of the GZIP-compressed "hcn" directory ghcnd-countries.txt: List of country codes (FIPS) and names ghcnd-inventory.txt: File listing the periods of record for each station and element ghcnd-stations.txt: List of stations and their metadata (e.g., coordinates) ghcnd-states.txt: List of U.S. state and Canadian Province codes used in ghcnd-stations.txt ghcnd-version.txt: File that specifies the current version of GHCN Daily readme.txt: This file status.txt: Notes on the current status of GHCN-Daily -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- III. FORMAT OF DATA FILES (".dly" FILES) Each ".dly" file contains data for one station. The name of the file corresponds to a station's identification code. For example, "USC00026481.dly" contains the data for the station with the identification code USC00026481). Each record in a file contains one month of daily data. 
The variables on each line include the following: ------------------------------ Variable Columns Type ------------------------------ ID 1-11 Character YEAR 12-15 Integer MONTH 16-17 Integer ELEMENT 18-21 Character VALUE1 22-26 Integer MFLAG1 27-27 Character QFLAG1 28-28 Character SFLAG1 29-29 Character VALUE2 30-34 Integer MFLAG2 35-35 Character QFLAG2 36-36 Character SFLAG2 37-37 Character . . . . . . . . . VALUE31 262-266 Integer MFLAG31 267-267 Character QFLAG31 268-268 Character SFLAG31 269-269 Character ------------------------------ These variables have the following definitions: ID is the station identification code. Please see "ghcnd-stations.txt" for a complete list of stations and their metadata. YEAR is the year of the record. MONTH is the month of the record. ELEMENT is the element type. There are five core elements as well as a number of addition elements. The five core elements are: PRCP = Precipitation (tenths of mm) SNOW = Snowfall (mm) SNWD = Snow depth (mm) TMAX = Maximum temperature (tenths of degrees C) TMIN = Minimum temperature (tenths of degrees C) The other elements are: ACMC = Average cloudiness midnight to midnight from 30-second ceilometer data (percent) ACMH = Average cloudiness midnight to midnight from manual observations (percent) ACSC = Average cloudiness sunrise to sunset from 30-second ceilometer data (percent) ACSH = Average cloudiness sunrise to sunset from manual observations (percent) AWND = Average daily wind speed (tenths of meters per second) DAEV = Number of days included in the multiday evaporation total (MDEV) DAPR = Number of days included in the multiday precipiation total (MDPR) DASF = Number of days included in the multiday snowfall total (MDSF) DATN = Number of days included in the multiday minimum temperature (MDTN) DATX = Number of days included in the multiday maximum temperature (MDTX) DAWM = Number of days included in the multiday wind movement (MDWM) DWPR = Number of days with non-zero precipitation included in multiday precipitation total (MDPR) EVAP = Evaporation of water from evaporation pan (tenths of mm) FMTM = Time of fastest mile or fastest 1-minute wind (hours and minutes, i.e., HHMM) FRGB = Base of frozen ground layer (cm) FRGT = Top of frozen ground layer (cm) FRTH = Thickness of frozen ground layer (cm) GAHT = Difference between river and gauge height (cm) MDEV = Multiday evaporation total (tenths of mm; use with DAEV) MDPR = Multiday precipitation total (tenths of mm; use with DAPR and DWPR, if available) MDSF = Multiday snowfall total MDTN = Multiday minimum temperature (tenths of degrees C; use with DATN) MDTX = Multiday maximum temperature (tenths of degress C; use with DATX) MDWM = Multiday wind movement (km) MNPN = Daily minimum temperature of water in an evaporation pan (tenths of degrees C) MXPN = Daily maximum temperature of water in an evaporation pan (tenths of degrees C) PGTM = Peak gust time (hours and minutes, i.e., HHMM) PSUN = Daily percent of possible sunshine (percent) SN*# = Minimum soil temperature (tenths of degrees C) where * corresponds to a code for ground cover and # corresponds to a code for soil depth. 
Ground cover codes include the following: 0 = unknown 1 = grass 2 = fallow 3 = bare ground 4 = brome grass 5 = sod 6 = straw multch 7 = grass muck 8 = bare muck Depth codes include the following: 1 = 5 cm 2 = 10 cm 3 = 20 cm 4 = 50 cm 5 = 100 cm 6 = 150 cm 7 = 180 cm SX*# = Maximum soil temperature (tenths of degrees C) where * corresponds to a code for ground cover and # corresponds to a code for soil depth. See SN*# for ground cover and depth codes. THIC = Thickness of ice on water (tenths of mm) TOBS = Temperature at the time of observation (tenths of degrees C) TSUN = Daily total sunshine (minutes) WDF1 = Direction of fastest 1-minute wind (degrees) WDF2 = Direction of fastest 2-minute wind (degrees) WDF5 = Direction of fastest 5-second wind (degrees) WDFG = Direction of peak wind gust (degrees) WDFI = Direction of highest instantaneous wind (degrees) WDFM = Fastest mile wind direction (degrees) WDMV = 24-hour wind movement (km) WESD = Water equivalent of snow on the ground (tenths of mm) WESF = Water equivalent of snowfall (tenths of mm) WSF1 = Fastest 1-minute wind speed (tenths of meters per second) WSF2 = Fastest 2-minute wind speed (tenths of meters per second) WSF5 = Fastest 5-second wind speed (tenths of meters per second) WSFG = Peak guest wind speed (tenths of meters per second) WSFI = Highest instantaneous wind speed (tenths of meters per second) WSFM = Fastest mile wind speed (tenths of meters per second) WT** = Weather Type where ** has one of the following values: 01 = Fog, ice fog, or freezing fog (may include heavy fog) 02 = Heavy fog or heaving freezing fog (not always distinquished from fog) 03 = Thunder 04 = Ice pellets, sleet, snow pellets, or small hail 05 = Hail (may include small hail) 06 = Glaze or rime 07 = Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction 08 = Smoke or haze 09 = Blowing or drifting snow 10 = Tornado, waterspout, or funnel cloud 11 = High or damaging winds 12 = Blowing spray 13 = Mist 14 = Drizzle 15 = Freezing drizzle 16 = Rain (may include freezing rain, drizzle, and freezing drizzle) 17 = Freezing rain 18 = Snow, snow pellets, snow grains, or ice crystals 19 = Unknown source of precipitation 21 = Ground fog 22 = Ice fog or freezing fog WV** = Weather in the Vicinity where ** has one of the following values: 01 = Fog, ice fog, or freezing fog (may include heavy fog) 03 = Thunder 07 = Ash, dust, sand, or other blowing obstruction 18 = Snow or ice crystals 20 = Rain or snow shower VALUE1 is the value on the first day of the month (missing = -9999). MFLAG1 is the measurement flag for the first day of the month. There are ten possible values: Blank = no measurement information applicable B = precipitation total formed from two 12-hour totals D = precipitation total formed from four six-hour totals H = represents highest or lowest hourly temperature K = converted from knots L = temperature appears to be lagged with respect to reported hour of observation O = converted from oktas P = identified as "missing presumed zero" in DSI 3200 and 3206 T = trace of precipitation, snowfall, or snow depth W = converted from 16-point WBAN code (for wind direction) QFLAG1 is the quality flag for the first day of the month. 
There are fourteen possible values: Blank = did not fail any quality assurance check D = failed duplicate check G = failed gap check I = failed internal consistency check K = failed streak/frequent-value check L = failed check on length of multiday period M = failed megaconsistency check N = failed naught check O = failed climatological outlier check R = failed lagged range check S = failed spatial consistency check T = failed temporal consistency check W = temperature too warm for snow X = failed bounds check Z = flagged as a result of an official Datzilla investigation SFLAG1 is the source flag for the first day of the month. There are twenty seven possible values (including blank, upper and lower case letters): Blank = No source (i.e., data value missing) 0 = U.S. Cooperative Summary of the Day (NCDC DSI-3200) 6 = CDMP Cooperative Summary of the Day (NCDC DSI-3206) 7 = U.S. Cooperative Summary of the Day -- Transmitted via WxCoder3 (NCDC DSI-3207) A = U.S. Automated Surface Observing System (ASOS) real-time data (since January 1, 2006) a = Australian data from the Australian Bureau of Meteorology B = U.S. ASOS data for October 2000-December 2005 (NCDC DSI-3211) b = Belarus update E = European Climate Assessment and Dataset (Klein Tank et al., 2002) F = U.S. Fort data G = Official Global Climate Observing System (GCOS) or other government-supplied data H = High Plains Regional Climate Center real-time data I = International collection (non U.S. data received through personal contacts) K = U.S. Cooperative Summary of the Day data digitized from paper observer forms (from 2011 to present) M = Monthly METAR Extract (additional ASOS data) N = Community Collaborative Rain, Hail,and Snow (CoCoRaHS) Q = Data from several African countries that had been "quarantined", that is, withheld from public release until permission was granted from the respective meteorological services R = NCDC Reference Network Database (Climate Reference Network and Historical Climatology Network-Modernized) r = All-Russian Research Institute of Hydrometeorological Information-World Data Center S = Global Summary of the Day (NCDC DSI-9618) NOTE: "S" values are derived from hourly synoptic reports exchanged on the Global Telecommunications System (GTS). Daily values derived in this fashion may differ significantly from "true" daily data, particularly for precipitation (i.e., use with caution). T = SNOwpack TELemtry (SNOTEL) data obtained from the Western Regional Climate Center U = Remote Automatic Weather Station (RAWS) data obtained from the Western Regional Climate Center u = Ukraine update W = WBAN/ASOS Summary of the Day from NCDC's Integrated Surface Data (ISD). X = U.S. First-Order Summary of the Day (NCDC DSI-3210) Z = Datzilla official additions or replacements z = Uzbekistan update When data are available for the same time from more than one source, the highest priority source is chosen according to the following priority order (from highest to lowest): Z,R,0,6,X,W,K,7,F,B,M,r,E,z,u,b,a,G,Q,I,A,N,H,S VALUE2 is the value on the second day of the month MFLAG2 is the measurement flag for the second day of the month. QFLAG2 is the quality flag for the second day of the month. SFLAG2 is the source flag for the second day of the month. ... and so on through the 31st day of the month. Note: If the month has less than 31 days, then the remaining variables are set to missing (e.g., for April, VALUE31 = -9999, MFLAG31 = blank, QFLAG31 = blank, SFLAG31 = blank). 
-------------------------------------------------------------------------------- -------------------------------------------------------------------------------- IV. FORMAT OF "ghcnd-stations.txt" ------------------------------ Variable Columns Type ------------------------------ ID 1-11 Character LATITUDE 13-20 Real LONGITUDE 22-30 Real ELEVATION 32-37 Real STATE 39-40 Character NAME 42-71 Character GSNFLAG 73-75 Character HCNFLAG 77-79 Character WMOID 81-85 Character ------------------------------ These variables have the following definitions: ID is the station identification code. Note that the first two characters denote the FIPS country code, the third character is a network code that identifies the station numbering system used, and the remaining eight characters contain the actual station ID. See "ghcnd-countries.txt" for a complete list of country codes. See "ghcnd-states.txt" for a list of state/province/territory codes. The network code has the following five values: 0 = unspecified (station identified by up to eight alphanumeric characters) 1 = Community Collaborative Rain, Hail,and Snow (CoCoRaHS) based identification number. To ensure consistency with with GHCN Daily, all numbers in the original CoCoRaHS IDs have been left-filled to make them all four digits long. In addition, the characters "-" and "_" have been removed to ensure that the IDs do not exceed 11 characters when preceded by "US1". For example, the CoCoRaHS ID "AZ-MR-156" becomes "US1AZMR0156" in GHCN-Daily C = U.S. Cooperative Network identification number (last six characters of the GHCN-Daily ID) E = Identification number used in the ECA&D non-blended dataset M = World Meteorological Organization ID (last five characters of the GHCN-Daily ID) N = Identification number used in data supplied by a National Meteorological or Hydrological Center R = U.S. Interagency Remote Automatic Weather Station (RAWS) identifier S = U.S. Natural Resources Conservation Service SNOwpack TELemtry (SNOTEL) station identifier W = WBAN identification number (last five characters of the GHCN-Daily ID) LATITUDE is latitude of the station (in decimal degrees). LONGITUDE is the longitude of the station (in decimal degrees). ELEVATION is the elevation of the station (in meters, missing = -999.9). STATE is the U.S. postal code for the state (for U.S. stations only). NAME is the name of the station. GSNFLAG is a flag that indicates whether the station is part of the GCOS Surface Network (GSN). The flag is assigned by cross-referencing the number in the WMOID field with the official list of GSN stations. There are two possible values: Blank = non-GSN station or WMO Station number not available GSN = GSN station HCNFLAG is a flag that indicates whether the station is part of the U.S. Historical Climatology Network (HCN). There are two possible values: Blank = non-HCN station HCN = HCN station WMOID is the World Meteorological Organization (WMO) number for the station. If the station has no WMO number, then the field is blank. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- V. 
FORMAT OF "ghcnd-countries.txt" ------------------------------ Variable Columns Type ------------------------------ CODE 1-2 Character NAME 4-50 Character ------------------------------ These variables have the following definitions: CODE is the FIPS country code of the country where the station is located (from FIPS Publication 10-4 at www.cia.gov/cia/publications/factbook/appendix/appendix-d.html). NAME is the name of the country. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- VI. FORMAT OF "ghcnd-states.txt" ------------------------------ Variable Columns Type ------------------------------ CODE 1-2 Character NAME 4-50 Character ------------------------------ These variables have the following definitions: CODE is the POSTAL code of the U.S. state/territory or Canadian province where the station is located NAME is the name of the state, territory or province. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- VII. FORMAT OF "ghcnd-inventory.txt" ------------------------------ Variable Columns Type ------------------------------ ID 1-11 Character LATITUDE 13-20 Real LONGITUDE 22-30 Real ELEMENT 32-35 Character FIRSTYEAR 37-40 Integer LASTYEAR 42-45 Integer ------------------------------ These variables have the following definitions: ID is the station identification code. Please see "ghcnd-stations.txt" for a complete list of stations and their metadata. LATITUDE is the latitude of the station (in decimal degrees). LONGITUDE is the longitude of the station (in decimal degrees). ELEMENT is the element type. See section III for a definition of elements. FIRSTYEAR is the first year of unflagged data for the given element. LASTYEAR is the last year of unflagged data for the given element. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- VIII. REFERENCES Klein Tank, A.M.G. and Coauthors, 2002. Daily dataset of 20th-century surface air temperature and precipitation series for the European Climate Assessment. Int. J. of Climatol., 22, 1441-1453. Data and metadata available at http://eca.knmi.nl For additional information, please send an e-mail to ncdc.ghcnd@noaa.gov.
# Pad all lines to length 90 to work around a problem with read_fwf
out=open('ghcnd-stations_buffered.txt','w')
for line in open('ghcnd-stations.txt','r').readlines():
    line=line.rstrip()
    string=line+' '*(90-len(line))+'\n'
    out.write(string)
out.close()
# 0-based column ranges, following section IV of the readme
colspecs = [(0, 11), (11, 21), (21, 31), (31, 38),(39,41),(41,72),(72,76),(76,80),(80,86)]
stations = pd.read_fwf('ghcnd-stations_buffered.txt', colspecs=colspecs, header=None, index_col=0,
                       names=['latitude','longitude','elevation','state','name','GSNFLAG','HCNFLAG','WMOID'])
#stations['elevation'][stations['elevation']==-999.9]=0 # decided not to remove -999.9 because this confuses hist
stations.head()
  | latitude | longitude | elevation | state | name | GSNFLAG | HCNFLAG | WMOID
---|---|---|---|---|---|---|---|---
ACW00011604 | 17.1167 | -61.7833 | 10.1 | NaN | ST JOHNS COOLIDGE FLD | NaN | NaN | NaN |
ACW00011647 | 17.1333 | -61.7833 | 19.2 | NaN | ST JOHNS | NaN | NaN | NaN |
AE000041196 | 25.3330 | 55.5170 | 34.0 | NaN | SHARJAH INTER. AIRP | GSN | NaN | 41196 |
AF000040930 | 35.3170 | 69.0170 | 3366.0 | NaN | NORTH-SALANG | GSN | NaN | 40930 |
AG000060390 | 36.7167 | 3.2500 | 24.0 | NaN | ALGER-DAR EL BEIDA | GSN | NaN | 60390 |
5 rows × 8 columns
Join the geographical information into Dout, creating a new dataframe called Djoined
Djoined=Dout.join(stations,on='station')
Djoined.columns[-10:]
Index([364, 365, u'latitude', u'longitude', u'elevation', u'state', u'name', u'GSNFLAG', u'HCNFLAG', u'WMOID'], dtype='object')
Djoined['AbsLatitude']=abs(Djoined['latitude'].values)
Djoined.ix[:5,['station',u'longitude','latitude',u'AbsLatitude','Mean','Std','V0','V1','V2']]
  | station | longitude | latitude | AbsLatitude | Mean | Std | V0 | V1 | V2
---|---|---|---|---|---|---|---|---|---
0 | USC00507570 | -154.3164 | 60.2036 | 60.2036 | 8.531507 | 10.439819 | 142.481893 | -52.954486 | 33.906654 |
1 | NOE00135018 | 10.3481 | 59.2300 | 59.2300 | 14.494545 | 7.974400 | 57.593976 | -51.171107 | 21.458607 |
2 | KZ000036546 | 83.6830 | 48.5500 | 48.5500 | 7.220513 | 14.897310 | 56.464932 | 27.415070 | 7.181212 |
3 | USC00054664 | -106.3681 | 40.0575 | 40.0575 | 18.576860 | 10.790115 | 23.514882 | 11.358559 | 23.872717 |
4 | CUW00011706 | -75.1500 | 19.9000 | 19.9000 | 31.321370 | 1.588618 | -314.692267 | -25.150375 | -9.395788 |
5 | KG000036982 | 78.2330 | 41.8830 | 41.8830 | -12.500000 | 2.500000 | 2.583140 | 1.701492 | 2.610502 |
6 rows × 9 columns
Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].cov()
  | latitude | elevation | Mean | Std | V0 | V1 | V2 | V3 | V4 | V5
---|---|---|---|---|---|---|---|---|---|---
latitude | 357.338215 | 610.684430 | -84.563827 | 32.995352 | 1677.110745 | 490.487718 | -3.894549 | 47.114093 | -16.377163 | 23.548762 |
elevation | 610.684430 | 402304.438251 | -553.680558 | 147.390961 | 10074.869350 | 295.360309 | 870.066289 | 2576.703177 | 894.432121 | 1006.311773 |
Mean | -84.563827 | -553.680558 | 61.433472 | -12.770270 | -941.192271 | 52.100901 | -3.614234 | -5.444206 | 4.957587 | -3.803041 |
Std | 32.995352 | 147.390961 | -12.770270 | 10.885685 | 294.079400 | 85.942955 | 3.992569 | 0.456268 | -2.519408 | 0.569446 |
V0 | 1677.110745 | 10074.869350 | -941.192271 | 294.079400 | 18130.395583 | 119.263114 | 7.069433 | 115.150954 | -108.714807 | 75.407348 |
V1 | 490.487718 | 295.360309 | 52.100901 | 85.942955 | 119.263114 | 3193.165902 | -13.903190 | -20.520656 | -13.370889 | 15.652342 |
V2 | -3.894549 | 870.066289 | -3.614234 | 3.992569 | 7.069433 | -13.903190 | 553.284374 | 79.899477 | -3.477208 | -26.253103 |
V3 | 47.114093 | 2576.703177 | -5.444206 | 0.456268 | 115.150954 | -20.520656 | 79.899477 | 368.272027 | -5.695783 | -22.627756 |
V4 | -16.377163 | 894.432121 | 4.957587 | -2.519408 | -108.714807 | -13.370889 | -3.477208 | -5.695783 | 241.268316 | -1.128021 |
V5 | 23.548762 | 1006.311773 | -3.803041 | 0.569446 | 75.407348 | 15.652342 | -26.253103 | -22.627756 | -1.128021 | 230.746986 |
10 rows × 10 columns
The correlations between different $V_i$ components should be zero, but they are not. Is this due to numerical roundoff errors? Are the correlations statistically significant for this sample size?
Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].corr()
  | latitude | elevation | Mean | Std | V0 | V1 | V2 | V3 | V4 | V5
---|---|---|---|---|---|---|---|---|---|---
latitude | 1.000000 | 0.050933 | -0.570855 | 0.529138 | 0.658898 | 0.459174 | -0.008759 | 0.129875 | -0.055776 | 0.082009 |
elevation | 0.050933 | 1.000000 | -0.111377 | 0.070434 | 0.117966 | 0.008241 | 0.058318 | 0.211691 | 0.090786 | 0.104445 |
Mean | -0.570855 | -0.111377 | 1.000000 | -0.493821 | -0.891743 | 0.117625 | -0.019602 | -0.036192 | 0.040718 | -0.031939 |
Std | 0.529138 | 0.070434 | -0.493821 | 1.000000 | 0.661912 | 0.460934 | 0.051442 | 0.007206 | -0.049157 | 0.011361 |
V0 | 0.658898 | 0.117966 | -0.891743 | 0.661912 | 1.000000 | 0.015674 | 0.002232 | 0.044564 | -0.051980 | 0.036867 |
V1 | 0.459174 | 0.008241 | 0.117625 | 0.460934 | 0.015674 | 1.000000 | -0.010460 | -0.018923 | -0.015233 | 0.018235 |
V2 | -0.008759 | 0.058318 | -0.019602 | 0.051442 | 0.002232 | -0.010460 | 1.000000 | 0.177005 | -0.009517 | -0.073475 |
V3 | 0.129875 | 0.211691 | -0.036192 | 0.007206 | 0.044564 | -0.018923 | 0.177005 | 1.000000 | -0.019108 | -0.077623 |
V4 | -0.055776 | 0.090786 | 0.040718 | -0.049157 | -0.051980 | -0.015233 | -0.009517 | -0.019108 | 1.000000 | -0.004781 |
V5 | 0.082009 | 0.104445 | -0.031939 | 0.011361 | 0.036867 | 0.018235 | -0.073475 | -0.077623 | -0.004781 | 1.000000 |
10 rows × 10 columns
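A rough rule of thumb (my addition): for $n$ independent rows the sampling noise of a correlation coefficient is about $1/\sqrt{n}$, so with $n \approx 20{,}000$ anything much beyond 0.014 is unlikely to be pure roundoff; by that standard several of the small $V_i$ correlations look real, and the 0.2 cutoff used next is deliberately conservative.

# two-sigma noise level for a correlation estimated from n rows
n=shape(Djoined)[0]
print n, 2/sqrt(n)   # roughly 0.014 for n=20000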
# Choosing significance threshold so that none of the correlations between the Vi-s are significant.
abs(Djoined[['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']].corr())>0.2
  | latitude | elevation | Mean | Std | V0 | V1 | V2 | V3 | V4 | V5
---|---|---|---|---|---|---|---|---|---|---
latitude | True | False | True | True | True | True | False | False | False | False |
elevation | False | True | False | False | False | False | False | True | False | False |
Mean | True | False | True | True | True | False | False | False | False | False |
Std | True | False | True | True | True | True | False | False | False | False |
V0 | True | False | True | True | True | False | False | False | False | False |
V1 | True | False | False | True | False | True | False | False | False | False |
V2 | False | False | False | False | False | False | True | False | False | False |
V3 | False | True | False | False | False | False | False | True | False | False |
V4 | False | False | False | False | False | False | False | False | True | False |
V5 | False | False | False | False | False | False | False | False | False | True |
10 rows × 10 columns
from pandas.tools.plotting import scatter_matrix
df = Djoined.ix[:,['latitude','elevation','Mean','Std','V0','V1','V2','V3','V4','V5']]
scatter_matrix(df, alpha=0.03, figsize=(20, 20), diagonal='kde');
X='latitude'
Djoined.ix[:,X].hist(bins=100);
The advantage of rplot is that you can get both the scatter points and a 2D density estimate (the contour lines), which is useful when you have a large number of data points. However, I don't know how to increase the number of contour lines so that the lower-density areas become visible. Can you fix this problem? Can you write code to create a scatter matrix using rplot?
# Taken from http://pandasplotting.blogspot.com/
import matplotlib.pyplot as plt
from pandas.tools import rplot
#plt.figure()
X='latitude';Y='Mean'
dfTmp=Djoined[[X,Y]].dropna()
dfTmp=dfTmp.iloc[:,:]
p = rplot.RPlot(dfTmp,x=X,y=Y)
p.add(rplot.GeomPoint())
p.add(rplot.GeomDensity2D())
p.render();
# To see the source of a method use ??
rplot.GeomDensity2D??
X='latitude';Y='Mean'
scatter(Djoined.loc[:,X],Djoined.loc[:,Y],alpha=0.05)
xlabel(X)
ylabel(Y)
<matplotlib.text.Text at 0x7ffbdf85e710>
#checking for an anomaly in the elevations of stations
Djoined[['station','elevation']][Djoined['elevation']<-500].head()
  | station | elevation
---|---|---
1244 | USC00301010 | -999.9 |
1312 | USC00095231 | -999.9 |
1747 | RSM00023707 | -999.9 |
1821 | BL000085041 | -999.9 |
2192 | USC00107878 | -999.9 |
5 rows × 2 columns
!grep ASN00010865 ghcnd-stations.txt
ASN00010865 -34.0333 117.2667 -999.9 LUMEAH
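The value -999.9 is the readme's missing-elevation sentinel. The notebook deliberately keeps it (see the comment next to read_fwf above); if you would rather treat it as missing, a minimal fix is:

# optional: replace the -999.9 missing-elevation sentinel with NaN
stations['elevation']=stations['elevation'].replace(-999.9,np.nan)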
Working through http://matplotlib.org/basemap/
lons=stations.ix[:,'longitude'].values
lats=stations.ix[:,'latitude'].values
station_names=stations.index.values
ll=len(lons)
lonmin=-180;lonmax=180;latsmin=-80;latsmax=80;
select=(lons>lonmin) * (lons<lonmax)*(lats>latsmin)*(lats<latsmax)
print sum(select)
station_names=station_names[select]
lons=lons[select]
lats=lats[select]
print len(lons),len(lats),len(station_names)
85273 85273 85273 85273
# http://matplotlib.org/basemap/users/merc.html
from mpl_toolkits.basemap import Basemap
import numpy as np
import matplotlib.pyplot as plt
# llcrnrlat,llcrnrlon,urcrnrlat,urcrnrlon
# are the lat/lon values of the lower left and upper right corners
# of the map.
# lat_ts is the latitude of true scale.
# resolution = 'i' means use intermediate resolution coastlines.
plt.figure(figsize=(15,10),dpi=300)
m = Basemap(projection='merc',llcrnrlat=latsmin,urcrnrlat=latsmax,\
llcrnrlon=lonmin,urcrnrlon=lonmax,lat_ts=20,resolution='i')
m.drawcoastlines()
m.fillcontinents(color='coral',lake_color='aqua')
# draw parallels and meridians.
parallels = np.arange(-80,81,10.)
# labels = [left,right,top,bottom]
m.drawparallels(parallels,labels=[False,True,True,False])
meridians = np.arange(10.,351.,20.)
m.drawmeridians(meridians,labels=[True,False,False,True])
#m.drawparallels(np.arange(-90.,91.,30.))
#m.drawmeridians(np.arange(-180.,181.,60.))
m.drawmapboundary(fill_color='aqua')
# draw map with markers for locations
x, y = m(lons,lats)
m.plot(x,y,'g.')
plt.title('weather stations')
plt.show()
To get to these coordinates on Google Maps, type the latitude and longitude in decimal in the search box or use: https://www.google.com/maps/place/72%C2%B018%2700.0%22S+170%C2%B013%2700.1%22E/@-72.3,170.216694,17z/data=!3m1!4b1!4m2!3m1!1s0x0:0x0
HW questions
def plot_reconstructions(selection,rows=2,columns=7):
    Recon=Eig*Prod.transpose()+Mean[:,np.newaxis]
    plt.figure(figsize=(columns*3,rows*3),dpi=300)
    j=1                               # matplotlib subplot indices are 1-based
    for i in selection:
        if j>rows*columns: break
        subplot(rows,columns,j);
        j += 1;
        plot(Recon[:,i])
        plot(Djoined.ix[i,1:365]);
        title(Djoined.ix[i,'station']+' / '+str(Djoined.ix[i,'year']))
        xlim([0,365])
Observe in the reconstructions below that the blue line fills in (extrapolation/interpolation) the places where the measurements are not available. It also reduces the fluctuations relative to the original line. Recall that we are using the k top eigenvectors, which explain about 88% of the variance.
plot_reconstructions(range(50,100),rows=7,columns=3)
Check how the approximations change/improve as you increase the number of coefficients
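One way to quantify this (a sketch I added; the k values and the masking of missing days are my choices): compute the RMS reconstruction error on the observed entries for increasing k, reusing the centered, zero-filled matrix from above:

# RMS error on observed entries as a function of the number of eigenvectors
mask = isnan(np.array(Dout.ix[:,1:365],dtype=float))==False
for kk in (5,10,20,50):
    Ek=np.matrix(U[:,:kk])
    R=(matrix*Ek)*Ek.T            # rank-kk reconstruction of the centered data
    err=np.array(matrix-R)
    print kk, sqrt(mean(err[mask]**2))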
hist(Djoined.ix[:,'V0'],bins=100);
selection= [i for i in range(shape(Djoined)[0]) if Djoined.ix[i,'latitude']<-10]
plot_reconstructions(selection,rows=7,columns=3)
shape(selection)
(1083,)
Can you reduce the reconstruction error (using a fixed number of eigenvectors) by splitting the stations according to region (for example country, state, or latitudinal range)? Note that having regions with very few readings defeats the purpose. A sketch of a starting point follows.
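A possible starting point (my sketch; 'country' is the two-letter FIPS prefix of the station ID, per the readme): check which regions have enough rows to support their own decomposition:

# group station-years by FIPS country prefix and count them
Djoined['country']=[s[:2] for s in Djoined['station']]
print Djoined.groupby('country').size().order(ascending=False).head(10)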