Loading spatial data into numpy arrays using GDAL
# Load one raster band into a numpy array via the GDAL bindings.
from osgeo import gdal

ds = gdal.Open("./data/current/pmean_sumrc.img")
if ds is None:
    # gdal.Open returns None (rather than raising) when the file is
    # missing or unreadable, so we must check and raise ourselves.
    raise Exception("No such raster")
a = ds.GetRasterBand(1).ReadAsArray()
del ds  # drop the dataset handle; GDAL offers no context manager here
type(a), a.shape
(numpy.ndarray, (201, 146))
But rasterio provides a much cleaner way.
# Same task with rasterio: context-managed open, no manual cleanup needed.
import rasterio

with rasterio.open("./data/current/pmean_sumrc.img") as ds:
    b = ds.read_band(1)  # rasterio<1.0 API; modern equivalent is ds.read(1)
type(b), b.shape
Display with matplotlib (and Quantum GIS)
%pylab inline
import os
figsize(6,6)
def plotit(x, title, cmap="Blues"):
plt.imshow(x, cmap=cmap, interpolation='nearest')
plt.colorbar()
plt.title(title)
plt.show()
plotit(b, "Summer Precip")
Populating the interactive namespace from numpy and matplotlib
# Read the agricultural-zones raster plus its georeferencing metadata.
with rasterio.open('./data/current/iso_zns3-27.img') as ds:
    # geotransform defines the grid dimensions
    # in a geographic reference system
    gt = ds.transform
    crs = ds.crs_wkt  # coordinate reference system as a WKT string
    x = ds.read_band(1)

plotit(x, "Ag Zones", cmap="Set3")
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2883: FutureWarning: The value of this property will change in version 1.0. Please see https://github.com/mapbox/rasterio/issues/86 for details. exec(code_obj, self.user_global_ns, self.user_ns)
Geotransform, cellsize, etc
# Report the georeferencing metadata extracted from the raster above.
# GDAL-style geotransform layout: (origin_x, cell_w, 0, origin_y, 0, cell_h).
origin_x, origin_y = gt[0], gt[3]
cell_width, cell_height = gt[1], gt[5]
print("upper left coordinates:", origin_x, origin_y)
print("cell size:", cell_width, cell_height)
print("coordinate reference system:", crs)
('upper left coordinates:', -2209965.195155943, 746575.4774815631) ('cell size:', 10000.0, -10000.0) ('coordinate reference system:', u'PROJCS["Lambert_Azimuthal_Equal_Area",GEOGCS["GCS_WGS_1984",DATUM["WGS_1984",SPHEROID["WGS_84",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",-100],PARAMETER["longitude_of_center",6370997],PARAMETER["false_easting",45],PARAMETER["false_northing",0],UNIT["METERS",1]]')
... a flatten-and-transpose transformation is necessary to work the data into the (N, M) format needed for scikit-learn
print x.shape
print x.flatten().shape
print np.array([x.flatten(), x.flatten()]).T.shape
(201, 146) (29346,) (29346, 2)
M explanatory features (x) and one response (y)
N observations (points or sampled)
train_xs.shape == (N, M)
train_y.shape == (N,)
load_training_rasters
function loads and transforms data via random stratified sampling or point data. First, random stratified sampling...
# Explanatory (predictor) layers, all from the present-day dataset.
# Each entry is (raster basename, human-readable description).
_EXPLANATORY_LAYERS = [
    ("tmin12c", "Min December Temperature"),
    ("tmax8c", "Max August Temperature"),
    ("p_ph_c", "Soil pH"),
    ("pmean_wntrc", "Mean Winter Precipitation"),
    ("pmean_sumrc", "Mean Summer Precipitation"),
    ("irr_lands", "Irrigated Lands"),
    ("gt_demc", "Elevation"),
    ("grwsnc", "Growing Season"),
    ("d2u2c", "Distance to Urban Areas"),
]
explanatory_rasters = ['./data/current/%s.img' % name
                       for name, _ in _EXPLANATORY_LAYERS]

# Response raster (i.e. Agricultural Zones)
response_raster = './data/current/iso_zns3-27.img'
# Build the full-coverage training set: one row per valid pixel, pairing the
# response value with the 9 explanatory values at that pixel.
# (No sampling yet — the stratified-sample variant follows below.)
from pyimpute import load_training_rasters
train_xs, train_y = load_training_rasters(response_raster, explanatory_rasters)
train_xs.shape, train_y.shape  # expect (N, 9) and (N,)
((29346, 9), (29346,))
Optionally take a random stratified sample
# Thin the training set with a random stratified sample of pixels.
# min_sample_proportion=0.5 presumably keeps at least half of each
# response class — TODO confirm against pyimpute docs.
from pyimpute import stratified_sample_raster
selected = stratified_sample_raster(response_raster, min_sample_proportion=0.5)
# Reload the training matrices restricted to the sampled pixels.
train_xs, train_y = load_training_rasters(response_raster, explanatory_rasters, selected)
train_xs.shape, train_y.shape
((14926, 9), (14926,))
Construct and train a scikit-learn classifier
# Train a random-forest classifier on the sampled training data.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training chain.
clf = RandomForestClassifier().fit(train_xs, train_y)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0)
Assess the predictive accuracy with cross validation
# Estimate generalization accuracy with k-fold cross validation.
from sklearn import cross_validation

k = 4
scores = cross_validation.cross_val_score(clf, train_xs, train_y, cv=k)
mean_pct = scores.mean() * 100
spread_pct = scores.std() * 200  # roughly a 2-sigma interval, in percent
print("%d-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (k, mean_pct, spread_pct))
4-fold Cross Validation Accuracy: 86.44 (+/- 1.57)
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:412: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=4. % (min_labels, self.n_folds)), Warning)
Assess feature importance
# Bar-plot the forest's relative feature importances, keyed by raster path.
import pandas as pd

importance_by_raster = dict(zip(explanatory_rasters, clf.feature_importances_))
fimp_df = pd.DataFrame.from_dict([importance_by_raster])
fimp_df.plot(kind="bar", figsize=(8, 8))
<matplotlib.axes.AxesSubplot at 0x7f7ad47596d0>
Evaluate other classifiers and parameters (rinse, repeat, ...)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time
# Candidate classifiers to compare: (label, estimator) pairs.
# Labels are padded with spaces so the timing report below lines up.
# SVC and logistic regression are present but disabled (commented out).
classifiers = [
('LDA ', LDA()),
('tree ', DecisionTreeClassifier()),
('Bayes ', GaussianNB()),
('kNN ', KNeighborsClassifier(n_neighbors=6, weights="distance")),
#('SVC-RBC ', SVC(kernel="rbf", gamma=2, C=1)),
#('logit ', LogisticRegression()),
('rf ', RandomForestClassifier(n_estimators=20, n_jobs=3)),
('extrarf ', ExtraTreesClassifier(n_estimators=20, n_jobs=3)),
]
for name, classifier in classifiers:
print name
start = time.time()
classifier.fit(train_xs, train_y)
scores = cross_validation.cross_val_score(clf, train_xs, train_y, cv=k)
print "\t%d-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (k, scores.mean() * 100, scores.std() * 200)
print "\t", time.time() - start, "seconds"
LDA 4-fold Cross Validation Accuracy: 86.40 (+/- 1.47) 1.52527713776 seconds tree 4-fold Cross Validation Accuracy: 86.43 (+/- 1.10) 1.57651185989 seconds Bayes 4-fold Cross Validation Accuracy: 86.34 (+/- 1.76) 1.49497318268 seconds kNN 4-fold Cross Validation Accuracy: 86.43 (+/- 0.83) 1.67774987221 seconds rf 4-fold Cross Validation Accuracy: 86.46 (+/- 0.69) 2.01868510246 seconds extrarf 4-fold Cross Validation Accuracy: 86.53 (+/- 1.08) 1.76619100571 seconds
load_target_data
function to load target rasters. Uses present-day climate rasters as explanatory variables
# Build the prediction (target) matrix from the same explanatory rasters.
# raster_info carries the grid metadata (srs, shape, geotransform) needed
# later to write predictions back out as rasters.
from pyimpute import load_targets
target_xs, raster_info = load_targets(explanatory_rasters)
print(raster_info)
print(target_xs.shape)
{'srs': 'PROJCS["Lambert_Azimuthal_Equal_Area",GEOGCS["GCS_WGS_1984",DATUM["WGS_1984",SPHEROID["WGS_84",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Lambert_Azimuthal_Equal_Area"],PARAMETER["latitude_of_center",-100],PARAMETER["longitude_of_center",6370997],PARAMETER["false_easting",45],PARAMETER["false_northing",0],UNIT["METERS",1]]', 'shape': (201, 146), 'gt': (-2209965.195155943, 10000.0, 0.0, 746575.4774815631, 0.0, -10000.0)} (29346, 9)
Predict the response variable using our selected scikit-learn classifier directly
# Predict an agricultural zone for every pixel, then show the result as a
# flat vector, a sample value, and re-gridded to the original raster shape.
agzones = clf.predict(target_xs)
grid_shape = raster_info['shape']
print(agzones.shape)
print(agzones[9668])
print(agzones.reshape(grid_shape).shape)
(29346,) 63 (201, 146)
Use the impute
function to predict response and write data to geospatial raster data formats.
# Predict over the target matrix and write the results out as rasters:
# responses.img plus per-class probability grids and a certainty grid.
from pyimpute import impute
impute(target_xs, clf, raster_info,
outdir="_aez_output_current",
class_prob=True, certainty=True)
!ls -1 "./_aez_output_current"
certainty.img probability_0.img probability_100.img probability_101.img probability_102.img probability_103.img probability_104.img probability_105.img probability_106.img probability_107.img probability_108.img probability_109.img probability_10.img probability_110.img probability_111.img probability_112.img probability_113.img probability_114.img probability_115.img probability_116.img probability_117.img probability_118.img probability_119.img probability_11.img probability_120.img probability_121.img probability_122.img probability_123.img probability_124.img probability_125.img probability_126.img probability_127.img probability_128.img probability_129.img probability_12.img probability_130.img probability_131.img probability_132.img probability_133.img probability_134.img probability_135.img probability_136.img probability_137.img probability_138.img probability_139.img probability_13.img probability_140.img probability_141.img probability_142.img probability_143.img probability_144.img probability_145.img probability_146.img probability_147.img probability_148.img probability_149.img probability_14.img probability_150.img probability_151.img probability_152.img probability_153.img probability_154.img probability_155.img probability_156.img probability_157.img probability_158.img probability_159.img probability_15.img probability_160.img probability_161.img probability_162.img probability_163.img probability_164.img probability_165.img probability_166.img probability_167.img probability_168.img probability_169.img probability_16.img probability_170.img probability_171.img probability_172.img probability_173.img probability_174.img probability_175.img probability_176.img probability_17.img probability_18.img probability_19.img probability_1.img probability_20.img probability_21.img probability_22.img probability_23.img probability_24.img probability_25.img probability_26.img probability_27.img probability_28.img probability_29.img probability_2.img 
probability_30.img probability_31.img probability_32.img probability_33.img probability_34.img probability_35.img probability_36.img probability_37.img probability_38.img probability_39.img probability_3.img probability_40.img probability_41.img probability_42.img probability_43.img probability_44.img probability_45.img probability_46.img probability_47.img probability_48.img probability_49.img probability_4.img probability_50.img probability_51.img probability_52.img probability_53.img probability_54.img probability_55.img probability_56.img probability_57.img probability_58.img probability_59.img probability_5.img probability_60.img probability_61.img probability_62.img probability_63.img probability_64.img probability_65.img probability_66.img probability_67.img probability_68.img probability_69.img probability_6.img probability_70.img probability_71.img probability_72.img probability_73.img probability_74.img probability_75.img probability_76.img probability_77.img probability_78.img probability_79.img probability_7.img probability_80.img probability_81.img probability_82.img probability_83.img probability_84.img probability_85.img probability_86.img probability_87.img probability_88.img probability_89.img probability_8.img probability_90.img probability_91.img probability_92.img probability_93.img probability_94.img probability_95.img probability_96.img probability_97.img probability_98.img probability_99.img probability_9.img responses.img
Inspect the results
# Compare the imputed zone raster against the original response raster.
with rasterio.open("./_aez_output_current/responses.img") as ds:
    zones = ds.read_band(1)
with rasterio.open("./data/current/iso_zns3-27.img") as ds:
    zones_orig = ds.read_band(1)

plotit(zones, "Ag Zones", cmap='Set3')
plotit(zones_orig, "Ag Zones", cmap="Set3")
Define a new set of explanatory target data from raster datasets of modeled climate in 2070 (from the HadGEM3 RCP8.5 model)
# Target scenario: 2070s climate (HadGEM3, RCP 8.5) for the climate-driven
# layers; static layers (soil, irrigation, elevation, urban distance) keep
# their present-day rasters.
_FUTURE = '2070s_HadGEM3_rcp85'
explanatory_2070_rasters = [
    './data/%s/%s.img' % (folder, name)
    for folder, name in [
        (_FUTURE, 'tmin12c'),       # Min December Temperature
        (_FUTURE, 'tmax8c'),        # Max August Temperature
        ('current', 'p_ph_c'),      # Soil pH
        (_FUTURE, 'pmean_wntrc'),   # Mean Winter Precipitation
        (_FUTURE, 'pmean_sumrc'),   # Mean Summer Precipitation
        ('current', 'irr_lands'),   # Irrigated Lands
        ('current', 'gt_demc'),     # Elevation
        (_FUTURE, 'grwsnc'),        # Growing Season
        ('current', 'd2u2c'),       # Distance to Urban Areas
    ]
]
Load target data and impute future agricultural zones
# Rebuild the target matrix from the 2070 rasters and impute future zones
# into a separate output directory.
target_xs, raster_info = load_targets(explanatory_2070_rasters)
impute(target_xs, clf, raster_info,
outdir="_aez_output_2070s_HadGEM3_rcp85",
class_prob=True, certainty=True)
!ls -1 "./_aez_output_2070s_HadGEM3_rcp85"
certainty.img probability_0.img probability_100.img probability_101.img probability_102.img probability_103.img probability_104.img probability_105.img probability_106.img probability_107.img probability_108.img probability_109.img probability_10.img probability_110.img probability_111.img probability_112.img probability_113.img probability_114.img probability_115.img probability_116.img probability_117.img probability_118.img probability_119.img probability_11.img probability_120.img probability_121.img probability_122.img probability_123.img probability_124.img probability_125.img probability_126.img probability_127.img probability_128.img probability_129.img probability_12.img probability_130.img probability_131.img probability_132.img probability_133.img probability_134.img probability_135.img probability_136.img probability_137.img probability_138.img probability_139.img probability_13.img probability_140.img probability_141.img probability_142.img probability_143.img probability_144.img probability_145.img probability_146.img probability_147.img probability_148.img probability_149.img probability_14.img probability_150.img probability_151.img probability_152.img probability_153.img probability_154.img probability_155.img probability_156.img probability_157.img probability_158.img probability_159.img probability_15.img probability_160.img probability_161.img probability_162.img probability_163.img probability_164.img probability_165.img probability_166.img probability_167.img probability_168.img probability_169.img probability_16.img probability_170.img probability_171.img probability_172.img probability_173.img probability_174.img probability_175.img probability_176.img probability_17.img probability_18.img probability_19.img probability_1.img probability_20.img probability_21.img probability_22.img probability_23.img probability_24.img probability_25.img probability_26.img probability_27.img probability_28.img probability_29.img probability_2.img 
probability_30.img probability_31.img probability_32.img probability_33.img probability_34.img probability_35.img probability_36.img probability_37.img probability_38.img probability_39.img probability_3.img probability_40.img probability_41.img probability_42.img probability_43.img probability_44.img probability_45.img probability_46.img probability_47.img probability_48.img probability_49.img probability_4.img probability_50.img probability_51.img probability_52.img probability_53.img probability_54.img probability_55.img probability_56.img probability_57.img probability_58.img probability_59.img probability_5.img probability_60.img probability_61.img probability_62.img probability_63.img probability_64.img probability_65.img probability_66.img probability_67.img probability_68.img probability_69.img probability_6.img probability_70.img probability_71.img probability_72.img probability_73.img probability_74.img probability_75.img probability_76.img probability_77.img probability_78.img probability_79.img probability_7.img probability_80.img probability_81.img probability_82.img probability_83.img probability_84.img probability_85.img probability_86.img probability_87.img probability_88.img probability_89.img probability_8.img probability_90.img probability_91.img probability_92.img probability_93.img probability_94.img probability_95.img probability_96.img probability_97.img probability_98.img probability_99.img probability_9.img responses.img
Compare current zones to predicted future zones
# Side-by-side: current zones, predicted 2070 zones, and a boolean mask of
# cells whose predicted zone changed.
plotit(zones, "Ag Zones", cmap='Set3')

with rasterio.open("./_aez_output_2070s_HadGEM3_rcp85/responses.img") as ds:
    future_zones = ds.read_band(1)

plotit(future_zones, "Ag Zones 2070", cmap='Set3')

diff = np.not_equal(future_zones, zones)
plotit(diff, "Shift in Ag Zones by 2070")
Most scikit-learn classifiers are not deterministic... we can see the probability of belonging to any given zone
# Per-class probability surface for one zone (127), cropped to a window.
with rasterio.open("./_aez_output_2070s_HadGEM3_rcp85/probability_127.img") as ds:
    future_zone127 = ds.read_band(1)

plotit(future_zone127[0:80,0:75], "Zone 127 in 2070", cmap="Greys")
The zone that is chosen is the most probable. Therefore the max probability can be interpreted as a spatially explicit measure of overall uncertainty.
# certainty.img (written by impute) holds the winning class's probability
# per pixel — a spatially explicit measure of prediction confidence.
with rasterio.open("./_aez_output_2070s_HadGEM3_rcp85/certainty.img") as ds:
    certainty = ds.read_band(1)

plotit(certainty, "Overall Certainty", cmap="RdYlBu")
Other sources of uncertainty: agreement across CGMs and RCPs
# image of BLM agreement map (a static figure, not computed in this notebook)
from IPython.core.display import Image
Image(filename='./img/dougfir_viability.png')
One-vs-all classifier: set of decision trees for each possible class.
With enough classes, training data and target data, the memory requirements can get out of hand. Can mitigate some with linechunk
.
# Benchmark: whole-grid prediction (default) vs. line-by-line (linechunk=1);
# smaller chunks trade speed for a lower memory footprint.
%timeit impute(target_xs, clf, raster_info, outdir="_test_default", class_prob=True, certainty=True)
%timeit impute(target_xs, clf, raster_info, linechunk=1, outdir="_test_lc1", class_prob=True, certainty=True)
1 loops, best of 3: 1.39 s per loop 1 loops, best of 3: 3.79 s per loop
Increasing the number of trees in RandomForests increases accuracy (to a point)
Image("./img/Rplot01.png")  # static figure: accuracy vs. number of trees
Increasing training sample size helps a lot
Image("./img/Rplot02.png")  # static figure: accuracy vs. training sample size
...BUT increasing sample size and trees dramatically increases run time (and can approach memory limitations)
Image("./img/Rplot.png")  # static figure: run time vs. sample size and trees
Takehome message - get as much training data as you can and increase trees as much as you can before you hit limits of your system memory or patience.
Rewind a bit...
Many problems will not have full coverage of the response variables, only point observations.
Example: Douglas fir presence/absence
# Point observations: Douglas-fir presence/absence from a shapefile.
from geopandas import GeoDataFrame
points = './data/dougfir/DF_PNW_prj.shp'
dougfir = GeoDataFrame.from_file(points)
dougfir['GRIDCODE'][10:19]  # GRIDCODE holds the 0/1 presence response
/usr/local/lib/python2.7/dist-packages/pkg_resources.py:1045: UserWarning: /home/mperry/.python-eggs is writable by group/others and vulnerable to attack when used with get_resource_filename. Consider a more secure location (set with .set_extraction_path or the PYTHON_EGG_CACHE environment variable). warnings.warn(msg, UserWarning)
10 1 11 0 12 1 13 1 14 1 15 1 16 0 17 1 18 1 Name: GRIDCODE, dtype: int64
As displayed in QGIS
Image("./img/dfpoints.png")  # static screenshot of the points in QGIS
Load training rasters using zonal statistics (requires the rasterstats
module)
# load training from vector dataset
# Sample each explanatory raster at the point locations (via rasterstats)
# and take the response values from the GRIDCODE attribute.
from pyimpute import load_training_vector
train_xs, train_y = load_training_vector(points, explanatory_rasters, response_field='GRIDCODE')
train_xs.shape, train_y.shape  # expect (n_points, 9) and (n_points,)
((1333, 9), (1333,))
Fit a model to the observed point data using current climate
# Refit the random forest against the point-sampled presence/absence data;
# fit() returns the estimator, so construction and training chain.
clf = RandomForestClassifier().fit(train_xs, train_y)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0)
Map future climatic suitability
# Impute future Douglas-fir suitability, then inspect the presence-class
# (probability_1) surface over a cropped window.
impute(target_xs, clf, raster_info,
       outdir="_df_output_2070",
       class_prob=True, certainty=True)

with rasterio.open("./_df_output_2070/probability_1.img") as ds:
    future_df = ds.read_band(1)

plotit(future_df[25:125,10:80], "Douglas-fir - 2070 Viable Range", cmap="Greens")