Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3# spechomo, Spectral homogenization of multispectral satellite data
4#
5# Copyright (C) 2019-2021
6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de)
7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam,
8# Germany (https://www.gfz-potsdam.de/)
9#
10# This software was developed within the context of the GeoMultiSens project funded
11# by the German Federal Ministry of Education and Research
12# (project grant code: 01 IS 14 010 A-C).
13#
14# Licensed under the Apache License, Version 2.0 (the "License");
15# you may not use this file except in compliance with the License.
16# You may obtain a copy of the License at
17#
18# http://www.apache.org/licenses/LICENSE-2.0
19#
20# Please note the following exception: `spechomo` depends on tqdm, which is
21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files
22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore".
23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE.
24#
25# Unless required by applicable law or agreed to in writing, software
26# distributed under the License is distributed on an "AS IS" BASIS,
27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28# See the License for the specific language governing permissions and
29# limitations under the License.
31import os
32import re
33from glob import glob
34from multiprocessing import cpu_count
35from typing import List, Tuple, Union, Dict, TYPE_CHECKING # noqa F401 # flake8 issue
36import logging # noqa F401 # flake8 issue
37from itertools import product
39import dill
40import numpy as np
41from nested_dict import nested_dict
42from pandas import DataFrame
43from tqdm import tqdm
44from geoarray import GeoArray
45from pyrsr import RSR
46from pyrsr.sensorspecs import get_LayerBandsAssignment
48if TYPE_CHECKING:
49 # avoid the sklearn imports on module level to get rid of static TLS ImportError
50 from sklearn.ensemble import RandomForestRegressor
51 from sklearn.linear_model import LinearRegression, Ridge
52 from sklearn.pipeline import Pipeline
54from .clustering import KMeansRSImage
55from .resampling import SpectralResampler
56from .training_data import RefCube
57from .logging import SpecHomo_Logger
58from .options import options
class ReferenceCube_Generator(object):
    """Class for creating reference cubes that are later used as training data for SpecHomo_Classifier."""

    def __init__(self, filelist_refs, tgt_sat_sen_list=None, dir_refcubes='', n_clusters=10, tgt_n_samples=1000,
                 v=False, logger=None, CPUs=None, dir_clf_dump=''):
        # type: (List[str], List[Tuple[str, str]], str, int, int, bool, logging.Logger, Union[None, int], str) -> None
        """Initialize ReferenceCube_Generator.

        :param filelist_refs:       list of (hyperspectral) reference images,
                                    representing BOA reflectance, scaled between 0 and 10000
        :param tgt_sat_sen_list:    list of satellite/sensor tuples containing those sensors for which the reference
                                    cube is to be computed, e.g. [('Landsat-8', 'OLI_TIRS',), ('Landsat-5', 'TM')]
        :param dir_refcubes:        output directory for the generated reference cube
        :param n_clusters:          number of clusters to be used for clustering the input images (KMeans)
        :param tgt_n_samples:       number of spectra to be collected from each input image
        :param v:                   verbose mode
        :param logger:              instance of logging.Logger()
        :param CPUs:                number of CPUs to use for computation
        :param dir_clf_dump:        directory where to store the serialized KMeans classifier
        """
        if not filelist_refs:
            raise ValueError('The given list of reference images is empty.')

        # args + kwargs
        self.ims_ref = [filelist_refs, ] if isinstance(filelist_refs, str) else filelist_refs
        self.tgt_sat_sen_list = tgt_sat_sen_list or [
            ('Landsat-8', 'OLI_TIRS'),
            ('Landsat-7', 'ETM+'),
            ('Landsat-5', 'TM'),
            ('Sentinel-2A', 'MSI'),
            # ('Terra', 'ASTER'),  # currently does not work
            ('SPOT-4', 'HRVIR1'),
            ('SPOT-4', 'HRVIR2'),
            ('SPOT-5', 'HRG1'),
            ('SPOT-5', 'HRG2'),
            ('RapidEye-5', 'MSI')
        ]
        self.dir_refcubes = os.path.abspath(dir_refcubes) if dir_refcubes else ''
        self.n_clusters = n_clusters
        self.tgt_n_samples = tgt_n_samples
        self.v = v
        self.logger = logger or SpecHomo_Logger(__name__)  # must be pickable
        self.CPUs = CPUs or cpu_count()
        self.dir_clf_dump = dir_clf_dump

        # privates
        # pre-create one empty RefCube per target sensor; filled lazily via the 'refcubes' property
        self._refcubes = \
            {(sat, sen): RefCube(satellite=sat, sensor=sen,
                                 LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen))
             for sat, sen in self.tgt_sat_sen_list}

        if dir_refcubes and not os.path.isdir(self.dir_refcubes):
            os.makedirs(self.dir_refcubes)

    @property
    def refcubes(self):
        # type: () -> Dict[Tuple[str, str], RefCube]
        """Return a dict holding instances of RefCube for each target satellite / sensor of self.tgt_sat_sen_list."""
        if not self._refcubes:

            # fill self._ref_cubes with GeoArray instances of already existing reference cubes read from disk
            if self.dir_refcubes:
                for path_refcube in glob(os.path.join(self.dir_refcubes, 'refcube__*.bsq')):
                    # TODO check if that really works
                    # check if current refcube path matches the target refcube specifications
                    identifier = re.search('refcube__(.*).bsq', os.path.basename(path_refcube)).group(1)
                    sat, sen, nclust_str, nsamp_str = identifier.split('__')  # type: str
                    # FIXED: the sample count was previously parsed with 'nclust' as separator
                    #        (int(nsamp_str.split('nclust')[1])), which raised an IndexError for any
                    #        refcube filename of the form 'refcube__<sat>__<sen>__nclust<N>__nsamp<M>.bsq'
                    nclust, nsamp = int(nclust_str.split('nclust')[1]), int(nsamp_str.split('nsamp')[1])
                    correct_specs = all([(sat, sen) in self.tgt_sat_sen_list,
                                         nclust == self.n_clusters,
                                         nsamp == self.tgt_n_samples])

                    # import the existing ref cube if it matches the target refcube specs
                    if correct_specs:
                        self._refcubes[(sat, sen)] = \
                            RefCube(satellite=sat, sensor=sen, filepath=path_refcube,
                                    LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen))

        return self._refcubes

    @staticmethod
    def _get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen):
        # type: (str, str) -> list
        """Get the LayerBandsAssignment for the specified target sensor.

        NOTE: The returned bands list always contains all possible bands. Panchromatic bands or bands not available
              after atmospheric correction are removed. Further band selections are later done using np.take().
        NOTE: We don't use L1A here because the signal of the additional bands at L1A is not predictable by spectral
              harmonization as these bands are not driven by surface albedo but by atmospheric conditions (945, 1373nm).

        :param tgt_sat:     target satellite
        :param tgt_sen:     target sensor
        :return:
        """
        return get_LayerBandsAssignment(tgt_sat, tgt_sen,
                                        no_pan=False, no_thermal=True, sort_by_cwl=True, after_ac=True)

    @staticmethod
    def _get_tgt_RSR_object(tgt_sat, tgt_sen):
        # type: (str, str) -> RSR
        """Get an RSR instance containing the spectral response functions for the target sensor.

        :param tgt_sat:     target satellite
        :param tgt_sen:     target sensor
        :return:
        """
        return RSR(tgt_sat, tgt_sen, no_pan=False, no_thermal=True, after_ac=True)

    def generate_reference_cubes(self, fmt_out='ENVI', try_read_dumped_clf=True, sam_classassignment=False,
                                 max_distance='80%', max_angle=6, nmin_unique_spectra=50, alg_nodata='radical',
                                 progress=True):
        # type: (str, bool, bool, Union[int, float, str], Union[int, float, str], int, str, bool) -> ReferenceCube_Generator.refcubes  # noqa
        """Generate reference spectra from all hyperspectral input images.

        Workflow:
        1. Clustering/classification of hyperspectral images and selection of a given number of random signatures
           (a. Spectral downsampling to lower spectral resolution (speedup))
           b. KMeans clustering
           c. Selection of the same number of signatures from each cluster to avoid unequal amount of training data.
        2. Spectral resampling of the selected hyperspectral signatures (for each input image)
        3. Add resampled spectra to reference cubes for each target sensor and write cubes to disk

        :param fmt_out:             output format (GDAL driver code)
        :param try_read_dumped_clf: try to read a previously serialized KMeans classifier from disk
                                    (massively speeds up the RefCube generation)
        :param sam_classassignment: False: use minimal Euclidean distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param max_distance:        spectra with a larger spectral distance than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral distance is computed as 20%
                                      percentile within each cluster
        :param max_angle:           spectra with a larger spectral angle than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral angle is computed as 20%
                                      percentile within each cluster
        :param nmin_unique_spectra: in case a cluster has less than the given number,
                                    do not include it in the reference cube (default: 50)
        :param alg_nodata:          algorithm how to deal with pixels where the spectral bands of the source image
                                    contain nodata within the spectral response of a target band
                                    'radical':      set output band to nodata
                                    'conservative': use existing spectral information and ignore nodata
                                                    (might alter the output spectral information,
                                                     e.g., at spectral absorption bands)
        :param progress:            show progress bar (default: True)
        :return:                    np.array: [tgt_n_samples x images x spectral bands of the target sensor]
        """
        for im in self.ims_ref:  # type: Union[str, GeoArray]
            # TODO implement check if current image is already included in the refcube -> skip in that case
            src_im = GeoArray(im)

            # get random spectra of the original (hyperspectral) image, equally distributed over all computed clusters
            baseN = os.path.splitext(os.path.basename(im))[0] if isinstance(im, str) else im.basename
            unif_random_spectra = \
                self.cluster_image_and_get_uniform_spectra(src_im,
                                                           try_read_dumped_clf=try_read_dumped_clf,
                                                           sam_classassignment=sam_classassignment,
                                                           max_distance=max_distance,
                                                           max_angle=max_angle,
                                                           nmin_unique_spectra=nmin_unique_spectra,
                                                           progress=progress,
                                                           basename_clf_dump=baseN).astype(np.int16)

            # resample the set of random spectra to match the spectral characteristics of all target sensors
            for tgt_sat, tgt_sen in self.tgt_sat_sen_list:
                # perform spectral resampling
                self.logger.info('Performing spectral resampling to match %s %s specifications...'
                                 % (tgt_sat, tgt_sen))
                unif_random_spectra_rsp = self.resample_spectra(
                    unif_random_spectra,
                    src_cwl=np.array(src_im.meta.band_meta['wavelength'], dtype=float).flatten(),
                    tgt_rsr=self._get_tgt_RSR_object(tgt_sat, tgt_sen),
                    nodataVal=src_im.nodata,
                    alg_nodata=alg_nodata)

                # add the spectra as GeoArray instance to the in-mem ref cubes
                refcube = self.refcubes[(tgt_sat, tgt_sen)]  # type: RefCube
                refcube.add_spectra(unif_random_spectra_rsp, src_imname=src_im.basename,
                                    LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen))

                # update the reference cubes on disk
                if self.dir_refcubes:
                    refcube.save(path_out=os.path.join(self.dir_refcubes, 'refcube__%s__%s__nclust%s__nsamp%s.bsq'
                                                       % (tgt_sat, tgt_sen, self.n_clusters, self.tgt_n_samples)),
                                 fmt=fmt_out)

        return self.refcubes

    def cluster_image_and_get_uniform_spectra(self, im, downsamp_sat=None, downsamp_sen=None, basename_clf_dump='',
                                              try_read_dumped_clf=True, sam_classassignment=False, max_distance='80%',
                                              max_angle=6, nmin_unique_spectra=50, progress=False):
        # type: (Union[str, GeoArray, np.ndarray], str, str, str, bool, bool, Union[int, float, str], Union[int, float, str], int, bool) -> np.ndarray  # noqa
        """Compute KMeans clusters for the given image and return an array of uniform random samples.

        :param im:                  image to be clustered
        :param downsamp_sat:        satellite code used for intermediate image dimensionality reduction (input image is
                                    spectrally resampled to this satellite before it is clustered). requires
                                    downsamp_sen. If it is None, no intermediate downsampling is performed.
        :param downsamp_sen:        sensor code used for intermediate image dimensionality reduction
                                    (requires downsamp_sat)
        :param basename_clf_dump:   basename of serialized KMeans classifier
        :param try_read_dumped_clf: try to read a previously serialized KMeans classifier from disk
                                    (massively speeds up the RefCube generation)
        :param sam_classassignment: False: use minimal Euclidean distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param max_distance:        spectra with a larger spectral distance than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral distance is computed as 20%
                                      percentile within each cluster
        :param max_angle:           spectra with a larger spectral angle than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral angle is computed as 20%
                                      percentile within each cluster
        :param nmin_unique_spectra: in case a cluster has less than the given number,
                                    do not include it in the reference cube (default: 50)
        :param progress:            whether to show progress bars or not
        :return:                    2D array (rows: tgt_n_samples, columns: spectral information / bands)
        """
        # input validation
        if (downsamp_sat and not downsamp_sen) or (downsamp_sen and not downsamp_sat):
            # FIXED: the error message previously referred to non-existing parameter names
            #        ('spec_rsp_sat' / 'spec_rsp_sen' instead of 'downsamp_sat' / 'downsamp_sen')
            raise ValueError("The parameters 'downsamp_sat' and 'downsamp_sen' must both be provided or completely "
                             "omitted.")

        im2clust = GeoArray(im)

        # get a KMeans classifier for the hyperspectral image
        path_clf = os.path.join(self.dir_clf_dump, '%s__KMeansClf.dill' % basename_clf_dump)
        if not sam_classassignment:
            path_clustermap = os.path.join(self.dir_clf_dump,
                                           '%s__KMeansClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters))
        else:
            path_clustermap = os.path.join(self.dir_clf_dump,
                                           '%s__SAMClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters))

        if try_read_dumped_clf and os.path.isfile(path_clf):
            # read the previously dumped classifier from disk
            self.logger.info('Reading previously serialized KMeans classifier from %s...' % path_clf)
            kmeans = KMeansRSImage.from_disk(path_clf=path_clf, im=im2clust)

        else:
            # create KMeans classifier
            # first, perform spectral resampling to Sentinel-2 to reduce dimensionality (speedup)
            if downsamp_sat and downsamp_sen:
                # NOTE: The KMeansRSImage class already reduces the input data to 1 million spectra by default
                tgt_rsr = RSR(satellite=downsamp_sat, sensor=downsamp_sen, no_pan=True, no_thermal=True)
                im2clust = self.resample_image_spectrally(im2clust, tgt_rsr, progress=progress)  # output = int16

            # compute KMeans clusters for the spectrally resampled image
            # NOTE: Nodata values are ignored during KMeans clustering.
            self.logger.info('Computing %s KMeans clusters from the input image %s...'
                             % (self.n_clusters, im2clust.basename))
            kmeans = KMeansRSImage(im2clust, n_clusters=self.n_clusters, sam_classassignment=sam_classassignment,
                                   CPUs=self.CPUs, v=self.v)
            kmeans.compute_clusters()
            kmeans.compute_spectral_distances()

            if self.dir_clf_dump and basename_clf_dump:
                kmeans.save_clustermap(path_clustermap)
                kmeans.dump(path_clf)

        if self.v:
            kmeans.plot_cluster_centers()
            kmeans.plot_cluster_histogram()

        # randomly grab the given number of spectra from each cluster, restricted to the 30 % purest spectra
        # -> no spectra containing nodata values are returned
        self.logger.info('Getting %s random spectra from each cluster...' % (self.tgt_n_samples // self.n_clusters))
        random_samples = kmeans.get_random_spectra_from_each_cluster(samplesize=self.tgt_n_samples // self.n_clusters,
                                                                    max_distance=max_distance,
                                                                    max_angle=max_angle,
                                                                    nmin_unique_spectra=nmin_unique_spectra)

        # combine the spectra (2D arrays) of all clusters to a single 2D array
        self.logger.info('Combining random samples from all clusters.')
        random_samples = np.vstack([random_samples[clusterlabel] for clusterlabel in random_samples])

        return random_samples

    def resample_spectra(self, spectra, src_cwl, tgt_rsr, nodataVal, alg_nodata='radical'):
        # type: (Union[GeoArray, np.ndarray], Union[list, np.array], RSR, int, str) -> np.ndarray
        """Perform spectral resampling of the given image to match the given spectral response functions.

        :param spectra:     2D array (rows: spectral samples; columns: spectral information / bands)
        :param src_cwl:     central wavelength positions of input spectra
        :param tgt_rsr:     target relative spectral response functions to be used for spectral resampling
        :param nodataVal:   nodata value of the given spectra to be ignored during resampling
        :param alg_nodata:  algorithm how to deal with pixels where the spectral bands of the source image
                            contain nodata within the spectral response of a target band
                            'radical':      set output band to nodata
                            'conservative': use existing spectral information and ignore nodata
                                            (might alter the output spectral information,
                                             e.g., at spectral absorption bands)
        :return:
        """
        spectra = GeoArray(spectra)

        # perform spectral resampling of input image to match spectral properties of target sensor
        self.logger.info('Performing spectral resampling to match spectral properties of %s %s...'
                         % (tgt_rsr.satellite, tgt_rsr.sensor))

        SR = SpectralResampler(src_cwl, tgt_rsr)
        spectra_rsp = SR.resample_spectra(spectra,
                                          chunksize=200,
                                          nodataVal=nodataVal,
                                          alg_nodata=alg_nodata,
                                          CPUs=self.CPUs)

        return spectra_rsp

    def resample_image_spectrally(self, src_im, tgt_rsr, src_nodata=None, alg_nodata='radical', progress=False):
        # type: (Union[str, GeoArray], RSR, Union[float, int], str, bool) -> Union[GeoArray, None]
        """Perform spectral resampling of the given image to match the given spectral response functions.

        :param src_im:      source image to be resampled
        :param tgt_rsr:     target relative spectral response functions to be used for spectral resampling
        :param src_nodata:  source image nodata value
        :param alg_nodata:  algorithm how to deal with pixels where the spectral bands of the source image
                            contain nodata within the spectral response of a target band
                            'radical':      set output band to nodata
                            'conservative': use existing spectral information and ignore nodata (might alter the output
                                            spectral information, e.g., at spectral absorption bands)
        :param progress:    show progress bar (default: false)
        :return:
        """
        # handle src_im provided as file path or GeoArray instance
        if isinstance(src_im, str):
            im_name = os.path.basename(src_im)
            im_gA = GeoArray(src_im, nodata=src_nodata)
        else:
            im_name = src_im.basename
            im_gA = src_im

        # read input image
        self.logger.info('Reading the input image %s...' % im_name)
        im_gA.cwl = np.array(im_gA.meta.band_meta['wavelength'], dtype=float).flatten()

        # perform spectral resampling of input image to match spectral properties of target sensor
        self.logger.info('Performing spectral resampling to match spectral properties of %s %s...'
                         % (tgt_rsr.satellite, tgt_rsr.sensor))
        SR = SpectralResampler(im_gA.cwl, tgt_rsr)

        tgt_im = GeoArray(np.zeros((*im_gA.shape[:2], len(tgt_rsr.bands)), dtype=np.int16),
                          geotransform=im_gA.gt,
                          projection=im_gA.prj,
                          nodata=im_gA.nodata)
        tgt_im.meta.band_meta['wavelength'] = list(tgt_rsr.wvl)
        # band names are zero-padded to two digits, e.g. '8' -> 'B08', '11' -> 'B11'
        tgt_im.bandnames = ['B%s' % i if len(i) == 2 else 'B0%s' % i for i in tgt_rsr.LayerBandsAssignment]

        tiles = im_gA.tiles((1000, 1000))  # use tiles to save memory
        for ((rS, rE), (cS, cE)), tiledata in (tqdm(tiles) if progress else tiles):
            tgt_im[rS: rE + 1, cS: cE + 1, :] = SR.resample_image(tiledata.astype(np.int16),
                                                                  nodataVal=im_gA.nodata,
                                                                  alg_nodata=alg_nodata,
                                                                  CPUs=self.CPUs)

        return tgt_im
419class ClusterClassifier_Generator(object):
420 """Class for creating collections of machine learning classifiers that can be used for spectral homogenization."""
422 def __init__(self, list_refcubes, logger=None):
423 # type: (List[Union[str, RefCube]], logging.Logger) -> None
424 """Get an instance of Classifier_Generator.
426 :param list_refcubes: list of RefCube instances or paths for which the classifiers are to be created.
427 :param logger: instance of logging.Logger()
428 """
429 if not list_refcubes:
430 raise ValueError('The given list of RefCube instances or paths is empty.')
432 self.refcubes = [RefCube(inRC) if isinstance(inRC, str) else inRC for inRC in list_refcubes]
433 self.logger = logger or SpecHomo_Logger(__name__) # must be pickable
435 # validation
436 shapes = list(set([RC.data.shape[:2] for RC in self.refcubes]))
437 if not len(shapes) == 1:
438 raise ValueError('Unequal dimensions of the given reference cubes: %s' % shapes)
440 @staticmethod
441 def _get_derived_LayerBandsAssignments(satellite, sensor):
442 """Get a list of possible LayerBandsAssignments in which the spectral training data may be arranged.
444 :param satellite: satellite to return LayerBandsAssignments for
445 :param sensor: sensor to return LayerBandsAssignments for
446 :return: e.g. for Landsat-8 OLI_TIRS:
447 [['1', '2', '3', '4', '5', '9', '6', '7'],
448 ['1', '2', '3', '4', '5', '9', '6', '7', '8'],
449 ['1', '2', '3', '4', '5', '6', '7'],
450 ['1', '2', '3', '4', '5', '6', '7'], ...]
451 """
452 # get bands (after AC)
453 # NOTE: - signal of additional bands at L1A is not predictable by spectral harmonization because these bands
454 # are not driven by surface albedo but by atmospheric conditions (945, 1373 nm)
456 LBAs = []
457 kw = dict(satellite=satellite, sensor=sensor, no_thermal=True, after_ac=True)
458 for lba in [get_LayerBandsAssignment(no_pan=False, sort_by_cwl=True, **kw), # L1C_withPan_cwlSorted
459 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=True, **kw), # L1C_noPan_cwlSorted
460 get_LayerBandsAssignment(no_pan=False, sort_by_cwl=False, **kw), # L1C_withPan_alphabetical
461 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=False, **kw), # L1C_noPan_alphabetical
462 ]:
463 if lba not in LBAs:
464 LBAs.append(lba)
466 return LBAs
468 @staticmethod
469 def train_machine_learner(train_X, train_Y, test_X, test_Y, method, **kwargs):
470 # type: (np.ndarray, np.ndarray, np.ndarray, np.ndarray, str, dict) -> Union[LinearRegression, Ridge, Pipeline, RandomForestRegressor] # noqa E501 (line too long)
471 """Use the given train and test data to train a machine learner and append some accuracy statistics.
473 :param train_X: reference training data
474 :param train_Y: target training data
475 :param test_X: reference test data
476 :param test_Y: target test data
477 :param method: type of machine learning classifiers to be included in classifier collections
478 'LR': Linear Regression
479 'RR': Ridge Regression
480 'QR': Quadratic Regression
481 'RFR': Random Forest Regression (50 trees)
482 :param kwargs: keyword arguments to be passed to the __init__() function of machine learners
483 """
484 from sklearn.pipeline import Pipeline
486 ###################
487 # train the model #
488 ###################
490 ML = get_machine_learner(method, **kwargs)
491 ML.fit(train_X, train_Y)
493 def mean_absolute_percentage_error(y_true, y_pred):
494 y_true, y_pred = np.array(y_true), np.array(y_pred)
496 # avoid division by 0
497 if 0 in y_true:
498 y_true = y_true.astype(float)
499 y_true[y_true == 0] = np.nan
501 return np.nanmean(np.abs((y_true - y_pred) / y_true), axis=0) * 100
503 ###########################
504 # compute RMSE, MAE, MAPE #
505 ###########################
507 from sklearn.metrics import mean_squared_error, mean_absolute_error
508 predicted = ML.predict(test_X) # returns 2D array (spectral samples x bands), e.g. 640 x 6
509 # NOTE: 'raw_values': RMSE is column-wise computed
510 # => yields the same result as one would compute the RMSE band by band
511 rmse = np.sqrt(mean_squared_error(test_Y, predicted, multioutput='raw_values')).astype(np.float32)
512 mae = mean_absolute_error(test_Y, predicted, multioutput='raw_values').astype(np.float32)
513 mape = mean_absolute_percentage_error(test_Y, predicted).astype(np.float32)
515 # predicted_train = ML.predict(train_X)
516 # rmse_train = np.sqrt(mean_squared_error(train_Y, predicted_train, multioutput='raw_values'))
517 # # v2
518 # test_Y_im = spectra2im(test_Y, tgt_rows=int(tgt_data.rows * 0.4), tgt_cols=tgt_data.cols)
519 # pred_im = spectra2im(predicted, tgt_rows=int(tgt_data.rows * 0.4), tgt_cols=tgt_data.cols)
520 # rmse_v2 = np.sqrt(mean_squared_error(test_Y_im[:,:,1], pred_im[:,:,1]))
522 # append some metadata
523 ML.scores = dict(train=ML.score(train_X, train_Y),
524 test=ML.score(test_X, test_Y)) # r2 scores
525 ML.rmse_per_band = list(rmse)
526 ML.mae_per_band = list(mae)
527 ML.mape_per_band = list(mape)
528 from .version import __version__, __versionalias__
529 ML.spechomo_version = __version__
530 ML.spechomo_versionalias = __versionalias__
532 # convert float64 attributes to float32 to save memory (affects <0,05% of homogenized pixels by 1 DN)
533 for attr in ['coef_', 'intercept_', 'singular_', '_residues']:
534 if isinstance(ML, Pipeline):
535 # noinspection PyProtectedMember
536 setattr(ML._final_estimator, attr, getattr(ML._final_estimator, attr).astype(np.float32))
537 else:
538 try:
539 setattr(ML, attr, getattr(ML, attr).astype(np.float32))
540 except AttributeError:
541 pass
543 return ML
    def create_classifiers(self, outDir, method='LR', n_clusters=50, sam_classassignment=False, CPUs=None,
                           max_distance=options['classifiers']['trainspec_filtering']['max_distance'],
                           max_angle=options['classifiers']['trainspec_filtering']['max_angle'],
                           **kwargs):
        # type: (str, str, int, bool, int, Union[int, str], Union[int, str], dict) -> None
        """Create cluster classifiers for all combinations of the reference cubes given in __init__().

        For each source refcube, the source spectra are clustered once; then, for every target refcube and every
        combination of derived LayerBandsAssignments, one machine learner per cluster is trained and the resulting
        collection is serialized to disk via dill (one file per source satellite/sensor).

        :param outDir:              output directory for the created cluster classifier collections
        :param method:              type of machine learning classifiers to be included in classifier collections
                                    'LR':   Linear Regression
                                    'RR':   Ridge Regression
                                    'QR':   Quadratic Regression
                                    'RFR':  Random Forest Regression (50 trees with maximum depth of 3 by default)
        :param n_clusters:          number of clusters to be used for KMeans clustering
        :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param CPUs:                number of CPUs to be used for KMeans clustering
        :param max_distance:        maximum spectral distance allowed during filtering of training spectra
                                    - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra
        :param max_angle:           maximum spectral angle allowed during filtering of training spectra
                                    - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra
        :param kwargs:              keyword arguments to be passed to machine learner
        """
        from sklearn.model_selection import train_test_split

        # validate and set defaults
        if not os.path.isdir(outDir):
            os.makedirs(outDir)

        if method == 'RFR':
            # RFR handles the whole feature space with a single model, so sub-clustering is disabled
            if n_clusters > 1:
                self.logger.warning("The spectral homogenization method 'Random Forest Regression' does not allow "
                                    "spectral sub-clustering. Setting 'n_clusters' to 1.")
            n_clusters = 1

            # fill in RFR hyperparameters from the package options unless explicitly overridden by the caller
            kwargs.update(dict(
                n_jobs=kwargs.get('n_jobs', CPUs),
                n_estimators=kwargs.get('n_estimators', options['classifiers']['RFR']['n_trees']),
                max_depth=kwargs.get('max_depth', options['classifiers']['RFR']['max_depth']),
                max_features=kwargs.get('max_features', options['classifiers']['RFR']['max_features'])
            ))

        # build the classifier collections with separate classifiers for each cluster
        for src_cube in self.refcubes:  # type: RefCube
            cls_collection = nested_dict()

            # get cluster labels for each source cube separately (as they depend on source spectral characteristics)
            clusterlabels_src_cube, spectral_distances, spectral_angles = \
                self._get_cluster_labels_for_source_refcube(src_cube, n_clusters, CPUs,
                                                            sam_classassignment=sam_classassignment,
                                                            return_spectral_distances=True,
                                                            return_spectral_angles=True)

            for tgt_cube in self.refcubes:
                # a sensor is never predicted from itself
                if (src_cube.satellite, src_cube.sensor) == (tgt_cube.satellite, tgt_cube.sensor):
                    continue
                self.logger.info("Creating %s cluster classifier to predict %s %s from %s %s..."
                                 % (method, tgt_cube.satellite, tgt_cube.sensor, src_cube.satellite, src_cube.sensor))

                src_derived_LBAs = self._get_derived_LayerBandsAssignments(src_cube.satellite, src_cube.sensor)
                tgt_derived_LBAs = self._get_derived_LayerBandsAssignments(tgt_cube.satellite, tgt_cube.sensor)

                # one classifier set per (source LBA, target LBA) combination
                for src_LBA, tgt_LBA in product(src_derived_LBAs, tgt_derived_LBAs):
                    self.logger.debug('Creating %s cluster classifier for LBA %s => %s...'
                                      % (method, '_'.join(src_LBA), '_'.join(tgt_LBA)))

                    # Get center wavelength positions
                    # NOTE: they cannot be taken from RefCube instances because they always represent after-AC-LBAs
                    src_wavelengths = list(RSR(src_cube.satellite, src_cube.sensor, LayerBandsAssignment=src_LBA).wvl)
                    tgt_wavelengths = list(RSR(tgt_cube.satellite, tgt_cube.sensor, LayerBandsAssignment=tgt_LBA).wvl)

                    # Get training data for source and target image according to the given LayerBandsAssignments
                    # e.g., source: Landsat 7 image in LBA 1__2__3__4__5__7 and target L8 in 1__2__3__4__5__6__7
                    df_src_spectra_allclust = src_cube.get_spectra_dataframe(src_LBA)
                    df_tgt_spectra_allclust = tgt_cube.get_spectra_dataframe(tgt_LBA)

                    # assign clusterlabel and spectral distance to the cluster center to each spectrum
                    # (columns 0-2 are metadata; the actual band values start at column 3 from here on)
                    for df in [df_src_spectra_allclust, df_tgt_spectra_allclust]:
                        df.insert(0, 'cluster_label', clusterlabels_src_cube)
                        df.insert(1, 'spectral_distance', spectral_distances)
                        df.insert(2, 'spectral_angle', spectral_angles)

                    # remove spectra with cluster label -9999
                    # (clusters with too few spectra that are set to nodata in the refcube)
                    df_src_spectra_allclust = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label != -9999]
                    df_tgt_spectra_allclust = df_tgt_spectra_allclust[df_tgt_spectra_allclust.cluster_label != -9999]

                    # remove spectra with -9999 in all bands that HAVE a valid clusterlabel
                    # NOTE: This may happen because the cluster labels are computed on the source cube only and may be
                    #       different on the target cube.
                    # (zero standard deviation across bands + first band == -9999 identifies all-nodata rows)
                    sub_bad = df_tgt_spectra_allclust[df_tgt_spectra_allclust.iloc[:, 3] == -9999].copy()
                    if not sub_bad.empty:
                        idx_bad = sub_bad[np.std(sub_bad.iloc[:, 3:], axis=1) == 0].index
                        if not idx_bad.empty:
                            df_src_spectra_allclust = df_src_spectra_allclust.drop(idx_bad)
                            df_tgt_spectra_allclust = df_tgt_spectra_allclust.drop(idx_bad)

                    # ensure source and target spectra do not contain nodata values (would affect classifiers)
                    assert src_cube.data.nodata is None or src_cube.data.nodata not in df_src_spectra_allclust.values
                    assert tgt_cube.data.nodata is None or tgt_cube.data.nodata not in df_tgt_spectra_allclust.values

                    for clusterlabel in range(n_clusters):
                        self.logger.debug('Creating %s classifier for cluster %s...' % (method, clusterlabel))

                        if method == 'RFR':
                            # RFR: draw a fixed-size bootstrap sample instead of per-cluster filtering
                            df_src_spectra_best = df_src_spectra_allclust.sample(10000, random_state=0, replace=True)
                            df_tgt_spectra_best = df_tgt_spectra_allclust.sample(10000, random_state=0, replace=True)
                        else:
                            if n_clusters == 1:
                                # NOTE(review): this rebinds the method parameters; once hit, the relaxed '100%'
                                #               thresholds persist for all subsequent loop iterations — confirm intended
                                max_distance, max_angle = '100%', '100%'

                            df_src_spectra_best, df_tgt_spectra_best = \
                                self._extract_best_spectra_from_cluster(
                                    clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust,
                                    max_distance=max_distance, max_angle=max_angle)

                        # Set train and test variables for the classifier
                        # (skip the 3 metadata columns; keep only the spectral band values)
                        src_spectra_curlabel = df_src_spectra_best.values[:, 3:]
                        tgt_spectra_curlabel = df_tgt_spectra_best.values[:, 3:]

                        train_src, test_src, train_tgt, test_tgt = \
                            train_test_split(src_spectra_curlabel, tgt_spectra_curlabel,
                                             test_size=0.4, shuffle=True, random_state=0)

                        # train the learner and add metadata
                        ML = self.train_machine_learner(train_src, train_tgt, test_src, test_tgt, method, **kwargs)
                        # noinspection PyTypeChecker
                        ML = self._add_metadata_to_machine_learner(
                            ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths,
                            train_src, train_tgt, n_clusters, clusterlabel)

                        # append to classifier collection
                        # nesting: srcLBA -> (tgt_sat, tgt_sen) -> tgtLBA -> clusterlabel -> trained learner
                        cls_collection[
                            '__'.join(src_LBA)][tgt_cube.satellite, tgt_cube.sensor][
                            '__'.join(tgt_LBA)][clusterlabel] = ML

            # dump to disk
            fName_cls = get_filename_classifier_collection(method, src_cube.satellite, src_cube.sensor,
                                                           n_clusters=n_clusters, **kwargs)

            with open(os.path.join(outDir, fName_cls), 'wb') as outF:
                dill.dump(cls_collection.to_dict(), outF,
                          protocol=dill.HIGHEST_PROTOCOL - 1)  # ensures some downwards compatibility
689 def _get_cluster_labels_for_source_refcube(self, src_cube, n_clusters, CPUs, sam_classassignment=False,
690 return_spectral_distances=False, return_spectral_angles=False):
691 """Get cluster labels for each source cube separately.
693 NOTE: - We use the GMS L1C bands (without atmospheric bands and PAN-band) for clustering.
694 - clustering is only performed on the source cube because the source sensor spectral information
695 is later used to assign the correct homogenization classifier to each pixel
697 :param src_cube:
698 :param n_clusters:
699 :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers
700 True: use the minimal spectral angle to assign classes to cluster centers
701 :param CPUs:
702 :param return_spectral_distances:
703 :return:
704 """
705 self.logger.info('Clustering %s %s reference cube (%s clusters)...'
706 % (src_cube.satellite, src_cube.sensor, n_clusters))
707 LBA2clust = get_LayerBandsAssignment(src_cube.satellite, src_cube.sensor,
708 no_pan=True, no_thermal=True, sort_by_cwl=True, after_ac=True)
709 src_data2clust = src_cube.get_band_combination(LBA2clust)
710 km = KMeansRSImage(src_data2clust, n_clusters=n_clusters, sam_classassignment=sam_classassignment, CPUs=CPUs)
711 km.compute_clusters()
713 return_vals = [km.labels_with_nodata]
715 if return_spectral_distances:
716 return_vals.append(km.spectral_distances_with_nodata)
717 if return_spectral_angles:
718 return_vals.append(km.spectral_angles_with_nodata)
720 return tuple(return_vals)
722 def _extract_best_spectra_from_cluster(self, clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust,
723 max_distance, max_angle):
724 # NOTE: We exclude the noisy spectra with the largest spectral distances to their cluster
725 # center here (random spectra from within the upper 40 %)
726 assert len(df_src_spectra_allclust.index) == len(df_tgt_spectra_allclust.index), \
727 'Source and target spectra dataframes must have the same number of spectral samples.'
729 df_src_spectra = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label == clusterlabel]
731 # max_dist = np.percentile(df_src_spectra.spectral_distance, max_distance)
732 # max_angle = np.percentile(df_src_spectra.spectral_angle, max_angle_percent)
734 if isinstance(max_angle, str):
735 max_angle = np.percentile(df_src_spectra.spectral_angle,
736 int(max_angle.split('%')[0].strip()))
738 tmp = df_src_spectra[df_src_spectra.spectral_angle <= max_angle]
739 if len(tmp.index) > 10:
740 df_src_spectra = tmp
741 else:
742 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(10)
743 self.logger.warning('Had to choose spectra with SA up to %.1f degrees for cluster #%s.'
744 % (np.max(df_src_spectra['spectral_angle']), clusterlabel))
746 if isinstance(max_distance, str):
747 max_distance = np.percentile(df_src_spectra.spectral_distance,
748 int(max_distance.split('%')[0].strip()))
750 tmp = df_src_spectra[df_src_spectra.spectral_distance <= max_distance]
751 if len(tmp.index) > 10:
752 df_src_spectra = tmp
753 else:
754 df_src_spectra = df_src_spectra.sort_values(by='spectral_distance').head(10)
755 self.logger.warning('Had to choose spectra with SD up to %.1f for cluster #%s.'
756 % (np.max(df_src_spectra['spectral_distance']), clusterlabel))
758 if len(df_src_spectra.index) > 1700:
759 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(1700)
761 df_tgt_spectra = df_tgt_spectra_allclust.loc[df_src_spectra.index, :]
763 return df_src_spectra, df_tgt_spectra
765 @staticmethod
766 def _add_metadata_to_machine_learner(ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths,
767 src_train_spectra, tgt_train_spectra, n_clusters, clusterlabel):
768 # compute cluster center for source spectra (only on those spectra used for model training)
769 cluster_center = np.mean(src_train_spectra, axis=0)
770 cluster_median = np.median(src_train_spectra, axis=0)
771 sample_spectra = DataFrame(src_train_spectra).sample(100, replace=True, random_state=20).values
773 ML.src_satellite = src_cube.satellite
774 ML.tgt_satellite = tgt_cube.satellite
775 ML.src_sensor = src_cube.sensor
776 ML.tgt_sensor = tgt_cube.sensor
777 ML.src_LBA = src_LBA
778 ML.tgt_LBA = tgt_LBA
779 ML.src_n_bands = len(ML.src_LBA)
780 ML.tgt_n_bands = len(ML.tgt_LBA)
781 ML.src_wavelengths = list(np.array(src_wavelengths).astype(np.float32))
782 ML.tgt_wavelengths = list(np.array(tgt_wavelengths).astype(np.float32))
783 ML.n_clusters = n_clusters
784 ML.clusterlabel = clusterlabel
785 ML.cluster_center = cluster_center.astype(src_cube.data.dtype)
786 ML.cluster_median = cluster_median.astype(src_cube.data.dtype)
787 ML.cluster_sample_spectra = sample_spectra.astype(np.int16) # scaled between 0 and 10000
788 ML.tgt_cluster_center = np.mean(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype)
789 ML.tgt_cluster_median = np.median(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype)
791 assert len(ML.src_LBA) == len(ML.src_wavelengths)
792 assert len(ML.tgt_LBA) == len(ML.tgt_wavelengths)
794 return ML
def get_machine_learner(method='LR', **init_params):
    # type: (str, dict) -> Union[LinearRegression, Ridge, Pipeline]
    """Get an instance of a machine learner.

    :param method:      'LR':  Linear Regression
                        'RR':  Ridge Regression
                        'QR':  Quadratic Regression
                        'RFR': Random Forest Regression (50 trees)
    :param init_params: parameters to be passed to __init__() function of the returned machine learner model.
    :raises ValueError: if the given method code is unknown
    """
    # sklearn is imported lazily to keep module import time low
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures

    # dispatch table of learner factories; evaluated lazily so only the requested model is built
    factories = {
        'LR': lambda: LinearRegression(**init_params),
        'RR': lambda: Ridge(**init_params),
        'QR': lambda: make_pipeline(PolynomialFeatures(degree=2), LinearRegression(**init_params)),
        'RFR': lambda: RandomForestRegressor(**init_params),
    }

    if method not in factories:
        raise ValueError("Unknown machine learner method code '%s'." % method)

    return factories[method]()
def get_filename_classifier_collection(method, src_satellite, src_sensor, n_clusters=1, **cls_kwinit):
    """Return the output filename of a classifier collection dill file for the given parameters."""
    # encode method-specific hyperparameters into the method tag
    if method == 'RR':
        method_tag = method + '_alpha1.0'  # TODO add to config
    elif method == 'RFR':
        assert 'n_estimators' in cls_kwinit
        method_tag = '%s_trees%s' % (method, cls_kwinit['n_estimators'])
    else:
        method_tag = method

    name_parts = ['%s_clust%s' % (method_tag, n_clusters), src_satellite, src_sensor]
    return '__'.join(name_parts) + '.dill'
834# def get_classifier_filename(method, src_satellite, src_sensor, src_LBA_name, tgt_satellite, tgt_sensor, tgt_LBA_name):
835# return '__'.join([method, src_satellite, src_sensor, src_LBA_name]) + \
836# '__to__' + '__'.join([tgt_satellite, tgt_sensor, tgt_LBA_name]) + '.dill'