Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3# spechomo, Spectral homogenization of multispectral satellite data 

4# 

5# Copyright (C) 2019-2021 

6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de) 

7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam, 

8# Germany (https://www.gfz-potsdam.de/) 

9# 

10# This software was developed within the context of the GeoMultiSens project funded 

11# by the German Federal Ministry of Education and Research 

12# (project grant code: 01 IS 14 010 A-C). 

13# 

14# Licensed under the Apache License, Version 2.0 (the "License"); 

15# you may not use this file except in compliance with the License. 

16# You may obtain a copy of the License at 

17# 

18# http://www.apache.org/licenses/LICENSE-2.0 

19# 

20# Please note the following exception: `spechomo` depends on tqdm, which is 

21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files 

22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore". 

23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE. 

24# 

25# Unless required by applicable law or agreed to in writing, software 

26# distributed under the License is distributed on an "AS IS" BASIS, 

27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

28# See the License for the specific language governing permissions and 

29# limitations under the License. 

30 

31import os 

32import re 

33from glob import glob 

34from multiprocessing import cpu_count 

35from typing import List, Tuple, Union, Dict, TYPE_CHECKING # noqa F401 # flake8 issue 

36import logging # noqa F401 # flake8 issue 

37from itertools import product 

38 

39import dill 

40import numpy as np 

41from nested_dict import nested_dict 

42from pandas import DataFrame 

43from tqdm import tqdm 

44from geoarray import GeoArray 

45from pyrsr import RSR 

46from pyrsr.sensorspecs import get_LayerBandsAssignment 

47 

48if TYPE_CHECKING: 

49 # avoid the sklearn imports on module level to get rid of static TLS ImportError 

50 from sklearn.ensemble import RandomForestRegressor 

51 from sklearn.linear_model import LinearRegression, Ridge 

52 from sklearn.pipeline import Pipeline 

53 

54from .clustering import KMeansRSImage 

55from .resampling import SpectralResampler 

56from .training_data import RefCube 

57from .logging import SpecHomo_Logger 

58from .options import options 

59 

60 

class ReferenceCube_Generator(object):
    """Generate reference cubes that are later used as training data for SpecHomo_Classifier."""

    def __init__(self, filelist_refs, tgt_sat_sen_list=None, dir_refcubes='', n_clusters=10, tgt_n_samples=1000,
                 v=False, logger=None, CPUs=None, dir_clf_dump=''):
        # type: (List[str], List[Tuple[str, str]], str, int, int, bool, logging.Logger, Union[None, int], str) -> None
        """Initialize ReferenceCube_Generator.

        :param filelist_refs:       list of (hyperspectral) reference images,
                                    representing BOA reflectance, scaled between 0 and 10000
        :param tgt_sat_sen_list:    list of satellite/sensor tuples containing those sensors for which the reference
                                    cube is to be computed, e.g. [('Landsat-8', 'OLI_TIRS',), ('Landsat-5', 'TM')]
        :param dir_refcubes:        output directory for the generated reference cube
        :param n_clusters:          number of clusters to be used for clustering the input images (KMeans)
        :param tgt_n_samples:       number of spectra to be collected from each input image
        :param v:                   verbose mode
        :param logger:              instance of logging.Logger()
        :param CPUs:                number of CPUs to use for computation
        :param dir_clf_dump:        directory where to store the serialized KMeans classifier
        """
        if not filelist_refs:
            raise ValueError('The given list of reference images is empty.')

        # args + kwargs
        # a single file path is wrapped into a one-element list
        self.ims_ref = [filelist_refs] if isinstance(filelist_refs, str) else filelist_refs

        default_sat_sen = [
            ('Landsat-8', 'OLI_TIRS'),
            ('Landsat-7', 'ETM+'),
            ('Landsat-5', 'TM'),
            ('Sentinel-2A', 'MSI'),
            # ('Terra', 'ASTER'),  # currently does not work
            ('SPOT-4', 'HRVIR1'),
            ('SPOT-4', 'HRVIR2'),
            ('SPOT-5', 'HRG1'),
            ('SPOT-5', 'HRG2'),
            ('RapidEye-5', 'MSI'),
        ]
        self.tgt_sat_sen_list = tgt_sat_sen_list or default_sat_sen

        self.dir_refcubes = os.path.abspath(dir_refcubes) if dir_refcubes else ''
        self.dir_clf_dump = dir_clf_dump
        self.n_clusters = n_clusters
        self.tgt_n_samples = tgt_n_samples
        self.v = v
        self.CPUs = CPUs or cpu_count()
        self.logger = logger or SpecHomo_Logger(__name__)  # must be picklable

        # privates
        # pre-create one (empty) RefCube per target satellite/sensor combination
        self._refcubes = {
            (sat, sen): RefCube(satellite=sat, sensor=sen,
                                LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen))
            for sat, sen in self.tgt_sat_sen_list
        }

        if dir_refcubes and not os.path.isdir(self.dir_refcubes):
            os.makedirs(self.dir_refcubes)

114 

115 @property 

116 def refcubes(self): 

117 # type: () -> Dict[Tuple[str, str]: RefCube] 

118 """Return a dict holding instances of RefCube for each target satellite / sensor of self.tgt_sat_sen_list.""" 

119 if not self._refcubes: 

120 

121 # fill self._ref_cubes with GeoArray instances of already existing reference cubes read from disk 

122 if self.dir_refcubes: 

123 for path_refcube in glob(os.path.join(self.dir_refcubes, 'refcube__*.bsq')): 

124 # TODO check if that really works 

125 # check if current refcube path matches the target refcube specifications 

126 identifier = re.search('refcube__(.*).bsq', os.path.basename(path_refcube)).group(1) 

127 sat, sen, nclust_str, nsamp_str = identifier.split('__') # type: str 

128 nclust, nsamp = int(nclust_str.split('nclust')[1]), int(nsamp_str.split('nclust')[1]) 

129 correct_specs = all([(sat, sen) in self.tgt_sat_sen_list, 

130 nclust == self.n_clusters, 

131 nsamp == self.tgt_n_samples]) 

132 

133 # import the existing ref cube if it matches the target refcube specs 

134 if correct_specs: 

135 self._refcubes[(sat, sen)] = \ 

136 RefCube(satellite=sat, sensor=sen, filepath=path_refcube, 

137 LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen)) 

138 

139 return self._refcubes 

140 

141 @staticmethod 

142 def _get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen): 

143 # type: (str, str) -> list 

144 """Get the LayerBandsAssignment for the specified target sensor. 

145 

146 NOTE: The returned bands list always contains all possible bands. Panchromatic bands or bands not available 

147 after atmospheric correction are removed. Further band selections are later done using np.take(). 

148 NOTE: We don't use L1A here because the signal of the additional bands at L1A is not predictable by spectral 

149 harmonization as these bands are not driven by surface albedo but by atmospheric conditions (945, 1373nm). 

150 

151 :param tgt_sat: target satellite 

152 :param tgt_sen: target sensor 

153 :return: 

154 """ 

155 return get_LayerBandsAssignment(tgt_sat, tgt_sen, 

156 no_pan=False, no_thermal=True, sort_by_cwl=True, after_ac=True) 

157 

158 @staticmethod 

159 def _get_tgt_RSR_object(tgt_sat, tgt_sen): 

160 # type: (str, str) -> RSR 

161 """Get an RSR instance containing the spectral response functions for the target sensor. 

162 

163 :param tgt_sat: target satellite 

164 :param tgt_sen: target sensor 

165 :return: 

166 """ 

167 return RSR(tgt_sat, tgt_sen, no_pan=False, no_thermal=True, after_ac=True) 

168 

    def generate_reference_cubes(self, fmt_out='ENVI', try_read_dumped_clf=True, sam_classassignment=False,
                                 max_distance='80%', max_angle=6, nmin_unique_spectra=50, alg_nodata='radical',
                                 progress=True):
        # type: (str, bool, bool, Union[int, float, str], Union[int, float, str], int, str, bool) -> ReferenceCube_Generator.refcubes  # noqa
        """Generate reference spectra from all hyperspectral input images.

        Workflow:
        1. Clustering/classification of hyperspectral images and selection of a given number of random signatures
           (a. spectral downsampling to lower spectral resolution (speedup))
           b. KMeans clustering
           c. selection of the same number of signatures from each cluster to avoid an unequal amount of training data
        2. Spectral resampling of the selected hyperspectral signatures (for each input image)
        3. Adding the resampled spectra to the reference cube of each target sensor and writing the cubes to disk

        :param fmt_out:                 output format (GDAL driver code)
        :param try_read_dumped_clf:     try to read a previously serialized KMeans classifier from disk
                                        (massively speeds up the RefCube generation)
        :param sam_classassignment:     False: use the minimal euclidean distance to assign classes to cluster centers
                                        True:  use the minimal spectral angle to assign classes to cluster centers
        :param max_distance:            spectra with a larger spectral distance than the given value will be excluded
                                        from random sampling.
                                        - if given as string like '20%', the maximum spectral distance is computed
                                          as 20% percentile within each cluster
        :param max_angle:               spectra with a larger spectral angle than the given value will be excluded
                                        from random sampling.
                                        - if given as string like '20%', the maximum spectral angle is computed
                                          as 20% percentile within each cluster
        :param nmin_unique_spectra:     in case a cluster has less than the given number,
                                        do not include it in the reference cube (default: 50)
        :param alg_nodata:              algorithm how to deal with pixels where the spectral bands of the source image
                                        contain nodata within the spectral response of a target band
                                        'radical':      set output band to nodata
                                        'conservative': use existing spectral information and ignore nodata
                                                        (might alter the output spectral information,
                                                         e.g., at spectral absorption bands)
        :param progress:                show progress bar (default: True)
        :return:                        self.refcubes - dict of RefCube instances, each holding an array of shape
                                        [tgt_n_samples x images x spectral bands of the target sensor]
        """
        for im in self.ims_ref:  # type: Union[str, GeoArray]
            # TODO implement check if the current image is already included in the refcube -> skip in that case
            src_im = GeoArray(im)

            # get random spectra of the original (hyperspectral) image, equally distributed over all computed clusters
            baseN = os.path.splitext(os.path.basename(im))[0] if isinstance(im, str) else im.basename
            unif_random_spectra = \
                self.cluster_image_and_get_uniform_spectra(src_im,
                                                           try_read_dumped_clf=try_read_dumped_clf,
                                                           sam_classassignment=sam_classassignment,
                                                           max_distance=max_distance,
                                                           max_angle=max_angle,
                                                           nmin_unique_spectra=nmin_unique_spectra,
                                                           progress=progress,
                                                           basename_clf_dump=baseN).astype(np.int16)

            # resample the set of random spectra to match the spectral characteristics of all target sensors
            for tgt_sat, tgt_sen in self.tgt_sat_sen_list:
                # perform spectral resampling
                self.logger.info('Performing spectral resampling to match %s %s specifications...' % (tgt_sat, tgt_sen))
                unif_random_spectra_rsp = self.resample_spectra(
                    unif_random_spectra,
                    src_cwl=np.array(src_im.meta.band_meta['wavelength'], dtype=float).flatten(),
                    tgt_rsr=self._get_tgt_RSR_object(tgt_sat, tgt_sen),
                    nodataVal=src_im.nodata,
                    alg_nodata=alg_nodata)

                # add the spectra as GeoArray instance to the in-mem ref cubes
                refcube = self.refcubes[(tgt_sat, tgt_sen)]  # type: RefCube
                refcube.add_spectra(unif_random_spectra_rsp, src_imname=src_im.basename,
                                    LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen))

                # update the reference cubes on disk after each added image (keeps on-disk state current)
                if self.dir_refcubes:
                    refcube.save(path_out=os.path.join(self.dir_refcubes, 'refcube__%s__%s__nclust%s__nsamp%s.bsq'
                                                       % (tgt_sat, tgt_sen, self.n_clusters, self.tgt_n_samples)),
                                 fmt=fmt_out)

        return self.refcubes

246 

247 def cluster_image_and_get_uniform_spectra(self, im, downsamp_sat=None, downsamp_sen=None, basename_clf_dump='', 

248 try_read_dumped_clf=True, sam_classassignment=False, max_distance='80%', 

249 max_angle=6, nmin_unique_spectra=50, progress=False): 

250 # type: (Union[str, GeoArray, np.ndarray], str, str, str, bool, bool, int, Union[int, float, str], Union[int, float, str], bool) -> np.ndarray # noqa 

251 """Compute KMeans clusters for the given image and return the an array of uniform random samples. 

252 

253 :param im: image to be clustered 

254 :param downsamp_sat: satellite code used for intermediate image dimensionality reduction (input image is 

255 spectrally resampled to this satellite before it is clustered). requires downsamp_sen. 

256 If it is None, no intermediate downsampling is performed. 

257 :param downsamp_sen: sensor code used for intermediate image dimensionality reduction (requires downsamp_sat) 

258 :param basename_clf_dump: basename of serialized KMeans classifier 

259 :param try_read_dumped_clf: try to read a previously serialized KMeans classifier from disk 

260 (massively speeds up the RefCube generation) 

261 :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers 

262 True: use the minimal spectral angle to assign classes to cluster centers 

263 :param max_distance: spectra with a larger spectral distance than the given value will be excluded from 

264 random sampling. 

265 - if given as string like '20%', the maximum spectral distance is computed as 20% 

266 percentile within each cluster 

267 :param max_angle: spectra with a larger spectral angle than the given value will be excluded from 

268 random sampling. 

269 - if given as string like '20%', the maximum spectral angle is computed as 20% 

270 percentile within each cluster 

271 :param nmin_unique_spectra: in case a cluster has less than the given number, 

272 do not include it in the reference cube (default: 50) 

273 :param progress: whether to show progress bars or not 

274 :return: 2D array (rows: tgt_n_samples, columns: spectral information / bands 

275 """ 

276 # input validation 

277 if downsamp_sat and not downsamp_sen or downsamp_sen and not downsamp_sat: 

278 raise ValueError("The parameters 'spec_rsp_sat' and 'spec_rsp_sen' must both be provided or completely " 

279 "omitted.") 

280 

281 im2clust = GeoArray(im) 

282 

283 # get a KMeans classifier for the hyperspectral image 

284 path_clf = os.path.join(self.dir_clf_dump, '%s__KMeansClf.dill' % basename_clf_dump) 

285 if not sam_classassignment: 

286 path_clustermap = os.path.join(self.dir_clf_dump, 

287 '%s__KMeansClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters)) 

288 else: 

289 path_clustermap = os.path.join(self.dir_clf_dump, 

290 '%s__SAMClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters)) 

291 

292 if try_read_dumped_clf and os.path.isfile(path_clf): 

293 # read the previously dumped classifier from disk 

294 self.logger.info('Reading previously serialized KMeans classifier from %s...' % path_clf) 

295 kmeans = KMeansRSImage.from_disk(path_clf=path_clf, im=im2clust) 

296 

297 else: 

298 # create KMeans classifier 

299 # first, perform spectral resampling to Sentinel-2 to reduce dimensionality (speedup) 

300 if downsamp_sat and downsamp_sen: 

301 # NOTE: The KMeansRSImage class already reduces the input data to 1 million spectra by default 

302 tgt_rsr = RSR(satellite=downsamp_sat, sensor=downsamp_sen, no_pan=True, no_thermal=True) 

303 im2clust = self.resample_image_spectrally(im2clust, tgt_rsr, progress=progress) # output = int16 

304 

305 # compute KMeans clusters for the spectrally resampled image 

306 # NOTE: Nodata values are ignored during KMeans clustering. 

307 self.logger.info('Computing %s KMeans clusters from the input image %s...' 

308 % (self.n_clusters, im2clust.basename)) 

309 kmeans = KMeansRSImage(im2clust, n_clusters=self.n_clusters, sam_classassignment=sam_classassignment, 

310 CPUs=self.CPUs, v=self.v) 

311 kmeans.compute_clusters() 

312 kmeans.compute_spectral_distances() 

313 

314 if self.dir_clf_dump and basename_clf_dump: 

315 kmeans.save_clustermap(path_clustermap) 

316 kmeans.dump(path_clf) 

317 

318 if self.v: 

319 kmeans.plot_cluster_centers() 

320 kmeans.plot_cluster_histogram() 

321 

322 # randomly grab the given number of spectra from each cluster, restricted to the 30 % purest spectra 

323 # -> no spectra containing nodata values are returned 

324 self.logger.info('Getting %s random spectra from each cluster...' % (self.tgt_n_samples // self.n_clusters)) 

325 random_samples = kmeans.get_random_spectra_from_each_cluster(samplesize=self.tgt_n_samples // self.n_clusters, 

326 max_distance=max_distance, 

327 max_angle=max_angle, 

328 nmin_unique_spectra=nmin_unique_spectra) 

329 # random_samples = kmeans\ 

330 # .get_purest_spectra_from_each_cluster(src_im=GeoArray(im), 

331 # samplesize=self.tgt_n_samples // self.n_clusters) 

332 

333 # combine the spectra (2D arrays) of all clusters to a single 2D array 

334 self.logger.info('Combining random samples from all clusters.') 

335 random_samples = np.vstack([random_samples[clusterlabel] for clusterlabel in random_samples]) 

336 

337 return random_samples 

338 

339 def resample_spectra(self, spectra, src_cwl, tgt_rsr, nodataVal, alg_nodata='radical'): 

340 # type: (Union[GeoArray, np.ndarray], Union[list, np.array], RSR, int, str) -> np.ndarray 

341 """Perform spectral resampling of the given image to match the given spectral response functions. 

342 

343 :param spectra: 2D array (rows: spectral samples; columns: spectral information / bands 

344 :param src_cwl: central wavelength positions of input spectra 

345 :param tgt_rsr: target relative spectral response functions to be used for spectral resampling 

346 :param nodataVal: nodata value of the given spectra to be ignored during resampling 

347 :param alg_nodata: algorithm how to deal with pixels where the spectral bands of the source image 

348 contain nodata within the spectral response of a target band 

349 'radical': set output band to nodata 

350 'conservative': use existing spectral information and ignore nodata 

351 (might alter the outpur spectral information, 

352 e.g., at spectral absorption bands) 

353 :return: 

354 """ 

355 spectra = GeoArray(spectra) 

356 

357 # perform spectral resampling of input image to match spectral properties of target sensor 

358 self.logger.info('Performing spectral resampling to match spectral properties of %s %s...' 

359 % (tgt_rsr.satellite, tgt_rsr.sensor)) 

360 

361 SR = SpectralResampler(src_cwl, tgt_rsr) 

362 spectra_rsp = SR.resample_spectra(spectra, 

363 chunksize=200, 

364 nodataVal=nodataVal, 

365 alg_nodata=alg_nodata, 

366 CPUs=self.CPUs) 

367 

368 return spectra_rsp 

369 

    def resample_image_spectrally(self, src_im, tgt_rsr, src_nodata=None, alg_nodata='radical', progress=False):
        # type: (Union[str, GeoArray], RSR, Union[float, int], str, bool) -> Union[GeoArray, None]
        """Spectrally resample the given image to match the given spectral response functions.

        :param src_im:      source image to be resampled (file path or GeoArray instance)
        :param tgt_rsr:     target relative spectral response functions to be used for spectral resampling
        :param src_nodata:  source image nodata value (only used if src_im is given as file path)
        :param alg_nodata:  algorithm how to deal with pixels where the spectral bands of the source image
                            contain nodata within the spectral response of a target band
                            'radical':      set output band to nodata
                            'conservative': use existing spectral information and ignore nodata (might alter the
                                            output spectral information, e.g., at spectral absorption bands)
        :param progress:    show progress bar (default: False)
        :return:            spectrally resampled image as int16 GeoArray
        """
        # handle src_im provided as file path or GeoArray instance
        if isinstance(src_im, str):
            im_name = os.path.basename(src_im)
            im_gA = GeoArray(src_im, nodata=src_nodata)
        else:
            im_name = src_im.basename
            im_gA = src_im

        # read input image and grab its central wavelength positions from the band metadata
        self.logger.info('Reading the input image %s...' % im_name)
        im_gA.cwl = np.array(im_gA.meta.band_meta['wavelength'], dtype=float).flatten()

        # perform spectral resampling of input image to match spectral properties of target sensor
        self.logger.info('Performing spectral resampling to match spectral properties of %s %s...'
                         % (tgt_rsr.satellite, tgt_rsr.sensor))
        SR = SpectralResampler(im_gA.cwl, tgt_rsr)

        # pre-allocate the int16 output array with the source geometry and one band per target RSR band
        tgt_im = GeoArray(np.zeros((*im_gA.shape[:2], len(tgt_rsr.bands)), dtype=np.int16),
                          geotransform=im_gA.gt,
                          projection=im_gA.prj,
                          nodata=im_gA.nodata)
        tgt_im.meta.band_meta['wavelength'] = list(tgt_rsr.wvl)
        # NOTE(review): assumes band identifiers are 1 or 2 characters long ('8' -> 'B08', '11' -> 'B11');
        #               identifiers of other lengths would get a 'B0' prefix — confirm against pyrsr output
        tgt_im.bandnames = ['B%s' % i if len(i) == 2 else 'B0%s' % i for i in tgt_rsr.LayerBandsAssignment]

        tiles = im_gA.tiles((1000, 1000))  # use tiles to save memory
        for ((rS, rE), (cS, cE)), tiledata in (tqdm(tiles) if progress else tiles):
            # tile bounds are inclusive, hence the +1 when writing back
            tgt_im[rS: rE + 1, cS: cE + 1, :] = SR.resample_image(tiledata.astype(np.int16),
                                                                  nodataVal=im_gA.nodata,
                                                                  alg_nodata=alg_nodata,
                                                                  CPUs=self.CPUs)

        return tgt_im

417 

418 

class ClusterClassifier_Generator(object):
    """Create collections of machine learning classifiers that can be used for spectral homogenization."""

    def __init__(self, list_refcubes, logger=None):
        # type: (List[Union[str, RefCube]], logging.Logger) -> None
        """Get an instance of Classifier_Generator.

        :param list_refcubes:   list of RefCube instances or file paths for which the classifiers are to be created
        :param logger:          instance of logging.Logger()
        """
        if not list_refcubes:
            raise ValueError('The given list of RefCube instances or paths is empty.')

        # file paths are loaded into RefCube instances; RefCube instances are used as-is
        self.refcubes = [RefCube(rc) if isinstance(rc, str) else rc for rc in list_refcubes]
        self.logger = logger or SpecHomo_Logger(__name__)  # must be picklable

        # validation: all reference cubes must share the same spatial dimensions
        shapes = list({rc.data.shape[:2] for rc in self.refcubes})
        if len(shapes) != 1:
            raise ValueError('Unequal dimensions of the given reference cubes: %s' % shapes)

439 

440 @staticmethod 

441 def _get_derived_LayerBandsAssignments(satellite, sensor): 

442 """Get a list of possible LayerBandsAssignments in which the spectral training data may be arranged. 

443 

444 :param satellite: satellite to return LayerBandsAssignments for 

445 :param sensor: sensor to return LayerBandsAssignments for 

446 :return: e.g. for Landsat-8 OLI_TIRS: 

447 [['1', '2', '3', '4', '5', '9', '6', '7'], 

448 ['1', '2', '3', '4', '5', '9', '6', '7', '8'], 

449 ['1', '2', '3', '4', '5', '6', '7'], 

450 ['1', '2', '3', '4', '5', '6', '7'], ...] 

451 """ 

452 # get bands (after AC) 

453 # NOTE: - signal of additional bands at L1A is not predictable by spectral harmonization because these bands 

454 # are not driven by surface albedo but by atmospheric conditions (945, 1373 nm) 

455 

456 LBAs = [] 

457 kw = dict(satellite=satellite, sensor=sensor, no_thermal=True, after_ac=True) 

458 for lba in [get_LayerBandsAssignment(no_pan=False, sort_by_cwl=True, **kw), # L1C_withPan_cwlSorted 

459 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=True, **kw), # L1C_noPan_cwlSorted 

460 get_LayerBandsAssignment(no_pan=False, sort_by_cwl=False, **kw), # L1C_withPan_alphabetical 

461 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=False, **kw), # L1C_noPan_alphabetical 

462 ]: 

463 if lba not in LBAs: 

464 LBAs.append(lba) 

465 

466 return LBAs 

467 

    @staticmethod
    def train_machine_learner(train_X, train_Y, test_X, test_Y, method, **kwargs):
        # type: (np.ndarray, np.ndarray, np.ndarray, np.ndarray, str, dict) -> Union[LinearRegression, Ridge, Pipeline, RandomForestRegressor]  # noqa E501 (line too long)
        """Use the given train and test data to train a machine learner and append some accuracy statistics.

        :param train_X: reference training data
        :param train_Y: target training data
        :param test_X:  reference test data
        :param test_Y:  target test data
        :param method:  type of machine learning classifiers to be included in classifier collections
                        'LR':   Linear Regression
                        'RR':   Ridge Regression
                        'QR':   Quadratic Regression
                        'RFR':  Random Forest Regression (50 trees)
        :param kwargs:  keyword arguments to be passed to the __init__() function of machine learners
        :return:        the fitted estimator with attached score/error attributes
                        (scores, rmse_per_band, mae_per_band, mape_per_band, spechomo_version, spechomo_versionalias)
        """
        # sklearn is imported function-locally (see module header: avoids static TLS ImportError)
        from sklearn.pipeline import Pipeline

        ###################
        # train the model #
        ###################

        ML = get_machine_learner(method, **kwargs)
        ML.fit(train_X, train_Y)

        def mean_absolute_percentage_error(y_true, y_pred):
            # column-wise MAPE in percent; copies inputs to avoid mutating the caller's arrays
            y_true, y_pred = np.array(y_true), np.array(y_pred)

            # avoid division by 0 by masking zeros as NaN (ignored by np.nanmean)
            if 0 in y_true:
                y_true = y_true.astype(float)
                y_true[y_true == 0] = np.nan

            return np.nanmean(np.abs((y_true - y_pred) / y_true), axis=0) * 100

        ###########################
        # compute RMSE, MAE, MAPE #
        ###########################

        from sklearn.metrics import mean_squared_error, mean_absolute_error
        predicted = ML.predict(test_X)  # returns 2D array (spectral samples x bands), e.g. 640 x 6
        # NOTE: 'raw_values': RMSE is column-wise computed
        #       => yields the same result as one would compute the RMSE band by band
        rmse = np.sqrt(mean_squared_error(test_Y, predicted, multioutput='raw_values')).astype(np.float32)
        mae = mean_absolute_error(test_Y, predicted, multioutput='raw_values').astype(np.float32)
        mape = mean_absolute_percentage_error(test_Y, predicted).astype(np.float32)

        # append some metadata to the returned estimator
        ML.scores = dict(train=ML.score(train_X, train_Y),
                         test=ML.score(test_X, test_Y))  # r2 scores
        ML.rmse_per_band = list(rmse)
        ML.mae_per_band = list(mae)
        ML.mape_per_band = list(mape)
        from .version import __version__, __versionalias__
        ML.spechomo_version = __version__
        ML.spechomo_versionalias = __versionalias__

        # convert float64 attributes to float32 to save memory (affects <0.05% of homogenized pixels by 1 DN)
        # NOTE(review): the Pipeline branch has no AttributeError guard, unlike the else branch - it would raise
        #               if the final estimator lacks one of the attributes; confirm all pipeline estimators have them
        for attr in ['coef_', 'intercept_', 'singular_', '_residues']:
            if isinstance(ML, Pipeline):
                # noinspection PyProtectedMember
                setattr(ML._final_estimator, attr, getattr(ML._final_estimator, attr).astype(np.float32))
            else:
                try:
                    setattr(ML, attr, getattr(ML, attr).astype(np.float32))
                except AttributeError:
                    pass

        return ML

544 

545 def create_classifiers(self, outDir, method='LR', n_clusters=50, sam_classassignment=False, CPUs=None, 

546 max_distance=options['classifiers']['trainspec_filtering']['max_distance'], 

547 max_angle=options['classifiers']['trainspec_filtering']['max_angle'], 

548 **kwargs): 

549 # type: (str, str, int, bool, int, Union[int, str], Union[int, str], dict) -> None 

550 """Create cluster classifiers for all combinations of the reference cubes given in __init__(). 

551 

552 :param outDir: output directory for the created cluster classifier collections 

553 :param method: type of machine learning classifiers to be included in classifier collections 

554 'LR': Linear Regression 

555 'RR': Ridge Regression 

556 'QR': Quadratic Regression 

557 'RFR': Random Forest Regression (50 trees with maximum depth of 3 by default) 

558 :param n_clusters: number of clusters to be used for KMeans clustering 

559 :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers 

560 True: use the minimal spectral angle to assign classes to cluster centers 

561 :param CPUs: number of CPUs to be used for KMeans clustering 

562 :param max_distance: maximum spectral distance allowed during filtering of training spectra 

563 - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra 

564 :param max_angle: maximum spectral angle allowed during filtering of training spectra 

565 - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra 

566 :param kwargs: keyword arguments to be passed to machine learner 

567 """ 

568 from sklearn.model_selection import train_test_split 

569 

570 # validate and set defaults 

571 if not os.path.isdir(outDir): 

572 os.makedirs(outDir) 

573 

574 if method == 'RFR': 

575 if n_clusters > 1: 

576 self.logger.warning("The spectral homogenization method 'Random Forest Regression' does not allow " 

577 "spectral sub-clustering. Setting 'n_clusters' to 1.") 

578 n_clusters = 1 

579 

580 kwargs.update(dict( 

581 n_jobs=kwargs.get('n_jobs', CPUs), 

582 n_estimators=kwargs.get('n_estimators', options['classifiers']['RFR']['n_trees']), 

583 max_depth=kwargs.get('max_depth', options['classifiers']['RFR']['max_depth']), 

584 max_features=kwargs.get('max_features', options['classifiers']['RFR']['max_features']) 

585 )) 

586 

587 # build the classifier collections with separate classifiers for each cluster 

588 for src_cube in self.refcubes: # type: RefCube 

589 cls_collection = nested_dict() 

590 

591 # get cluster labels for each source cube separately (as they depend on source spectral characteristics) 

592 clusterlabels_src_cube, spectral_distances, spectral_angles = \ 

593 self._get_cluster_labels_for_source_refcube(src_cube, n_clusters, CPUs, 

594 sam_classassignment=sam_classassignment, 

595 return_spectral_distances=True, 

596 return_spectral_angles=True) 

597 

598 for tgt_cube in self.refcubes: 

599 if (src_cube.satellite, src_cube.sensor) == (tgt_cube.satellite, tgt_cube.sensor): 

600 continue 

601 self.logger.info("Creating %s cluster classifier to predict %s %s from %s %s..." 

602 % (method, tgt_cube.satellite, tgt_cube.sensor, src_cube.satellite, src_cube.sensor)) 

603 

604 src_derived_LBAs = self._get_derived_LayerBandsAssignments(src_cube.satellite, src_cube.sensor) 

605 tgt_derived_LBAs = self._get_derived_LayerBandsAssignments(tgt_cube.satellite, tgt_cube.sensor) 

606 

607 for src_LBA, tgt_LBA in product(src_derived_LBAs, tgt_derived_LBAs): 

608 self.logger.debug('Creating %s cluster classifier for LBA %s => %s...' 

609 % (method, '_'.join(src_LBA), '_'.join(tgt_LBA))) 

610 

611 # Get center wavelength positions 

612 # NOTE: they cannot be taken from RefCube instances because they always represent after-AC-LBAs 

613 src_wavelengths = list(RSR(src_cube.satellite, src_cube.sensor, LayerBandsAssignment=src_LBA).wvl) 

614 tgt_wavelengths = list(RSR(tgt_cube.satellite, tgt_cube.sensor, LayerBandsAssignment=tgt_LBA).wvl) 

615 

616 # Get training data for source and target image according to the given LayerBandsAssignments 

617 # e.g., source: Landsat 7 image in LBA 1__2__3__4__5__7 and target L8 in 1__2__3__4__5__6__7 

618 df_src_spectra_allclust = src_cube.get_spectra_dataframe(src_LBA) 

619 df_tgt_spectra_allclust = tgt_cube.get_spectra_dataframe(tgt_LBA) 

620 

621 # assign clusterlabel and spectral distance to the cluster center to each spectrum 

622 for df in [df_src_spectra_allclust, df_tgt_spectra_allclust]: 

623 df.insert(0, 'cluster_label', clusterlabels_src_cube) 

624 df.insert(1, 'spectral_distance', spectral_distances) 

625 df.insert(2, 'spectral_angle', spectral_angles) 

626 

627 # remove spectra with cluster label -9999 

628 # (clusters with too few spectra that are set to nodata in the refcube) 

629 df_src_spectra_allclust = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label != -9999] 

630 df_tgt_spectra_allclust = df_tgt_spectra_allclust[df_tgt_spectra_allclust.cluster_label != -9999] 

631 

632 # remove spectra with -9999 in all bands that HAVE a valid clusterlabel 

633 # NOTE: This may happen because the cluster labels are computed on the source cube only and may be 

634 # different on the target cube. 

635 sub_bad = df_tgt_spectra_allclust[df_tgt_spectra_allclust.iloc[:, 3] == -9999].copy() 

636 if not sub_bad.empty: 

637 idx_bad = sub_bad[np.std(sub_bad.iloc[:, 3:], axis=1) == 0].index 

638 if not idx_bad.empty: 

639 df_src_spectra_allclust = df_src_spectra_allclust.drop(idx_bad) 

640 df_tgt_spectra_allclust = df_tgt_spectra_allclust.drop(idx_bad) 

641 

642 # ensure source and target spectra do not contain nodata values (would affect classifiers) 

643 assert src_cube.data.nodata is None or src_cube.data.nodata not in df_src_spectra_allclust.values 

644 assert tgt_cube.data.nodata is None or tgt_cube.data.nodata not in df_tgt_spectra_allclust.values 

645 

646 for clusterlabel in range(n_clusters): 

647 self.logger.debug('Creating %s classifier for cluster %s...' % (method, clusterlabel)) 

648 

649 if method == 'RFR': 

650 df_src_spectra_best = df_src_spectra_allclust.sample(10000, random_state=0, replace=True) 

651 df_tgt_spectra_best = df_tgt_spectra_allclust.sample(10000, random_state=0, replace=True) 

652 else: 

653 if n_clusters == 1: 

654 max_distance, max_angle = '100%', '100%' 

655 

656 df_src_spectra_best, df_tgt_spectra_best = \ 

657 self._extract_best_spectra_from_cluster( 

658 clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust, 

659 max_distance=max_distance, max_angle=max_angle) 

660 

661 # Set train and test variables for the classifier 

662 src_spectra_curlabel = df_src_spectra_best.values[:, 3:] 

663 tgt_spectra_curlabel = df_tgt_spectra_best.values[:, 3:] 

664 

665 train_src, test_src, train_tgt, test_tgt = \ 

666 train_test_split(src_spectra_curlabel, tgt_spectra_curlabel, 

667 test_size=0.4, shuffle=True, random_state=0) 

668 

669 # train the learner and add metadata 

670 ML = self.train_machine_learner(train_src, train_tgt, test_src, test_tgt, method, **kwargs) 

671 # noinspection PyTypeChecker 

672 ML = self._add_metadata_to_machine_learner( 

673 ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths, 

674 train_src, train_tgt, n_clusters, clusterlabel) 

675 

676 # append to classifier collection 

677 cls_collection[ 

678 '__'.join(src_LBA)][tgt_cube.satellite, tgt_cube.sensor][ 

679 '__'.join(tgt_LBA)][clusterlabel] = ML 

680 

681 # dump to disk 

682 fName_cls = get_filename_classifier_collection(method, src_cube.satellite, src_cube.sensor, 

683 n_clusters=n_clusters, **kwargs) 

684 

685 with open(os.path.join(outDir, fName_cls), 'wb') as outF: 

686 dill.dump(cls_collection.to_dict(), outF, 

687 protocol=dill.HIGHEST_PROTOCOL - 1) # ensures some downwards compatibility 

688 

def _get_cluster_labels_for_source_refcube(self, src_cube, n_clusters, CPUs, sam_classassignment=False,
                                           return_spectral_distances=False, return_spectral_angles=False):
    """Compute cluster labels for the given source reference cube.

    NOTE: - We use the GMS L1C bands (without atmospheric bands and PAN-band) for clustering.
          - Clustering is only performed on the source cube because the source sensor spectral information
            is later used to assign the correct homogenization classifier to each pixel.

    :param src_cube:                    source reference cube to be clustered
    :param n_clusters:                  number of clusters to compute
    :param CPUs:                        number of CPUs to use for clustering
    :param sam_classassignment:         False: use minimal euclidian distance to assign classes to cluster centers
                                        True: use the minimal spectral angle to assign classes to cluster centers
    :param return_spectral_distances:   also return per-spectrum distances to the assigned cluster center
    :param return_spectral_angles:      also return per-spectrum angles to the assigned cluster center
    :return:    tuple of (labels[, spectral_distances][, spectral_angles]), each including nodata positions
    """
    self.logger.info('Clustering %s %s reference cube (%s clusters)...'
                     % (src_cube.satellite, src_cube.sensor, n_clusters))

    # restrict clustering to the after-AC bands without PAN/thermal, sorted by center wavelength
    lba_for_clustering = get_LayerBandsAssignment(src_cube.satellite, src_cube.sensor,
                                                  no_pan=True, no_thermal=True, sort_by_cwl=True, after_ac=True)
    band_subset = src_cube.get_band_combination(lba_for_clustering)

    kmeans = KMeansRSImage(band_subset, n_clusters=n_clusters,
                           sam_classassignment=sam_classassignment, CPUs=CPUs)
    kmeans.compute_clusters()

    # assemble the requested outputs (labels always come first)
    outputs = [kmeans.labels_with_nodata]
    if return_spectral_distances:
        outputs.append(kmeans.spectral_distances_with_nodata)
    if return_spectral_angles:
        outputs.append(kmeans.spectral_angles_with_nodata)

    return tuple(outputs)

721 

722 def _extract_best_spectra_from_cluster(self, clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust, 

723 max_distance, max_angle): 

724 # NOTE: We exclude the noisy spectra with the largest spectral distances to their cluster 

725 # center here (random spectra from within the upper 40 %) 

726 assert len(df_src_spectra_allclust.index) == len(df_tgt_spectra_allclust.index), \ 

727 'Source and target spectra dataframes must have the same number of spectral samples.' 

728 

729 df_src_spectra = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label == clusterlabel] 

730 

731 # max_dist = np.percentile(df_src_spectra.spectral_distance, max_distance) 

732 # max_angle = np.percentile(df_src_spectra.spectral_angle, max_angle_percent) 

733 

734 if isinstance(max_angle, str): 

735 max_angle = np.percentile(df_src_spectra.spectral_angle, 

736 int(max_angle.split('%')[0].strip())) 

737 

738 tmp = df_src_spectra[df_src_spectra.spectral_angle <= max_angle] 

739 if len(tmp.index) > 10: 

740 df_src_spectra = tmp 

741 else: 

742 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(10) 

743 self.logger.warning('Had to choose spectra with SA up to %.1f degrees for cluster #%s.' 

744 % (np.max(df_src_spectra['spectral_angle']), clusterlabel)) 

745 

746 if isinstance(max_distance, str): 

747 max_distance = np.percentile(df_src_spectra.spectral_distance, 

748 int(max_distance.split('%')[0].strip())) 

749 

750 tmp = df_src_spectra[df_src_spectra.spectral_distance <= max_distance] 

751 if len(tmp.index) > 10: 

752 df_src_spectra = tmp 

753 else: 

754 df_src_spectra = df_src_spectra.sort_values(by='spectral_distance').head(10) 

755 self.logger.warning('Had to choose spectra with SD up to %.1f for cluster #%s.' 

756 % (np.max(df_src_spectra['spectral_distance']), clusterlabel)) 

757 

758 if len(df_src_spectra.index) > 1700: 

759 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(1700) 

760 

761 df_tgt_spectra = df_tgt_spectra_allclust.loc[df_src_spectra.index, :] 

762 

763 return df_src_spectra, df_tgt_spectra 

764 

765 @staticmethod 

766 def _add_metadata_to_machine_learner(ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths, 

767 src_train_spectra, tgt_train_spectra, n_clusters, clusterlabel): 

768 # compute cluster center for source spectra (only on those spectra used for model training) 

769 cluster_center = np.mean(src_train_spectra, axis=0) 

770 cluster_median = np.median(src_train_spectra, axis=0) 

771 sample_spectra = DataFrame(src_train_spectra).sample(100, replace=True, random_state=20).values 

772 

773 ML.src_satellite = src_cube.satellite 

774 ML.tgt_satellite = tgt_cube.satellite 

775 ML.src_sensor = src_cube.sensor 

776 ML.tgt_sensor = tgt_cube.sensor 

777 ML.src_LBA = src_LBA 

778 ML.tgt_LBA = tgt_LBA 

779 ML.src_n_bands = len(ML.src_LBA) 

780 ML.tgt_n_bands = len(ML.tgt_LBA) 

781 ML.src_wavelengths = list(np.array(src_wavelengths).astype(np.float32)) 

782 ML.tgt_wavelengths = list(np.array(tgt_wavelengths).astype(np.float32)) 

783 ML.n_clusters = n_clusters 

784 ML.clusterlabel = clusterlabel 

785 ML.cluster_center = cluster_center.astype(src_cube.data.dtype) 

786 ML.cluster_median = cluster_median.astype(src_cube.data.dtype) 

787 ML.cluster_sample_spectra = sample_spectra.astype(np.int16) # scaled between 0 and 10000 

788 ML.tgt_cluster_center = np.mean(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype) 

789 ML.tgt_cluster_median = np.median(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype) 

790 

791 assert len(ML.src_LBA) == len(ML.src_wavelengths) 

792 assert len(ML.tgt_LBA) == len(ML.tgt_wavelengths) 

793 

794 return ML 

795 

796 

def get_machine_learner(method='LR', **init_params):
    # type: (str, dict) -> Union[LinearRegression, Ridge, Pipeline]
    """Get an instance of a machine learner.

    :param method:      'LR':  Linear Regression
                        'RR':  Ridge Regression
                        'QR':  Quadratic Regression
                        'RFR': Random Forest Regression (50 trees)
    :param init_params: parameters to be passed to __init__() function of the returned machine learner model.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures

    # map each method code to a factory that produces a fresh learner instance
    learner_factories = {
        'LR': lambda: LinearRegression(**init_params),
        'RR': lambda: Ridge(**init_params),
        'QR': lambda: make_pipeline(PolynomialFeatures(degree=2), LinearRegression(**init_params)),
        'RFR': lambda: RandomForestRegressor(**init_params),
    }

    if method not in learner_factories:
        raise ValueError("Unknown machine learner method code '%s'." % method)

    return learner_factories[method]()

822 

823 

def get_filename_classifier_collection(method, src_satellite, src_sensor, n_clusters=1, **cls_kwinit):
    """Return the dump file name of a classifier collection for the given method and source sensor setup.

    :param method:          machine learning method code ('LR', 'RR', 'QR', 'RFR', ...)
    :param src_satellite:   source satellite name
    :param src_sensor:      source sensor name
    :param n_clusters:      number of clusters the collection was created for
    :param cls_kwinit:      classifier init parameters (RFR requires 'n_estimators')
    """
    if method == 'RR':
        method_tag = method + '_alpha1.0'  # TODO add to config
    elif method == 'RFR':
        assert 'n_estimators' in cls_kwinit
        method_tag = method + '_trees%s' % cls_kwinit['n_estimators']
    else:
        method_tag = method

    name_parts = ['%s_clust%s' % (method_tag, n_clusters), src_satellite, src_sensor]
    return '__'.join(name_parts) + '.dill'

832 

833 

834# def get_classifier_filename(method, src_satellite, src_sensor, src_LBA_name, tgt_satellite, tgt_sensor, tgt_LBA_name): 

835# return '__'.join([method, src_satellite, src_sensor, src_LBA_name]) + \ 

836# '__to__' + '__'.join([tgt_satellite, tgt_sensor, tgt_LBA_name]) + '.dill'