Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3# spechomo, Spectral homogenization of multispectral satellite data
4#
5# Copyright (C) 2019-2021
6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de)
7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam,
8# Germany (https://www.gfz-potsdam.de/)
9#
10# This software was developed within the context of the GeoMultiSens project funded
11# by the German Federal Ministry of Education and Research
12# (project grant code: 01 IS 14 010 A-C).
13#
14# Licensed under the Apache License, Version 2.0 (the "License");
15# you may not use this file except in compliance with the License.
16# You may obtain a copy of the License at
17#
18# http://www.apache.org/licenses/LICENSE-2.0
19#
20# Please note the following exception: `spechomo` depends on tqdm, which is
21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files
22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore".
23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE.
24#
25# Unless required by applicable law or agreed to in writing, software
26# distributed under the License is distributed on an "AS IS" BASIS,
27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28# See the License for the specific language governing permissions and
29# limitations under the License.
31import os
32import re
33from glob import glob
34from multiprocessing import cpu_count
35from typing import List, Tuple, Union, Dict, TYPE_CHECKING # noqa F401 # flake8 issue
36import logging # noqa F401 # flake8 issue
37from itertools import product
39import dill
40import numpy as np
41from nested_dict import nested_dict
42from pandas import DataFrame
43from tqdm import tqdm
44from geoarray import GeoArray
45from pyrsr import RSR
46from pyrsr.sensorspecs import get_LayerBandsAssignment
48if TYPE_CHECKING:
49 # avoid the sklearn imports on module level to get rid of static TLS ImportError
50 from sklearn.ensemble import RandomForestRegressor
51 from sklearn.linear_model import LinearRegression, Ridge
52 from sklearn.pipeline import Pipeline
54from .clustering import KMeansRSImage
55from .resampling import SpectralResampler
56from .training_data import RefCube
57from .logging import SpecHomo_Logger
58from .options import options
class ReferenceCube_Generator(object):
    """Class for creating reference cubes that are later used as training data for SpecHomo_Classifier."""

    def __init__(self, filelist_refs, tgt_sat_sen_list=None, dir_refcubes='', n_clusters=10, tgt_n_samples=1000,
                 v=False, logger=None, CPUs=None, dir_clf_dump=''):
        # type: (List[str], List[Tuple[str, str]], str, int, int, bool, logging.Logger, Union[None, int], str) -> None
        """Initialize ReferenceCube_Generator.

        :param filelist_refs:       list of (hyperspectral) reference images,
                                    representing BOA reflectance, scaled between 0 and 10000
        :param tgt_sat_sen_list:    list of satellite/sensor tuples containing those sensors for which the reference
                                    cube is to be computed, e.g. [('Landsat-8', 'OLI_TIRS',), ('Landsat-5', 'TM')]
        :param dir_refcubes:        output directory for the generated reference cube
        :param n_clusters:          number of clusters to be used for clustering the input images (KMeans)
        :param tgt_n_samples:       number of spectra to be collected from each input image
        :param v:                   verbose mode
        :param logger:              instance of logging.Logger()
        :param CPUs:                number of CPUs to use for computation
        :param dir_clf_dump:        directory where to store the serialized KMeans classifier
        """
        if not filelist_refs:
            raise ValueError('The given list of reference images is empty.')

        # args + kwargs
        self.ims_ref = [filelist_refs, ] if isinstance(filelist_refs, str) else filelist_refs
        self.tgt_sat_sen_list = tgt_sat_sen_list or [
            ('Landsat-8', 'OLI_TIRS'),
            ('Landsat-7', 'ETM+'),
            ('Landsat-5', 'TM'),
            ('Sentinel-2A', 'MSI'),
            # ('Terra', 'ASTER'),  # currently does not work
            ('SPOT-4', 'HRVIR1'),
            ('SPOT-4', 'HRVIR2'),
            ('SPOT-5', 'HRG1'),
            ('SPOT-5', 'HRG2'),
            ('RapidEye-5', 'MSI')
        ]
        self.dir_refcubes = os.path.abspath(dir_refcubes) if dir_refcubes else ''
        self.n_clusters = n_clusters
        self.tgt_n_samples = tgt_n_samples
        self.v = v
        self.logger = logger or SpecHomo_Logger(__name__)  # must be pickable
        self.CPUs = CPUs or cpu_count()
        self.dir_clf_dump = dir_clf_dump

        # privates
        # pre-create one empty RefCube per target sensor; filled lazily via the 'refcubes' property
        self._refcubes = \
            {(sat, sen): RefCube(satellite=sat, sensor=sen,
                                 LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen))
             for sat, sen in self.tgt_sat_sen_list}

        if dir_refcubes and not os.path.isdir(self.dir_refcubes):
            os.makedirs(self.dir_refcubes)

    @property
    def refcubes(self):
        # type: () -> Dict[Tuple[str, str], RefCube]
        """Return a dict holding instances of RefCube for each target satellite / sensor of self.tgt_sat_sen_list."""
        if not self._refcubes:

            # fill self._ref_cubes with GeoArray instances of already existing reference cubes read from disk
            if self.dir_refcubes:
                for path_refcube in glob(os.path.join(self.dir_refcubes, 'refcube__*.bsq')):
                    # TODO check if that really works
                    # check if current refcube path matches the target refcube specifications
                    identifier = re.search('refcube__(.*).bsq', os.path.basename(path_refcube)).group(1)
                    sat, sen, nclust_str, nsamp_str = identifier.split('__')  # type: str
                    # FIXED: the sample count was previously parsed with 'nclust' as separator
                    #        (int(nsamp_str.split('nclust')[1])), which raised an IndexError for any
                    #        refcube filename of the form 'refcube__<sat>__<sen>__nclust<N>__nsamp<M>.bsq'
                    nclust, nsamp = int(nclust_str.split('nclust')[1]), int(nsamp_str.split('nsamp')[1])
                    correct_specs = all([(sat, sen) in self.tgt_sat_sen_list,
                                         nclust == self.n_clusters,
                                         nsamp == self.tgt_n_samples])

                    # import the existing ref cube if it matches the target refcube specs
                    if correct_specs:
                        self._refcubes[(sat, sen)] = \
                            RefCube(satellite=sat, sensor=sen, filepath=path_refcube,
                                    LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(sat, sen))

        return self._refcubes

    @staticmethod
    def _get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen):
        # type: (str, str) -> list
        """Get the LayerBandsAssignment for the specified target sensor.

        NOTE: The returned bands list always contains all possible bands. Panchromatic bands or bands not available
              after atmospheric correction are removed. Further band selections are later done using np.take().
        NOTE: We don't use L1A here because the signal of the additional bands at L1A is not predictable by spectral
              harmonization as these bands are not driven by surface albedo but by atmospheric conditions (945, 1373nm).

        :param tgt_sat:     target satellite
        :param tgt_sen:     target sensor
        :return:
        """
        return get_LayerBandsAssignment(tgt_sat, tgt_sen,
                                        no_pan=False, no_thermal=True, sort_by_cwl=True, after_ac=True)

    @staticmethod
    def _get_tgt_RSR_object(tgt_sat, tgt_sen):
        # type: (str, str) -> RSR
        """Get an RSR instance containing the spectral response functions for the target sensor.

        :param tgt_sat:     target satellite
        :param tgt_sen:     target sensor
        :return:
        """
        return RSR(tgt_sat, tgt_sen, no_pan=False, no_thermal=True, after_ac=True)

    def generate_reference_cubes(self, fmt_out='ENVI', try_read_dumped_clf=True, sam_classassignment=False,
                                 max_distance='80%', max_angle=6, nmin_unique_spectra=50, alg_nodata='radical',
                                 progress=True):
        # type: (str, bool, bool, Union[int, float, str], Union[int, float, str], int, str, bool) -> ReferenceCube_Generator.refcubes  # noqa
        """Generate reference spectra from all hyperspectral input images.

        Workflow:
        1. Clustering/classification of hyperspectral images and selection of a given number of random signatures
           (a. Spectral downsampling to lower spectral resolution (speedup))
           b. KMeans clustering
           c. Selection of the same number of signatures from each cluster to avoid unequal amount of training data.
        2. Spectral resampling of the selected hyperspectral signatures (for each input image)
        3. Add resampled spectra to reference cubes for each target sensor and write cubes to disk

        :param fmt_out:             output format (GDAL driver code)
        :param try_read_dumped_clf: try to read a previously serialized KMeans classifier from disk
                                    (massively speeds up the RefCube generation)
        :param sam_classassignment: False: use minimal Euclidean distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param max_distance:        spectra with a larger spectral distance than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral distance is computed as 20%
                                      percentile within each cluster
        :param max_angle:           spectra with a larger spectral angle than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral angle is computed as 20%
                                      percentile within each cluster
        :param nmin_unique_spectra: in case a cluster has less than the given number,
                                    do not include it in the reference cube (default: 50)
        :param alg_nodata:          algorithm how to deal with pixels where the spectral bands of the source image
                                    contain nodata within the spectral response of a target band
                                    'radical':      set output band to nodata
                                    'conservative': use existing spectral information and ignore nodata
                                                    (might alter the output spectral information,
                                                     e.g., at spectral absorption bands)
        :param progress:            show progress bar (default: True)
        :return:                    np.array: [tgt_n_samples x images x spectral bands of the target sensor]
        """
        for im in self.ims_ref:  # type: Union[str, GeoArray]
            # TODO implement check if current image is already included in the refcube -> skip in that case
            src_im = GeoArray(im)

            # get random spectra of the original (hyperspectral) image, equally distributed over all computed clusters
            baseN = os.path.splitext(os.path.basename(im))[0] if isinstance(im, str) else im.basename
            unif_random_spectra = \
                self.cluster_image_and_get_uniform_spectra(src_im,
                                                           try_read_dumped_clf=try_read_dumped_clf,
                                                           sam_classassignment=sam_classassignment,
                                                           max_distance=max_distance,
                                                           max_angle=max_angle,
                                                           nmin_unique_spectra=nmin_unique_spectra,
                                                           progress=progress,
                                                           basename_clf_dump=baseN).astype(np.int16)

            # resample the set of random spectra to match the spectral characteristics of all target sensors
            for tgt_sat, tgt_sen in self.tgt_sat_sen_list:
                # perform spectral resampling
                self.logger.info('Performing spectral resampling to match %s %s specifications...'
                                 % (tgt_sat, tgt_sen))
                unif_random_spectra_rsp = self.resample_spectra(
                    unif_random_spectra,
                    src_cwl=np.array(src_im.meta.band_meta['wavelength'], dtype=float).flatten(),
                    tgt_rsr=self._get_tgt_RSR_object(tgt_sat, tgt_sen),
                    nodataVal=src_im.nodata,
                    alg_nodata=alg_nodata)

                # add the spectra as GeoArray instance to the in-mem ref cubes
                refcube = self.refcubes[(tgt_sat, tgt_sen)]  # type: RefCube
                refcube.add_spectra(unif_random_spectra_rsp, src_imname=src_im.basename,
                                    LayerBandsAssignment=self._get_tgt_LayerBandsAssignment(tgt_sat, tgt_sen))

                # update the reference cubes on disk
                if self.dir_refcubes:
                    refcube.save(path_out=os.path.join(self.dir_refcubes, 'refcube__%s__%s__nclust%s__nsamp%s.bsq'
                                                       % (tgt_sat, tgt_sen, self.n_clusters, self.tgt_n_samples)),
                                 fmt=fmt_out)

        return self.refcubes

    def cluster_image_and_get_uniform_spectra(self, im, downsamp_sat=None, downsamp_sen=None, basename_clf_dump='',
                                              try_read_dumped_clf=True, sam_classassignment=False, max_distance='80%',
                                              max_angle=6, nmin_unique_spectra=50, progress=False):
        # type: (Union[str, GeoArray, np.ndarray], str, str, str, bool, bool, Union[int, float, str], Union[int, float, str], int, bool) -> np.ndarray  # noqa
        """Compute KMeans clusters for the given image and return an array of uniform random samples.

        :param im:                  image to be clustered
        :param downsamp_sat:        satellite code used for intermediate image dimensionality reduction (input image is
                                    spectrally resampled to this satellite before it is clustered). requires
                                    downsamp_sen. If it is None, no intermediate downsampling is performed.
        :param downsamp_sen:        sensor code used for intermediate image dimensionality reduction
                                    (requires downsamp_sat)
        :param basename_clf_dump:   basename of serialized KMeans classifier
        :param try_read_dumped_clf: try to read a previously serialized KMeans classifier from disk
                                    (massively speeds up the RefCube generation)
        :param sam_classassignment: False: use minimal Euclidean distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param max_distance:        spectra with a larger spectral distance than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral distance is computed as 20%
                                      percentile within each cluster
        :param max_angle:           spectra with a larger spectral angle than the given value will be excluded from
                                    random sampling.
                                    - if given as string like '20%', the maximum spectral angle is computed as 20%
                                      percentile within each cluster
        :param nmin_unique_spectra: in case a cluster has less than the given number,
                                    do not include it in the reference cube (default: 50)
        :param progress:            whether to show progress bars or not
        :return:                    2D array (rows: tgt_n_samples, columns: spectral information / bands)
        """
        # input validation
        if (downsamp_sat and not downsamp_sen) or (downsamp_sen and not downsamp_sat):
            # FIXED: the error message previously referred to non-existing parameter names
            #        ('spec_rsp_sat' / 'spec_rsp_sen' instead of 'downsamp_sat' / 'downsamp_sen')
            raise ValueError("The parameters 'downsamp_sat' and 'downsamp_sen' must both be provided or completely "
                             "omitted.")

        im2clust = GeoArray(im)

        # get a KMeans classifier for the hyperspectral image
        path_clf = os.path.join(self.dir_clf_dump, '%s__KMeansClf.dill' % basename_clf_dump)
        if not sam_classassignment:
            path_clustermap = os.path.join(self.dir_clf_dump,
                                           '%s__KMeansClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters))
        else:
            path_clustermap = os.path.join(self.dir_clf_dump,
                                           '%s__SAMClusterMap_nclust%d.bsq' % (basename_clf_dump, self.n_clusters))

        if try_read_dumped_clf and os.path.isfile(path_clf):
            # read the previously dumped classifier from disk
            self.logger.info('Reading previously serialized KMeans classifier from %s...' % path_clf)
            kmeans = KMeansRSImage.from_disk(path_clf=path_clf, im=im2clust)

        else:
            # create KMeans classifier
            # first, perform spectral resampling to Sentinel-2 to reduce dimensionality (speedup)
            if downsamp_sat and downsamp_sen:
                # NOTE: The KMeansRSImage class already reduces the input data to 1 million spectra by default
                tgt_rsr = RSR(satellite=downsamp_sat, sensor=downsamp_sen, no_pan=True, no_thermal=True)
                im2clust = self.resample_image_spectrally(im2clust, tgt_rsr, progress=progress)  # output = int16

            # compute KMeans clusters for the spectrally resampled image
            # NOTE: Nodata values are ignored during KMeans clustering.
            self.logger.info('Computing %s KMeans clusters from the input image %s...'
                             % (self.n_clusters, im2clust.basename))
            kmeans = KMeansRSImage(im2clust, n_clusters=self.n_clusters, sam_classassignment=sam_classassignment,
                                   CPUs=self.CPUs, v=self.v)
            kmeans.compute_clusters()
            kmeans.compute_spectral_distances()

            if self.dir_clf_dump and basename_clf_dump:
                kmeans.save_clustermap(path_clustermap)
                kmeans.dump(path_clf)

        if self.v:
            kmeans.plot_cluster_centers()
            kmeans.plot_cluster_histogram()

        # randomly grab the given number of spectra from each cluster, restricted to the 30 % purest spectra
        # -> no spectra containing nodata values are returned
        self.logger.info('Getting %s random spectra from each cluster...' % (self.tgt_n_samples // self.n_clusters))
        random_samples = kmeans.get_random_spectra_from_each_cluster(samplesize=self.tgt_n_samples // self.n_clusters,
                                                                    max_distance=max_distance,
                                                                    max_angle=max_angle,
                                                                    nmin_unique_spectra=nmin_unique_spectra)

        # combine the spectra (2D arrays) of all clusters to a single 2D array
        self.logger.info('Combining random samples from all clusters.')
        random_samples = np.vstack([random_samples[clusterlabel] for clusterlabel in random_samples])

        return random_samples

    def resample_spectra(self, spectra, src_cwl, tgt_rsr, nodataVal, alg_nodata='radical'):
        # type: (Union[GeoArray, np.ndarray], Union[list, np.array], RSR, int, str) -> np.ndarray
        """Perform spectral resampling of the given image to match the given spectral response functions.

        :param spectra:     2D array (rows: spectral samples; columns: spectral information / bands)
        :param src_cwl:     central wavelength positions of input spectra
        :param tgt_rsr:     target relative spectral response functions to be used for spectral resampling
        :param nodataVal:   nodata value of the given spectra to be ignored during resampling
        :param alg_nodata:  algorithm how to deal with pixels where the spectral bands of the source image
                            contain nodata within the spectral response of a target band
                            'radical':      set output band to nodata
                            'conservative': use existing spectral information and ignore nodata
                                            (might alter the output spectral information,
                                             e.g., at spectral absorption bands)
        :return:
        """
        spectra = GeoArray(spectra)

        # perform spectral resampling of input image to match spectral properties of target sensor
        self.logger.info('Performing spectral resampling to match spectral properties of %s %s...'
                         % (tgt_rsr.satellite, tgt_rsr.sensor))

        SR = SpectralResampler(src_cwl, tgt_rsr)
        spectra_rsp = SR.resample_spectra(spectra,
                                          chunksize=200,
                                          nodataVal=nodataVal,
                                          alg_nodata=alg_nodata,
                                          CPUs=self.CPUs)

        return spectra_rsp

    def resample_image_spectrally(self, src_im, tgt_rsr, src_nodata=None, alg_nodata='radical', progress=False):
        # type: (Union[str, GeoArray], RSR, Union[float, int], str, bool) -> Union[GeoArray, None]
        """Perform spectral resampling of the given image to match the given spectral response functions.

        :param src_im:      source image to be resampled
        :param tgt_rsr:     target relative spectral response functions to be used for spectral resampling
        :param src_nodata:  source image nodata value
        :param alg_nodata:  algorithm how to deal with pixels where the spectral bands of the source image
                            contain nodata within the spectral response of a target band
                            'radical':      set output band to nodata
                            'conservative': use existing spectral information and ignore nodata (might alter the output
                                            spectral information, e.g., at spectral absorption bands)
        :param progress:    show progress bar (default: false)
        :return:
        """
        # handle src_im provided as file path or GeoArray instance
        if isinstance(src_im, str):
            im_name = os.path.basename(src_im)
            im_gA = GeoArray(src_im, nodata=src_nodata)
        else:
            im_name = src_im.basename
            im_gA = src_im

        # read input image
        self.logger.info('Reading the input image %s...' % im_name)
        im_gA.cwl = np.array(im_gA.meta.band_meta['wavelength'], dtype=float).flatten()

        # perform spectral resampling of input image to match spectral properties of target sensor
        self.logger.info('Performing spectral resampling to match spectral properties of %s %s...'
                         % (tgt_rsr.satellite, tgt_rsr.sensor))
        SR = SpectralResampler(im_gA.cwl, tgt_rsr)

        tgt_im = GeoArray(np.zeros((*im_gA.shape[:2], len(tgt_rsr.bands)), dtype=np.int16),
                          geotransform=im_gA.gt,
                          projection=im_gA.prj,
                          nodata=im_gA.nodata)
        tgt_im.meta.band_meta['wavelength'] = list(tgt_rsr.wvl)
        # band names are zero-padded to two digits, e.g. '8' -> 'B08', '11' -> 'B11'
        tgt_im.bandnames = ['B%s' % i if len(i) == 2 else 'B0%s' % i for i in tgt_rsr.LayerBandsAssignment]

        tiles = im_gA.tiles((1000, 1000))  # use tiles to save memory
        for ((rS, rE), (cS, cE)), tiledata in (tqdm(tiles) if progress else tiles):
            tgt_im[rS: rE + 1, cS: cE + 1, :] = SR.resample_image(tiledata.astype(np.int16),
                                                                  nodataVal=im_gA.nodata,
                                                                  alg_nodata=alg_nodata,
                                                                  CPUs=self.CPUs)

        return tgt_im
419class ClusterClassifier_Generator(object):
420 """Class for creating collections of machine learning classifiers that can be used for spectral homogenization."""
422 def __init__(self, list_refcubes, logger=None):
423 # type: (List[Union[str, RefCube]], logging.Logger) -> None
424 """Get an instance of Classifier_Generator.
426 :param list_refcubes: list of RefCube instances or paths for which the classifiers are to be created.
427 :param logger: instance of logging.Logger()
428 """
429 if not list_refcubes:
430 raise ValueError('The given list of RefCube instances or paths is empty.')
432 self.refcubes = [RefCube(inRC) if isinstance(inRC, str) else inRC for inRC in list_refcubes]
433 self.logger = logger or SpecHomo_Logger(__name__) # must be pickable
435 # validation
436 shapes = list(set([RC.data.shape[:2] for RC in self.refcubes]))
437 if not len(shapes) == 1:
438 raise ValueError('Unequal dimensions of the given reference cubes: %s' % shapes)
440 @staticmethod
441 def _get_derived_LayerBandsAssignments(satellite, sensor):
442 """Get a list of possible LayerBandsAssignments in which the spectral training data may be arranged.
444 :param satellite: satellite to return LayerBandsAssignments for
445 :param sensor: sensor to return LayerBandsAssignments for
446 :return: e.g. for Landsat-8 OLI_TIRS:
447 [['1', '2', '3', '4', '5', '9', '6', '7'],
448 ['1', '2', '3', '4', '5', '9', '6', '7', '8'],
449 ['1', '2', '3', '4', '5', '6', '7'],
450 ['1', '2', '3', '4', '5', '6', '7'], ...]
451 """
452 # get bands (after AC)
453 # NOTE: - signal of additional bands at L1A is not predictable by spectral harmonization because these bands
454 # are not driven by surface albedo but by atmospheric conditions (945, 1373 nm)
456 LBAs = []
457 kw = dict(satellite=satellite, sensor=sensor, no_thermal=True, after_ac=True)
458 for lba in [get_LayerBandsAssignment(no_pan=False, sort_by_cwl=True, **kw), # L1C_withPan_cwlSorted
459 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=True, **kw), # L1C_noPan_cwlSorted
460 get_LayerBandsAssignment(no_pan=False, sort_by_cwl=False, **kw), # L1C_withPan_alphabetical
461 get_LayerBandsAssignment(no_pan=True, sort_by_cwl=False, **kw), # L1C_noPan_alphabetical
462 ]:
463 if lba not in LBAs:
464 LBAs.append(lba)
466 return LBAs
468 @staticmethod
469 def train_machine_learner(train_X, train_Y, test_X, test_Y, method, **kwargs):
470 # type: (np.ndarray, np.ndarray, np.ndarray, np.ndarray, str, dict) -> Union[LinearRegression, Ridge, Pipeline, RandomForestRegressor] # noqa E501 (line too long)
471 """Use the given train and test data to train a machine learner and append some accuracy statistics.
473 :param train_X: reference training data
474 :param train_Y: target training data
475 :param test_X: reference test data
476 :param test_Y: target test data
477 :param method: type of machine learning classifiers to be included in classifier collections
478 'LR': Linear Regression
479 'RR': Ridge Regression
480 'QR': Quadratic Regression
481 'RFR': Random Forest Regression (50 trees)
482 :param kwargs: keyword arguments to be passed to the __init__() function of machine learners
483 """
484 from sklearn.pipeline import Pipeline
486 ###################
487 # train the model #
488 ###################
490 ML = get_machine_learner(method, **kwargs)
491 ML.fit(train_X, train_Y)
493 def mean_absolute_percentage_error(y_true, y_pred):
494 y_true, y_pred = np.array(y_true), np.array(y_pred)
496 # avoid division by 0
497 if 0 in y_true:
498 y_true = y_true.astype(float)
499 y_true[y_true == 0] = np.nan
501 return np.nanmean(np.abs((y_true - y_pred) / y_true), axis=0) * 100
503 ###########################
504 # compute RMSE, MAE, MAPE #
505 ###########################
507 from sklearn.metrics import mean_squared_error, mean_absolute_error
508 predicted = ML.predict(test_X) # returns 2D array (spectral samples x bands), e.g. 640 x 6
509 # NOTE: 'raw_values': RMSE is column-wise computed
510 # => yields the same result as one would compute the RMSE band by band
511 rmse = np.sqrt(mean_squared_error(test_Y, predicted, multioutput='raw_values')).astype(np.float32)
512 mae = mean_absolute_error(test_Y, predicted, multioutput='raw_values').astype(np.float32)
513 mape = mean_absolute_percentage_error(test_Y, predicted).astype(np.float32)
515 # predicted_train = ML.predict(train_X)
516 # rmse_train = np.sqrt(mean_squared_error(train_Y, predicted_train, multioutput='raw_values'))
517 # # v2
518 # test_Y_im = spectra2im(test_Y, tgt_rows=int(tgt_data.rows * 0.4), tgt_cols=tgt_data.cols)
519 # pred_im = spectra2im(predicted, tgt_rows=int(tgt_data.rows * 0.4), tgt_cols=tgt_data.cols)
520 # rmse_v2 = np.sqrt(mean_squared_error(test_Y_im[:,:,1], pred_im[:,:,1]))
522 # append some metadata
523 ML.scores = dict(train=ML.score(train_X, train_Y),
524 test=ML.score(test_X, test_Y)) # r2 scores
525 ML.rmse_per_band = list(rmse)
526 ML.mae_per_band = list(mae)
527 ML.mape_per_band = list(mape)
528 from .version import __version__, __versionalias__
529 ML.spechomo_version = __version__
530 ML.spechomo_versionalias = __versionalias__
532 # convert float64 attributes to float32 to save memory (affects <0,05% of homogenized pixels by 1 DN)
533 for attr in ['coef_', 'intercept_', 'singular_', '_residues']:
534 if isinstance(ML, Pipeline):
535 # noinspection PyProtectedMember
536 setattr(ML._final_estimator, attr, getattr(ML._final_estimator, attr).astype(np.float32))
537 else:
538 try:
539 setattr(ML, attr, getattr(ML, attr).astype(np.float32))
540 except AttributeError:
541 pass
543 return ML
    def create_classifiers(self, outDir, method='LR', n_clusters=50, sam_classassignment=False, CPUs=None,
                           max_distance=options['classifiers']['trainspec_filtering']['max_distance'],
                           max_angle=options['classifiers']['trainspec_filtering']['max_angle'],
                           **kwargs):
        # type: (str, str, int, bool, int, Union[int, str], Union[int, str], dict) -> None
        """Create cluster classifiers for all combinations of the reference cubes given in __init__().

        For each source refcube, the source spectra are clustered once; then, for every target refcube and every
        combination of derived LayerBandsAssignments, one machine learner per cluster is trained and the resulting
        collection is serialized to disk via dill (one file per source satellite/sensor).

        :param outDir:              output directory for the created cluster classifier collections
        :param method:              type of machine learning classifiers to be included in classifier collections
                                    'LR':   Linear Regression
                                    'RR':   Ridge Regression
                                    'QR':   Quadratic Regression
                                    'RFR':  Random Forest Regression (50 trees with maximum depth of 3 by default)
        :param n_clusters:          number of clusters to be used for KMeans clustering
        :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers
                                    True: use the minimal spectral angle to assign classes to cluster centers
        :param CPUs:                number of CPUs to be used for KMeans clustering
        :param max_distance:        maximum spectral distance allowed during filtering of training spectra
                                    - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra
        :param max_angle:           maximum spectral angle allowed during filtering of training spectra
                                    - if given as string, e.g., '80%' excludes the worst 20 % of the input spectra
        :param kwargs:              keyword arguments to be passed to machine learner
        """
        from sklearn.model_selection import train_test_split

        # validate and set defaults
        if not os.path.isdir(outDir):
            os.makedirs(outDir)

        if method == 'RFR':
            # RFR handles the whole feature space with a single model, so sub-clustering is disabled
            if n_clusters > 1:
                self.logger.warning("The spectral homogenization method 'Random Forest Regression' does not allow "
                                    "spectral sub-clustering. Setting 'n_clusters' to 1.")
            n_clusters = 1

            # fill in RFR hyperparameters from the package options unless explicitly overridden by the caller
            kwargs.update(dict(
                n_jobs=kwargs.get('n_jobs', CPUs),
                n_estimators=kwargs.get('n_estimators', options['classifiers']['RFR']['n_trees']),
                max_depth=kwargs.get('max_depth', options['classifiers']['RFR']['max_depth']),
                max_features=kwargs.get('max_features', options['classifiers']['RFR']['max_features'])
            ))

        # build the classifier collections with separate classifiers for each cluster
        for src_cube in self.refcubes:  # type: RefCube
            cls_collection = nested_dict()

            # get cluster labels for each source cube separately (as they depend on source spectral characteristics)
            clusterlabels_src_cube, spectral_distances, spectral_angles = \
                self._get_cluster_labels_for_source_refcube(src_cube, n_clusters, CPUs,
                                                            sam_classassignment=sam_classassignment,
                                                            return_spectral_distances=True,
                                                            return_spectral_angles=True)

            for tgt_cube in self.refcubes:
                # a sensor is never predicted from itself
                if (src_cube.satellite, src_cube.sensor) == (tgt_cube.satellite, tgt_cube.sensor):
                    continue
                self.logger.info("Creating %s cluster classifier to predict %s %s from %s %s..."
                                 % (method, tgt_cube.satellite, tgt_cube.sensor, src_cube.satellite, src_cube.sensor))

                src_derived_LBAs = self._get_derived_LayerBandsAssignments(src_cube.satellite, src_cube.sensor)
                tgt_derived_LBAs = self._get_derived_LayerBandsAssignments(tgt_cube.satellite, tgt_cube.sensor)

                # one classifier set per (source LBA, target LBA) combination
                for src_LBA, tgt_LBA in product(src_derived_LBAs, tgt_derived_LBAs):
                    self.logger.debug('Creating %s cluster classifier for LBA %s => %s...'
                                      % (method, '_'.join(src_LBA), '_'.join(tgt_LBA)))

                    # Get center wavelength positions
                    # NOTE: they cannot be taken from RefCube instances because they always represent after-AC-LBAs
                    src_wavelengths = list(RSR(src_cube.satellite, src_cube.sensor, LayerBandsAssignment=src_LBA).wvl)
                    tgt_wavelengths = list(RSR(tgt_cube.satellite, tgt_cube.sensor, LayerBandsAssignment=tgt_LBA).wvl)

                    # Get training data for source and target image according to the given LayerBandsAssignments
                    # e.g., source: Landsat 7 image in LBA 1__2__3__4__5__7 and target L8 in 1__2__3__4__5__6__7
                    df_src_spectra_allclust = src_cube.get_spectra_dataframe(src_LBA)
                    df_tgt_spectra_allclust = tgt_cube.get_spectra_dataframe(tgt_LBA)

                    # assign clusterlabel and spectral distance to the cluster center to each spectrum
                    # (columns 0-2 are metadata; the actual band values start at column 3 from here on)
                    for df in [df_src_spectra_allclust, df_tgt_spectra_allclust]:
                        df.insert(0, 'cluster_label', clusterlabels_src_cube)
                        df.insert(1, 'spectral_distance', spectral_distances)
                        df.insert(2, 'spectral_angle', spectral_angles)

                    # remove spectra with cluster label -9999
                    # (clusters with too few spectra that are set to nodata in the refcube)
                    df_src_spectra_allclust = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label != -9999]
                    df_tgt_spectra_allclust = df_tgt_spectra_allclust[df_tgt_spectra_allclust.cluster_label != -9999]

                    # remove spectra with -9999 in all bands that HAVE a valid clusterlabel
                    # NOTE: This may happen because the cluster labels are computed on the source cube only and may be
                    #       different on the target cube.
                    # (zero standard deviation across bands + first band == -9999 identifies all-nodata rows)
                    sub_bad = df_tgt_spectra_allclust[df_tgt_spectra_allclust.iloc[:, 3] == -9999].copy()
                    if not sub_bad.empty:
                        idx_bad = sub_bad[np.std(sub_bad.iloc[:, 3:], axis=1) == 0].index
                        if not idx_bad.empty:
                            df_src_spectra_allclust = df_src_spectra_allclust.drop(idx_bad)
                            df_tgt_spectra_allclust = df_tgt_spectra_allclust.drop(idx_bad)

                    # ensure source and target spectra do not contain nodata values (would affect classifiers)
                    assert src_cube.data.nodata is None or src_cube.data.nodata not in df_src_spectra_allclust.values
                    assert tgt_cube.data.nodata is None or tgt_cube.data.nodata not in df_tgt_spectra_allclust.values

                    for clusterlabel in range(n_clusters):
                        self.logger.debug('Creating %s classifier for cluster %s...' % (method, clusterlabel))

                        if method == 'RFR':
                            # RFR: draw a fixed-size bootstrap sample instead of per-cluster filtering
                            df_src_spectra_best = df_src_spectra_allclust.sample(10000, random_state=0, replace=True)
                            df_tgt_spectra_best = df_tgt_spectra_allclust.sample(10000, random_state=0, replace=True)
                        else:
                            if n_clusters == 1:
                                # NOTE(review): this rebinds the method parameters; once hit, the relaxed '100%'
                                #               thresholds persist for all subsequent loop iterations — confirm intended
                                max_distance, max_angle = '100%', '100%'

                            df_src_spectra_best, df_tgt_spectra_best = \
                                self._extract_best_spectra_from_cluster(
                                    clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust,
                                    max_distance=max_distance, max_angle=max_angle)

                        # Set train and test variables for the classifier
                        # (skip the 3 metadata columns; keep only the spectral band values)
                        src_spectra_curlabel = df_src_spectra_best.values[:, 3:]
                        tgt_spectra_curlabel = df_tgt_spectra_best.values[:, 3:]

                        train_src, test_src, train_tgt, test_tgt = \
                            train_test_split(src_spectra_curlabel, tgt_spectra_curlabel,
                                             test_size=0.4, shuffle=True, random_state=0)

                        # train the learner and add metadata
                        ML = self.train_machine_learner(train_src, train_tgt, test_src, test_tgt, method, **kwargs)
                        # noinspection PyTypeChecker
                        ML = self._add_metadata_to_machine_learner(
                            ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths,
                            train_src, train_tgt, n_clusters, clusterlabel)

                        # append to classifier collection
                        # nesting: srcLBA -> (tgt_sat, tgt_sen) -> tgtLBA -> clusterlabel -> trained learner
                        cls_collection[
                            '__'.join(src_LBA)][tgt_cube.satellite, tgt_cube.sensor][
                            '__'.join(tgt_LBA)][clusterlabel] = ML

            # dump to disk
            fName_cls = get_filename_classifier_collection(method, src_cube.satellite, src_cube.sensor,
                                                           n_clusters=n_clusters, **kwargs)

            with open(os.path.join(outDir, fName_cls), 'wb') as outF:
                dill.dump(cls_collection.to_dict(), outF,
                          protocol=dill.HIGHEST_PROTOCOL - 1)  # ensures some downwards compatibility
689 def _get_cluster_labels_for_source_refcube(self, src_cube, n_clusters, CPUs, sam_classassignment=False,
690 return_spectral_distances=False, return_spectral_angles=False):
691 """Get cluster labels for each source cube separately.
693 NOTE: - We use the GMS L1C bands (without atmospheric bands and PAN-band) for clustering.
694 - clustering is only performed on the source cube because the source sensor spectral information
695 is later used to assign the correct homogenization classifier to each pixel
697 :param src_cube:
698 :param n_clusters:
699 :param sam_classassignment: False: use minimal euclidian distance to assign classes to cluster centers
700 True: use the minimal spectral angle to assign classes to cluster centers
701 :param CPUs:
702 :param return_spectral_distances:
703 :return:
704 """
705 self.logger.info('Clustering %s %s reference cube (%s clusters)...'
706 % (src_cube.satellite, src_cube.sensor, n_clusters))
707 LBA2clust = get_LayerBandsAssignment(src_cube.satellite, src_cube.sensor,
708 no_pan=True, no_thermal=True, sort_by_cwl=True, after_ac=True)
709 src_data2clust = src_cube.get_band_combination(LBA2clust)
710 km = KMeansRSImage(src_data2clust, n_clusters=n_clusters, sam_classassignment=sam_classassignment, CPUs=CPUs)
711 km.compute_clusters()
713 return_vals = [km.labels_with_nodata]
715 if return_spectral_distances:
716 return_vals.append(km.spectral_distances_with_nodata)
717 if return_spectral_angles:
718 return_vals.append(km.spectral_angles_with_nodata)
720 return tuple(return_vals)
722 def _extract_best_spectra_from_cluster(self, clusterlabel, df_src_spectra_allclust, df_tgt_spectra_allclust,
723 max_distance, max_angle):
724 # NOTE: We exclude the noisy spectra with the largest spectral distances to their cluster
725 # center here (random spectra from within the upper 40 %)
726 assert len(df_src_spectra_allclust.index) == len(df_tgt_spectra_allclust.index), \
727 'Source and target spectra dataframes must have the same number of spectral samples.'
729 df_src_spectra = df_src_spectra_allclust[df_src_spectra_allclust.cluster_label == clusterlabel]
731 # max_dist = np.percentile(df_src_spectra.spectral_distance, max_distance)
732 # max_angle = np.percentile(df_src_spectra.spectral_angle, max_angle_percent)
734 if isinstance(max_angle, str):
735 max_angle = np.percentile(df_src_spectra.spectral_angle,
736 int(max_angle.split('%')[0].strip()))
738 tmp = df_src_spectra[df_src_spectra.spectral_angle <= max_angle]
739 if len(tmp.index) > 10:
740 df_src_spectra = tmp
741 else:
742 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(10)
743 self.logger.warning('Had to choose spectra with SA up to %.1f degrees for cluster #%s.'
744 % (np.max(df_src_spectra['spectral_angle']), clusterlabel))
746 if isinstance(max_distance, str):
747 max_distance = np.percentile(df_src_spectra.spectral_distance,
748 int(max_distance.split('%')[0].strip()))
750 tmp = df_src_spectra[df_src_spectra.spectral_distance <= max_distance]
751 if len(tmp.index) > 10:
752 df_src_spectra = tmp
753 else:
754 df_src_spectra = df_src_spectra.sort_values(by='spectral_distance').head(10)
755 self.logger.warning('Had to choose spectra with SD up to %.1f for cluster #%s.'
756 % (np.max(df_src_spectra['spectral_distance']), clusterlabel))
758 if len(df_src_spectra.index) > 1700:
759 df_src_spectra = df_src_spectra.sort_values(by='spectral_angle').head(1700)
761 df_tgt_spectra = df_tgt_spectra_allclust.loc[df_src_spectra.index, :]
763 return df_src_spectra, df_tgt_spectra
765 @staticmethod
766 def _add_metadata_to_machine_learner(ML, src_cube, tgt_cube, src_LBA, tgt_LBA, src_wavelengths, tgt_wavelengths,
767 src_train_spectra, tgt_train_spectra, n_clusters, clusterlabel):
768 # compute cluster center for source spectra (only on those spectra used for model training)
769 cluster_center = np.mean(src_train_spectra, axis=0)
770 cluster_median = np.median(src_train_spectra, axis=0)
771 sample_spectra = DataFrame(src_train_spectra).sample(100, replace=True, random_state=20).values
773 ML.src_satellite = src_cube.satellite
774 ML.tgt_satellite = tgt_cube.satellite
775 ML.src_sensor = src_cube.sensor
776 ML.tgt_sensor = tgt_cube.sensor
777 ML.src_LBA = src_LBA
778 ML.tgt_LBA = tgt_LBA
779 ML.src_n_bands = len(ML.src_LBA)
780 ML.tgt_n_bands = len(ML.tgt_LBA)
781 ML.src_wavelengths = list(np.array(src_wavelengths).astype(np.float32))
782 ML.tgt_wavelengths = list(np.array(tgt_wavelengths).astype(np.float32))
783 ML.n_clusters = n_clusters
784 ML.clusterlabel = clusterlabel
785 ML.cluster_center = cluster_center.astype(src_cube.data.dtype)
786 ML.cluster_median = cluster_median.astype(src_cube.data.dtype)
787 ML.cluster_sample_spectra = sample_spectra.astype(np.int16) # scaled between 0 and 10000
788 ML.tgt_cluster_center = np.mean(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype)
789 ML.tgt_cluster_median = np.median(tgt_train_spectra, axis=0).astype(tgt_cube.data.dtype)
791 assert len(ML.src_LBA) == len(ML.src_wavelengths)
792 assert len(ML.tgt_LBA) == len(ML.tgt_wavelengths)
794 return ML
def get_machine_learner(method='LR', **init_params):
    # type: (str, dict) -> Union[LinearRegression, Ridge, Pipeline]
    """Get an instance of a machine learner.

    :param method:      'LR':  Linear Regression
                        'RR':  Ridge Regression
                        'QR':  Quadratic Regression
                        'RFR': Random Forest Regression (50 trees)
    :param init_params: parameters to be passed to __init__() function of the returned machine learner model.
    :raises ValueError: if the given method code is unknown
    """
    # sklearn is imported lazily to keep module import time low
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures

    # dispatch table of learner factories; evaluated lazily so only the requested model is built
    factories = {
        'LR': lambda: LinearRegression(**init_params),
        'RR': lambda: Ridge(**init_params),
        'QR': lambda: make_pipeline(PolynomialFeatures(degree=2), LinearRegression(**init_params)),
        'RFR': lambda: RandomForestRegressor(**init_params),
    }

    if method not in factories:
        raise ValueError("Unknown machine learner method code '%s'." % method)

    return factories[method]()
def get_filename_classifier_collection(method, src_satellite, src_sensor, n_clusters=1, **cls_kwinit):
    """Return the output filename of a classifier collection dill file for the given parameters."""
    # encode method-specific hyperparameters into the method tag
    if method == 'RR':
        method_tag = method + '_alpha1.0'  # TODO add to config
    elif method == 'RFR':
        assert 'n_estimators' in cls_kwinit
        method_tag = '%s_trees%s' % (method, cls_kwinit['n_estimators'])
    else:
        method_tag = method

    name_parts = ['%s_clust%s' % (method_tag, n_clusters), src_satellite, src_sensor]
    return '__'.join(name_parts) + '.dill'
834# def get_classifier_filename(method, src_satellite, src_sensor, src_LBA_name, tgt_satellite, tgt_sensor, tgt_LBA_name):
835# return '__'.join([method, src_satellite, src_sensor, src_LBA_name]) + \
836# '__to__' + '__'.join([tgt_satellite, tgt_sensor, tgt_LBA_name]) + '.dill'