Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3# spechomo, Spectral homogenization of multispectral satellite data 

4# 

5# Copyright (C) 2019-2021 

6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de) 

7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam, 

8# Germany (https://www.gfz-potsdam.de/) 

9# 

10# This software was developed within the context of the GeoMultiSens project funded 

11# by the German Federal Ministry of Education and Research 

12# (project grant code: 01 IS 14 010 A-C). 

13# 

14# Licensed under the Apache License, Version 2.0 (the "License"); 

15# you may not use this file except in compliance with the License. 

16# You may obtain a copy of the License at 

17# 

18# http://www.apache.org/licenses/LICENSE-2.0 

19# 

20# Please note the following exception: `spechomo` depends on tqdm, which is 

21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files 

22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore". 

23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE. 

24# 

25# Unless required by applicable law or agreed to in writing, software 

26# distributed under the License is distributed on an "AS IS" BASIS, 

27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

28# See the License for the specific language governing permissions and 

29# limitations under the License. 

30 

31from multiprocessing import cpu_count 

32from typing import Union, TYPE_CHECKING # noqa F401 # flake8 issue 

33import os 

34 

35if TYPE_CHECKING: 

36 from sklearn.cluster import KMeans 

37 

38import dill 

39import numpy as np 

40from geoarray import GeoArray 

41from pandas import DataFrame 

42from specclassify import SAM_Classifier, classify_image 

43 

44from .utils import im2spectra 

45 

46 

47class KMeansRSImage(object): 

48 """Class for clustering a given input image by using K-Means algorithm. 

49 

50 NOTE: Based on the nodata value of the input GeoArray those pixels that have nodata values in some bands are 

51 ignored when computing the cluster coefficients. Nodata values would affect clustering result otherwise. 

52 """ 

53 

54 def __init__(self, im, n_clusters, sam_classassignment=False, CPUs=1, v=False): 

55 # type: (GeoArray, int, bool, Union[None, int], bool) -> None 

56 

57 # privates 

58 self._clusters = None 

59 self._clustermap = None 

60 self._goodSpecMask = None 

61 self._spectra = None 

62 self._labels_with_nodata = None 

63 self._spectral_distances = None 

64 self._spectral_distances_with_nodata = None 

65 self._spectral_angles = None 

66 self._spectral_angles_with_nodata = None 

67 

68 self.im = im 

69 self.n_clusters = n_clusters 

70 self.sam_classassignment = sam_classassignment 

71 self.CPUs = CPUs or cpu_count() 

72 self.v = v 

73 

74 @classmethod 

75 def from_disk(cls, path_clf, im): 

76 # type: (str, GeoArray) -> KMeansRSImage 

77 """Get an instance of KMeansRSImage from a previously saved classifier. 

78 

79 :param path_clf: path of serialzed classifier (dill file) 

80 :param im: path of the image cube belonging to that classifier 

81 :return: KMeansRSImage 

82 """ 

83 with open(path_clf, 'rb') as inF: 

84 undilled = dill.load(inF) # type: dict 

85 

86 KM = KMeansRSImage(im, 

87 undilled['clusters'].n_clusters, 

88 undilled['clusters'].n_jobs, 

89 undilled['clusters'].verbose) 

90 KM._clusters = undilled['clusters'] 

91 KM._goodSpecMask = undilled['_goodSpecMask'] 

92 KM._spectral_distances = undilled['_spectral_distances'] 

93 

94 return KM 

95 

96 @property 

97 def goodSpecMask(self): 

98 if self._goodSpecMask is None: 

99 if self.im.nodata is not None: 

100 mask_nodata = im2spectra(self.im) == self.im.nodata 

101 goodSpecMask = np.all(~mask_nodata, axis=1) 

102 

103 if True not in goodSpecMask: 

104 raise RuntimeError('All spectra contain no data values in one or multiple bands and are therefore ' 

105 'not usable for clustering. Clustering failed.') 

106 

107 self._goodSpecMask = goodSpecMask 

108 else: 

109 self._goodSpecMask = np.ones((self.im.rows * self.im.cols), dtype=bool) 

110 

111 return self._goodSpecMask 

112 

113 @property 

114 def spectra(self): 

115 """Get spectra used for clustering (excluding spectra containing nodata values that would affect clustering).""" 

116 if self._spectra is None: 

117 self._spectra = im2spectra(self.im)[self.goodSpecMask, :] 

118 return self._spectra 

119 

120 @property 

121 def n_spectra(self): 

122 """Get number of spectra used for clustering (excluding spectra containing nodata values).""" 

123 return int(np.sum(self.goodSpecMask)) 

124 

125 @property 

126 def clusters(self): 

127 # type: () -> KMeans 

128 if not self._clusters: 

129 self._clusters = self.compute_clusters() 

130 return self._clusters 

131 

132 @clusters.setter 

133 def clusters(self, clusters): 

134 self._clusters = clusters 

135 

136 @property 

137 def clustermap(self): 

138 if self._clustermap is None: 

139 self._clustermap = self.labels_with_nodata.reshape((self.im.rows, self.im.cols)) 

140 

141 return self._clustermap 

142 

143 def compute_clusters(self, nmax_spectra=100000): 

144 """Compute the cluster means and labels. 

145 

146 :param nmax_spectra: maximum number of spectra to be included (pseudo-randomly selected (reproducable)) 

147 :return: 

148 """ 

149 from sklearn.cluster import KMeans # avoids static TLS ImportError here 

150 from sklearn import __version__ as skver 

151 

152 _old_environ = dict(os.environ) 

153 

154 try: 

155 kwargs_kmeans = dict(n_clusters=self.n_clusters, random_state=0, verbose=self.v) 

156 # scikit-learn>0.23 uses all cores by default; number is adjustable via OMP_NUM_THREADS 

157 if float('%s.%s' % tuple(skver.split('.')[:2])) < 0.23: 

158 kwargs_kmeans['n_jobs'] = self.CPUs 

159 else: 

160 os.environ['OMP_NUM_THREADS '] = str(self.CPUs) 

161 

162 # data reduction in case we have too many spectra 

163 if self.spectra.shape[0] > nmax_spectra: 

164 if self.v: 

165 print('Reducing data...') 

166 idxs_specIncl = np.random.RandomState(seed=0).choice(range(self.n_spectra), nmax_spectra) 

167 idxs_specNotIncl = np.array(range(self.n_spectra))[~np.in1d(range(self.n_spectra), idxs_specIncl)] 

168 spectra_incl = self.spectra[idxs_specIncl, :] 

169 spectra_notIncl = self.spectra[idxs_specNotIncl, :] 

170 

171 if self.v: 

172 print('Fitting KMeans...') 

173 kmeans = KMeans(**kwargs_kmeans) 

174 distmatrix_incl = kmeans.fit_transform(spectra_incl) 

175 

176 if self.v: 

177 print('Computing full resolution labels...') 

178 labels = np.zeros((self.n_spectra,), dtype=kmeans.labels_.dtype) 

179 distances = np.zeros((self.n_spectra,), dtype=distmatrix_incl.dtype) 

180 

181 labels[idxs_specIncl] = kmeans.labels_ 

182 distances[idxs_specIncl] = np.min(distmatrix_incl, axis=1) 

183 

184 if self.sam_classassignment: 

185 # override cluster labels with labels computed via SAM (distances have be recomputed then) 

186 print('Using SAM class assignment.') 

187 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs) 

188 im_sam_labels = SC.classify(self.im) 

189 sam_labels = im_sam_labels.flatten()[self.goodSpecMask] 

190 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask] 

191 

192 # update distances at those positions where SAM assigns different class labels 

193 distsPos2update = labels != sam_labels 

194 distsPos2update[idxs_specNotIncl] = True 

195 distances[distsPos2update] = \ 

196 self.compute_euclidian_distance_for_labelled_spectra( 

197 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_) 

198 

199 kmeans.labels_ = sam_labels 

200 

201 else: 

202 distmatrix_specNotIncl = kmeans.transform(spectra_notIncl) 

203 labels[idxs_specNotIncl] = np.argmin(distmatrix_specNotIncl, axis=1) 

204 distances[idxs_specNotIncl] = np.min(distmatrix_specNotIncl, axis=1) 

205 

206 kmeans.labels_ = labels 

207 

208 else: 

209 if self.v: 

210 print('Fitting KMeans...') 

211 kmeans = KMeans(**kwargs_kmeans) 

212 

213 distmatrix = kmeans.fit_transform(self.spectra) 

214 kmeans_labels = kmeans.labels_ 

215 distances = np.min(distmatrix, axis=1) 

216 

217 if self.sam_classassignment: 

218 # override cluster labels with labels computed via SAM (distances have be recomputed then) 

219 print('Using SAM class assignment.') 

220 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs) 

221 im_sam_labels = SC.classify(self.im) 

222 sam_labels = im_sam_labels.flatten()[self.goodSpecMask] 

223 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask] 

224 

225 # update distances at those positions where SAM assigns different class labels 

226 distsPos2update = kmeans_labels != sam_labels 

227 distances[distsPos2update] = \ 

228 self.compute_euclidian_distance_for_labelled_spectra( 

229 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_) 

230 finally: 

231 os.environ.clear() 

232 os.environ.update(_old_environ) 

233 

234 self.clusters = kmeans 

235 self._spectral_distances = distances 

236 

237 return self.clusters 

238 

239 def apply_clusters(self, image): 

240 labels = self.clusters.predict(im2spectra(GeoArray(image))) 

241 return labels 

242 

243 def compute_spectral_distances(self): 

244 self._spectral_distances = np.min(self.clusters.transform(self.spectra), axis=1) 

245 return self.spectral_distances 

246 

247 @staticmethod 

248 def compute_euclidian_distance_2D(spectra, endmembers): 

249 n_samples, n_features = endmembers.shape 

250 

251 if not spectra.shape[1] == endmembers.shape[1]: 

252 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers ' 

253 'have %d.' % (spectra.shape[1], endmembers.shape[1])) 

254 

255 dists = np.zeros((spectra.shape[0], n_samples), np.float32) 

256 

257 # loop over all endmember spectra and compute spectral angle for each input spectrum 

258 for n_sample in range(n_samples): 

259 train_spectrum = endmembers[n_sample, :].astype(float) 

260 diff = spectra - train_spectrum 

261 dists[:, n_sample] = np.sqrt((diff ** 2).sum(axis=1)) 

262 

263 return dists 

264 

265 @staticmethod 

266 def compute_euclidian_distance_for_labelled_spectra(spectra, labels, endmembers): 

267 if not spectra.shape[1] == endmembers.shape[1]: 

268 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers ' 

269 'have %d.' % (spectra.shape[1], endmembers.shape[1])) 

270 

271 dists = np.zeros((spectra.shape[0]), np.float32) 

272 

273 # loop over all endmember spectra and compute spectral angle for each input spectrum 

274 for lbl in np.unique(labels): 

275 train_spectrum = endmembers[lbl, :].astype(float) 

276 mask_curlbl = labels == lbl 

277 spectra_curlbl = spectra[mask_curlbl, :] 

278 diff_curlbl = spectra_curlbl - train_spectrum 

279 dists[mask_curlbl] = np.sqrt((diff_curlbl ** 2).sum(axis=1)) 

280 

281 return dists 

282 

283 def compute_spectral_angles(self): 

284 spectral_angles = classify_image(self.im, self.clusters.cluster_centers_, list(range(self.n_clusters)), 

285 'SAM', in_nodataVal=self.im.nodata, cmap_nodataVal=-9999, 

286 tiledims=(400, 400), CPUs=self.CPUs, return_distance=True, 

287 unclassified_pixVal=-1)[1] 

288 

289 self._spectral_angles = spectral_angles.flatten()[self.goodSpecMask] 

290 return self.spectral_angles 

291 

292 @property 

293 def labels(self): 

294 """Get labels for all clustered spectra (excluding spectra that contain nodata values).""" 

295 return self.clusters.labels_ 

296 

297 @property 

298 def labels_with_nodata(self): 

299 """Get the labels for all pixels (including those containing nodata values).""" 

300 if self._labels_with_nodata is None: 

301 if self.n_spectra == (self.im.rows * self.im.cols): 

302 self._labels_with_nodata = self.clusters.labels_ 

303 else: 

304 labels = np.full_like(self.goodSpecMask, -9999, dtype=self.clusters.labels_.dtype) 

305 labels[self.goodSpecMask] = self.clusters.labels_ 

306 self._labels_with_nodata = labels 

307 

308 return self._labels_with_nodata 

309 

310 @property 

311 def spectral_distances(self): 

312 """Get spectral distances for all pixels that don't contain nodata values.""" 

313 if self._spectral_distances is None: 

314 self._spectral_distances = self.compute_spectral_distances() 

315 

316 return self._spectral_distances 

317 

318 @property 

319 def spectral_distances_with_nodata(self): 

320 if self._clusters is not None and self._spectral_distances_with_nodata is None: 

321 if self.n_spectra == (self.im.rows * self.im.cols): 

322 self._spectral_distances_with_nodata = self.spectral_distances 

323 else: 

324 dists = np.full_like(self.goodSpecMask, np.nan, dtype=float) 

325 dists[self.goodSpecMask] = self.spectral_distances 

326 self._spectral_distances_with_nodata = dists 

327 

328 return self._spectral_distances_with_nodata 

329 

330 @property 

331 def spectral_angles(self): 

332 """Get spectral angles in degrees for all pixels that don't contain nodata values.""" 

333 if self._spectral_angles is None: 

334 self._spectral_angles = self.compute_spectral_angles() 

335 

336 return self._spectral_angles 

337 

338 @property 

339 def spectral_angles_with_nodata(self): 

340 if self._clusters is not None and self._spectral_angles_with_nodata is None: 

341 if self.n_spectra == (self.im.rows * self.im.cols): 

342 self._spectral_angles_with_nodata = self.spectral_angles 

343 else: 

344 angles = np.full_like(self.goodSpecMask, np.nan, dtype=float) 

345 angles[self.goodSpecMask] = self.spectral_angles 

346 self._spectral_angles_with_nodata = angles 

347 

348 return self._spectral_angles_with_nodata 

349 

350 def dump(self, path_out): 

351 with open(path_out, 'wb') as outF: 

352 dill.dump(dict( 

353 clusters=self.clusters, 

354 _goodSpecMask=self._goodSpecMask, 

355 _spectral_distances=self._spectral_distances), 

356 outF) 

357 

358 def save_clustermap(self, path_out, **kw_save): 

359 GeoArray(self.clustermap, geotransform=self.im.gt, projection=self.im.prj, nodata=-9999)\ 

360 .save(path_out, **kw_save) 

361 

362 def plot_cluster_centers(self, figsize=(15, 5)): 

363 # type: (tuple) -> None 

364 """Show a plot of the cluster center signatures. 

365 

366 :param figsize: figure size (inches) 

367 """ 

368 from matplotlib import pyplot as plt 

369 

370 plt.figure(figsize=figsize) 

371 for i, center_signature in enumerate(self.clusters.cluster_centers_): 

372 plt.plot(range(1, self.im.bands + 1), center_signature, label='Cluster #%s' % (i + 1)) 

373 

374 plt.title('KMeans cluster centers for %s clusters' % self.n_clusters) 

375 plt.xlabel('Spectral band') 

376 plt.ylabel('Pixel values') 

377 plt.legend(loc='upper right') 

378 

379 plt.show() 

380 

381 def plot_cluster_histogram(self, figsize=(15, 5)): 

382 # type: (tuple) -> None 

383 """Show a histogram indicating the proportion of each cluster label in percent. 

384 

385 :param figsize: figure size (inches) 

386 """ 

387 from matplotlib import pyplot as plt 

388 

389 # grab the number of different clusters and create a histogram 

390 # based on the number of pixels assigned to each cluster 

391 numLabels = np.arange(0, len(np.unique(self.clusters.labels_)) + 1) 

392 hist, bins = np.histogram(self.clusters.labels_, bins=numLabels) 

393 

394 # normalize the histogram, such that it sums to 100 

395 hist = hist.astype("float") 

396 hist /= hist.sum() / 100 

397 

398 # plot the histogram as bar plot 

399 plt.figure(figsize=figsize) 

400 

401 plt.bar(bins[:-1], hist, width=1) 

402 

403 plt.title('Proportion of cluster labels (%s clusters)' % self.n_clusters) 

404 plt.xlabel('# Cluster') 

405 plt.ylabel('Proportion [%]') 

406 

407 plt.show() 

408 

409 def plot_clustermap(self, figsize=None): 

410 # type: (tuple) -> None 

411 """Show a the clustered image. 

412 

413 :param figsize: figure size (inches) 

414 """ 

415 from matplotlib import pyplot as plt 

416 

417 plt.figure(figsize=figsize) 

418 rows, cols = self.clustermap.shape[:2] 

419 

420 image2plot = self.clustermap if self.im.nodata is None else np.ma.masked_equal(self.clustermap, self.im.nodata) 

421 

422 plt.imshow(image2plot, plt.get_cmap('prism'), interpolation='none', extent=(0, cols, rows, 0)) 

423 plt.show() 

424 

425 def get_random_spectra_from_each_cluster(self, samplesize=50, max_distance=None, max_angle=None, 

426 nmin_unique_spectra=50): 

427 # type: (int, Union[int, float, str], Union[int, float, str], int) -> dict 

428 """Return a given number of spectra randomly selected within each cluster. 

429 

430 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on. 

431 

432 :param samplesize: number of spectra to be randomly selected from each cluster 

433 :param max_distance: spectra with a larger spectral distance than the given value will be excluded from 

434 random sampling. 

435 - if given as string like '20%', the maximum spectral distance is computed as 20% 

436 percentile within each cluster 

437 :param max_angle: spectra with a larger spectral angle than the given value will be excluded from 

438 random sampling. 

439 - if given as string like '20%', the maximum spectral angle is computed as 20% 

440 percentile within each cluster 

441 :param nmin_unique_spectra: in case a cluster has less than the given number, do not use its spectra 

442 (return missing values) 

443 :return: 

444 """ 

445 # get DataFrame with columns [cluster_label, B1, B2, B3, ...] 

446 # NOTE: spectra with nodata values are already excluded here in the self.spectra getter 

447 # which uses self.goodSpecMask (excludes spectra with nodata in ANY band) 

448 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], ) 

449 df.insert(0, 'cluster_label', self.clusters.labels_) 

450 

451 if max_angle is not None: 

452 if not (isinstance(max_angle, (int, float)) and max_angle > 0) and \ 

453 not (isinstance(max_angle, str) and max_angle.endswith('%')): 

454 raise ValueError(max_angle) 

455 if isinstance(max_angle, str): 

456 max_angle = np.percentile(self.spectral_angles, float(max_angle.split('%')[0].strip())) 

457 df.insert(1, 'spectral_angle', self.spectral_angles) 

458 

459 if max_distance is not None: 

460 if not (isinstance(max_distance, (int, float)) and 0 < max_distance < 100) and \ 

461 not (isinstance(max_distance, str) and max_distance.endswith('%')): 

462 raise ValueError(max_distance) 

463 if isinstance(max_distance, str): 

464 max_distance = np.percentile(self.spectral_distances, float(max_distance.split('%')[0].strip())) 

465 df.insert(1, 'spectral_distance', self.spectral_distances) 

466 

467 # get random sample from each cluster and generate a dict like {cluster_label: random_sample} 

468 # NOTE: nodata label is skipped 

469 random_samples = dict() 

470 for label in range(self.n_clusters): 

471 if max_angle is None and max_distance is None: 

472 cluster_subset = df[df.cluster_label == label].loc[:, 'B1':] 

473 else: 

474 cluster_subset = df[df.cluster_label == label] 

475 

476 # if self.n_clusters > 1: 

477 # filter by spectral angle 

478 if len(cluster_subset.index) >= nmin_unique_spectra and max_angle is not None: 

479 cluster_subset = cluster_subset[cluster_subset.spectral_angle < max_angle] 

480 

481 # filter by spectral distance 

482 if len(cluster_subset.index) >= nmin_unique_spectra and max_distance is not None: 

483 cluster_subset = cluster_subset[cluster_subset.spectral_distance < max_distance] 

484 

485 cluster_subset = cluster_subset.loc[:, 'B1':] 

486 

487 # handle clusters with less than nmin_unique_spectra in there 

488 if len(cluster_subset.index) < nmin_unique_spectra: 

489 if len(cluster_subset.index) > 0: 

490 # don't use the cluster (return nodata) 

491 cluster_subset[:] = -9999 

492 

493 else: 

494 # cluster_subset is empty after filtering -> return nodata 

495 cluster_subset.loc[0] = [-9999] * len(cluster_subset.columns) 

496 

497 # get random sample while filling it with duplicates of the same sample when cluster has not enough spectra 

498 random_samples[label] = np.array(cluster_subset.sample(samplesize, replace=True, random_state=20)) 

499 

500 return random_samples 

501 

502 def get_purest_spectra_from_each_cluster(self, samplesize=50): 

503 # type: (int) -> dict 

504 """Return a given number of spectra directly surrounding the center of each cluster. 

505 

506 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on. 

507 

508 :param samplesize: number of spectra to be selected from each cluster 

509 :return: 

510 """ 

511 # get DataFrame with columns [cluster_label, B1, B2, B3, ...] 

512 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], ) 

513 df.insert(0, 'cluster_label', self.clusters.labels_) 

514 df.insert(1, 'spectral_distance', self.spectral_distances) 

515 

516 # get random sample from each cluster and generate a dict like {cluster_label: random_sample} 

517 random_samples = dict() 

518 for label in range(self.n_clusters): 

519 cluster_subset = df[df.cluster_label == label].loc[:, 'spectral_distance':] 

520 

521 # catch the case that the cluster does not contain enough spectra (duplicate existing ones) 

522 if len(cluster_subset.index) < samplesize: 

523 cluster_subset = cluster_subset.sample(samplesize, replace=True, random_state=20) 

524 

525 cluster_subset_sorted = cluster_subset.sort_values(by=['spectral_distance'], ascending=True) 

526 random_samples[label] = np.array(cluster_subset_sorted.loc[:, 'B1':][:samplesize]) 

527 

528 return random_samples