Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
3# spechomo, Spectral homogenization of multispectral satellite data
4#
5# Copyright (C) 2019-2021
6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de)
7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam,
8# Germany (https://www.gfz-potsdam.de/)
9#
10# This software was developed within the context of the GeoMultiSens project funded
11# by the German Federal Ministry of Education and Research
12# (project grant code: 01 IS 14 010 A-C).
13#
14# Licensed under the Apache License, Version 2.0 (the "License");
15# you may not use this file except in compliance with the License.
16# You may obtain a copy of the License at
17#
18# http://www.apache.org/licenses/LICENSE-2.0
19#
20# Please note the following exception: `spechomo` depends on tqdm, which is
21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files
22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore".
23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE.
24#
25# Unless required by applicable law or agreed to in writing, software
26# distributed under the License is distributed on an "AS IS" BASIS,
27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28# See the License for the specific language governing permissions and
29# limitations under the License.
31from multiprocessing import cpu_count
32from typing import Union, TYPE_CHECKING # noqa F401 # flake8 issue
33import os
35if TYPE_CHECKING:
36 from sklearn.cluster import KMeans
38import dill
39import numpy as np
40from geoarray import GeoArray
41from pandas import DataFrame
42from specclassify import SAM_Classifier, classify_image
44from .utils import im2spectra
47class KMeansRSImage(object):
48 """Class for clustering a given input image by using K-Means algorithm.
50 NOTE: Based on the nodata value of the input GeoArray those pixels that have nodata values in some bands are
51 ignored when computing the cluster coefficients. Nodata values would affect clustering result otherwise.
52 """
54 def __init__(self, im, n_clusters, sam_classassignment=False, CPUs=1, v=False):
55 # type: (GeoArray, int, bool, Union[None, int], bool) -> None
57 # privates
58 self._clusters = None
59 self._clustermap = None
60 self._goodSpecMask = None
61 self._spectra = None
62 self._labels_with_nodata = None
63 self._spectral_distances = None
64 self._spectral_distances_with_nodata = None
65 self._spectral_angles = None
66 self._spectral_angles_with_nodata = None
68 self.im = im
69 self.n_clusters = n_clusters
70 self.sam_classassignment = sam_classassignment
71 self.CPUs = CPUs or cpu_count()
72 self.v = v
74 @classmethod
75 def from_disk(cls, path_clf, im):
76 # type: (str, GeoArray) -> KMeansRSImage
77 """Get an instance of KMeansRSImage from a previously saved classifier.
79 :param path_clf: path of serialzed classifier (dill file)
80 :param im: path of the image cube belonging to that classifier
81 :return: KMeansRSImage
82 """
83 with open(path_clf, 'rb') as inF:
84 undilled = dill.load(inF) # type: dict
86 KM = KMeansRSImage(im,
87 undilled['clusters'].n_clusters,
88 undilled['clusters'].n_jobs,
89 undilled['clusters'].verbose)
90 KM._clusters = undilled['clusters']
91 KM._goodSpecMask = undilled['_goodSpecMask']
92 KM._spectral_distances = undilled['_spectral_distances']
94 return KM
96 @property
97 def goodSpecMask(self):
98 if self._goodSpecMask is None:
99 if self.im.nodata is not None:
100 mask_nodata = im2spectra(self.im) == self.im.nodata
101 goodSpecMask = np.all(~mask_nodata, axis=1)
103 if True not in goodSpecMask:
104 raise RuntimeError('All spectra contain no data values in one or multiple bands and are therefore '
105 'not usable for clustering. Clustering failed.')
107 self._goodSpecMask = goodSpecMask
108 else:
109 self._goodSpecMask = np.ones((self.im.rows * self.im.cols), dtype=bool)
111 return self._goodSpecMask
113 @property
114 def spectra(self):
115 """Get spectra used for clustering (excluding spectra containing nodata values that would affect clustering)."""
116 if self._spectra is None:
117 self._spectra = im2spectra(self.im)[self.goodSpecMask, :]
118 return self._spectra
120 @property
121 def n_spectra(self):
122 """Get number of spectra used for clustering (excluding spectra containing nodata values)."""
123 return int(np.sum(self.goodSpecMask))
125 @property
126 def clusters(self):
127 # type: () -> KMeans
128 if not self._clusters:
129 self._clusters = self.compute_clusters()
130 return self._clusters
132 @clusters.setter
133 def clusters(self, clusters):
134 self._clusters = clusters
136 @property
137 def clustermap(self):
138 if self._clustermap is None:
139 self._clustermap = self.labels_with_nodata.reshape((self.im.rows, self.im.cols))
141 return self._clustermap
143 def compute_clusters(self, nmax_spectra=100000):
144 """Compute the cluster means and labels.
146 :param nmax_spectra: maximum number of spectra to be included (pseudo-randomly selected (reproducable))
147 :return:
148 """
149 from sklearn.cluster import KMeans # avoids static TLS ImportError here
150 from sklearn import __version__ as skver
152 _old_environ = dict(os.environ)
154 try:
155 kwargs_kmeans = dict(n_clusters=self.n_clusters, random_state=0, verbose=self.v)
156 # scikit-learn>0.23 uses all cores by default; number is adjustable via OMP_NUM_THREADS
157 if float('%s.%s' % tuple(skver.split('.')[:2])) < 0.23:
158 kwargs_kmeans['n_jobs'] = self.CPUs
159 else:
160 os.environ['OMP_NUM_THREADS '] = str(self.CPUs)
162 # data reduction in case we have too many spectra
163 if self.spectra.shape[0] > nmax_spectra:
164 if self.v:
165 print('Reducing data...')
166 idxs_specIncl = np.random.RandomState(seed=0).choice(range(self.n_spectra), nmax_spectra)
167 idxs_specNotIncl = np.array(range(self.n_spectra))[~np.in1d(range(self.n_spectra), idxs_specIncl)]
168 spectra_incl = self.spectra[idxs_specIncl, :]
169 spectra_notIncl = self.spectra[idxs_specNotIncl, :]
171 if self.v:
172 print('Fitting KMeans...')
173 kmeans = KMeans(**kwargs_kmeans)
174 distmatrix_incl = kmeans.fit_transform(spectra_incl)
176 if self.v:
177 print('Computing full resolution labels...')
178 labels = np.zeros((self.n_spectra,), dtype=kmeans.labels_.dtype)
179 distances = np.zeros((self.n_spectra,), dtype=distmatrix_incl.dtype)
181 labels[idxs_specIncl] = kmeans.labels_
182 distances[idxs_specIncl] = np.min(distmatrix_incl, axis=1)
184 if self.sam_classassignment:
185 # override cluster labels with labels computed via SAM (distances have be recomputed then)
186 print('Using SAM class assignment.')
187 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs)
188 im_sam_labels = SC.classify(self.im)
189 sam_labels = im_sam_labels.flatten()[self.goodSpecMask]
190 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask]
192 # update distances at those positions where SAM assigns different class labels
193 distsPos2update = labels != sam_labels
194 distsPos2update[idxs_specNotIncl] = True
195 distances[distsPos2update] = \
196 self.compute_euclidian_distance_for_labelled_spectra(
197 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_)
199 kmeans.labels_ = sam_labels
201 else:
202 distmatrix_specNotIncl = kmeans.transform(spectra_notIncl)
203 labels[idxs_specNotIncl] = np.argmin(distmatrix_specNotIncl, axis=1)
204 distances[idxs_specNotIncl] = np.min(distmatrix_specNotIncl, axis=1)
206 kmeans.labels_ = labels
208 else:
209 if self.v:
210 print('Fitting KMeans...')
211 kmeans = KMeans(**kwargs_kmeans)
213 distmatrix = kmeans.fit_transform(self.spectra)
214 kmeans_labels = kmeans.labels_
215 distances = np.min(distmatrix, axis=1)
217 if self.sam_classassignment:
218 # override cluster labels with labels computed via SAM (distances have be recomputed then)
219 print('Using SAM class assignment.')
220 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs)
221 im_sam_labels = SC.classify(self.im)
222 sam_labels = im_sam_labels.flatten()[self.goodSpecMask]
223 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask]
225 # update distances at those positions where SAM assigns different class labels
226 distsPos2update = kmeans_labels != sam_labels
227 distances[distsPos2update] = \
228 self.compute_euclidian_distance_for_labelled_spectra(
229 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_)
230 finally:
231 os.environ.clear()
232 os.environ.update(_old_environ)
234 self.clusters = kmeans
235 self._spectral_distances = distances
237 return self.clusters
239 def apply_clusters(self, image):
240 labels = self.clusters.predict(im2spectra(GeoArray(image)))
241 return labels
243 def compute_spectral_distances(self):
244 self._spectral_distances = np.min(self.clusters.transform(self.spectra), axis=1)
245 return self.spectral_distances
247 @staticmethod
248 def compute_euclidian_distance_2D(spectra, endmembers):
249 n_samples, n_features = endmembers.shape
251 if not spectra.shape[1] == endmembers.shape[1]:
252 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers '
253 'have %d.' % (spectra.shape[1], endmembers.shape[1]))
255 dists = np.zeros((spectra.shape[0], n_samples), np.float32)
257 # loop over all endmember spectra and compute spectral angle for each input spectrum
258 for n_sample in range(n_samples):
259 train_spectrum = endmembers[n_sample, :].astype(float)
260 diff = spectra - train_spectrum
261 dists[:, n_sample] = np.sqrt((diff ** 2).sum(axis=1))
263 return dists
265 @staticmethod
266 def compute_euclidian_distance_for_labelled_spectra(spectra, labels, endmembers):
267 if not spectra.shape[1] == endmembers.shape[1]:
268 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers '
269 'have %d.' % (spectra.shape[1], endmembers.shape[1]))
271 dists = np.zeros((spectra.shape[0]), np.float32)
273 # loop over all endmember spectra and compute spectral angle for each input spectrum
274 for lbl in np.unique(labels):
275 train_spectrum = endmembers[lbl, :].astype(float)
276 mask_curlbl = labels == lbl
277 spectra_curlbl = spectra[mask_curlbl, :]
278 diff_curlbl = spectra_curlbl - train_spectrum
279 dists[mask_curlbl] = np.sqrt((diff_curlbl ** 2).sum(axis=1))
281 return dists
283 def compute_spectral_angles(self):
284 spectral_angles = classify_image(self.im, self.clusters.cluster_centers_, list(range(self.n_clusters)),
285 'SAM', in_nodataVal=self.im.nodata, cmap_nodataVal=-9999,
286 tiledims=(400, 400), CPUs=self.CPUs, return_distance=True,
287 unclassified_pixVal=-1)[1]
289 self._spectral_angles = spectral_angles.flatten()[self.goodSpecMask]
290 return self.spectral_angles
292 @property
293 def labels(self):
294 """Get labels for all clustered spectra (excluding spectra that contain nodata values)."""
295 return self.clusters.labels_
297 @property
298 def labels_with_nodata(self):
299 """Get the labels for all pixels (including those containing nodata values)."""
300 if self._labels_with_nodata is None:
301 if self.n_spectra == (self.im.rows * self.im.cols):
302 self._labels_with_nodata = self.clusters.labels_
303 else:
304 labels = np.full_like(self.goodSpecMask, -9999, dtype=self.clusters.labels_.dtype)
305 labels[self.goodSpecMask] = self.clusters.labels_
306 self._labels_with_nodata = labels
308 return self._labels_with_nodata
310 @property
311 def spectral_distances(self):
312 """Get spectral distances for all pixels that don't contain nodata values."""
313 if self._spectral_distances is None:
314 self._spectral_distances = self.compute_spectral_distances()
316 return self._spectral_distances
318 @property
319 def spectral_distances_with_nodata(self):
320 if self._clusters is not None and self._spectral_distances_with_nodata is None:
321 if self.n_spectra == (self.im.rows * self.im.cols):
322 self._spectral_distances_with_nodata = self.spectral_distances
323 else:
324 dists = np.full_like(self.goodSpecMask, np.nan, dtype=float)
325 dists[self.goodSpecMask] = self.spectral_distances
326 self._spectral_distances_with_nodata = dists
328 return self._spectral_distances_with_nodata
330 @property
331 def spectral_angles(self):
332 """Get spectral angles in degrees for all pixels that don't contain nodata values."""
333 if self._spectral_angles is None:
334 self._spectral_angles = self.compute_spectral_angles()
336 return self._spectral_angles
338 @property
339 def spectral_angles_with_nodata(self):
340 if self._clusters is not None and self._spectral_angles_with_nodata is None:
341 if self.n_spectra == (self.im.rows * self.im.cols):
342 self._spectral_angles_with_nodata = self.spectral_angles
343 else:
344 angles = np.full_like(self.goodSpecMask, np.nan, dtype=float)
345 angles[self.goodSpecMask] = self.spectral_angles
346 self._spectral_angles_with_nodata = angles
348 return self._spectral_angles_with_nodata
350 def dump(self, path_out):
351 with open(path_out, 'wb') as outF:
352 dill.dump(dict(
353 clusters=self.clusters,
354 _goodSpecMask=self._goodSpecMask,
355 _spectral_distances=self._spectral_distances),
356 outF)
358 def save_clustermap(self, path_out, **kw_save):
359 GeoArray(self.clustermap, geotransform=self.im.gt, projection=self.im.prj, nodata=-9999)\
360 .save(path_out, **kw_save)
362 def plot_cluster_centers(self, figsize=(15, 5)):
363 # type: (tuple) -> None
364 """Show a plot of the cluster center signatures.
366 :param figsize: figure size (inches)
367 """
368 from matplotlib import pyplot as plt
370 plt.figure(figsize=figsize)
371 for i, center_signature in enumerate(self.clusters.cluster_centers_):
372 plt.plot(range(1, self.im.bands + 1), center_signature, label='Cluster #%s' % (i + 1))
374 plt.title('KMeans cluster centers for %s clusters' % self.n_clusters)
375 plt.xlabel('Spectral band')
376 plt.ylabel('Pixel values')
377 plt.legend(loc='upper right')
379 plt.show()
381 def plot_cluster_histogram(self, figsize=(15, 5)):
382 # type: (tuple) -> None
383 """Show a histogram indicating the proportion of each cluster label in percent.
385 :param figsize: figure size (inches)
386 """
387 from matplotlib import pyplot as plt
389 # grab the number of different clusters and create a histogram
390 # based on the number of pixels assigned to each cluster
391 numLabels = np.arange(0, len(np.unique(self.clusters.labels_)) + 1)
392 hist, bins = np.histogram(self.clusters.labels_, bins=numLabels)
394 # normalize the histogram, such that it sums to 100
395 hist = hist.astype("float")
396 hist /= hist.sum() / 100
398 # plot the histogram as bar plot
399 plt.figure(figsize=figsize)
401 plt.bar(bins[:-1], hist, width=1)
403 plt.title('Proportion of cluster labels (%s clusters)' % self.n_clusters)
404 plt.xlabel('# Cluster')
405 plt.ylabel('Proportion [%]')
407 plt.show()
409 def plot_clustermap(self, figsize=None):
410 # type: (tuple) -> None
411 """Show a the clustered image.
413 :param figsize: figure size (inches)
414 """
415 from matplotlib import pyplot as plt
417 plt.figure(figsize=figsize)
418 rows, cols = self.clustermap.shape[:2]
420 image2plot = self.clustermap if self.im.nodata is None else np.ma.masked_equal(self.clustermap, self.im.nodata)
422 plt.imshow(image2plot, plt.get_cmap('prism'), interpolation='none', extent=(0, cols, rows, 0))
423 plt.show()
425 def get_random_spectra_from_each_cluster(self, samplesize=50, max_distance=None, max_angle=None,
426 nmin_unique_spectra=50):
427 # type: (int, Union[int, float, str], Union[int, float, str], int) -> dict
428 """Return a given number of spectra randomly selected within each cluster.
430 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on.
432 :param samplesize: number of spectra to be randomly selected from each cluster
433 :param max_distance: spectra with a larger spectral distance than the given value will be excluded from
434 random sampling.
435 - if given as string like '20%', the maximum spectral distance is computed as 20%
436 percentile within each cluster
437 :param max_angle: spectra with a larger spectral angle than the given value will be excluded from
438 random sampling.
439 - if given as string like '20%', the maximum spectral angle is computed as 20%
440 percentile within each cluster
441 :param nmin_unique_spectra: in case a cluster has less than the given number, do not use its spectra
442 (return missing values)
443 :return:
444 """
445 # get DataFrame with columns [cluster_label, B1, B2, B3, ...]
446 # NOTE: spectra with nodata values are already excluded here in the self.spectra getter
447 # which uses self.goodSpecMask (excludes spectra with nodata in ANY band)
448 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], )
449 df.insert(0, 'cluster_label', self.clusters.labels_)
451 if max_angle is not None:
452 if not (isinstance(max_angle, (int, float)) and max_angle > 0) and \
453 not (isinstance(max_angle, str) and max_angle.endswith('%')):
454 raise ValueError(max_angle)
455 if isinstance(max_angle, str):
456 max_angle = np.percentile(self.spectral_angles, float(max_angle.split('%')[0].strip()))
457 df.insert(1, 'spectral_angle', self.spectral_angles)
459 if max_distance is not None:
460 if not (isinstance(max_distance, (int, float)) and 0 < max_distance < 100) and \
461 not (isinstance(max_distance, str) and max_distance.endswith('%')):
462 raise ValueError(max_distance)
463 if isinstance(max_distance, str):
464 max_distance = np.percentile(self.spectral_distances, float(max_distance.split('%')[0].strip()))
465 df.insert(1, 'spectral_distance', self.spectral_distances)
467 # get random sample from each cluster and generate a dict like {cluster_label: random_sample}
468 # NOTE: nodata label is skipped
469 random_samples = dict()
470 for label in range(self.n_clusters):
471 if max_angle is None and max_distance is None:
472 cluster_subset = df[df.cluster_label == label].loc[:, 'B1':]
473 else:
474 cluster_subset = df[df.cluster_label == label]
476 # if self.n_clusters > 1:
477 # filter by spectral angle
478 if len(cluster_subset.index) >= nmin_unique_spectra and max_angle is not None:
479 cluster_subset = cluster_subset[cluster_subset.spectral_angle < max_angle]
481 # filter by spectral distance
482 if len(cluster_subset.index) >= nmin_unique_spectra and max_distance is not None:
483 cluster_subset = cluster_subset[cluster_subset.spectral_distance < max_distance]
485 cluster_subset = cluster_subset.loc[:, 'B1':]
487 # handle clusters with less than nmin_unique_spectra in there
488 if len(cluster_subset.index) < nmin_unique_spectra:
489 if len(cluster_subset.index) > 0:
490 # don't use the cluster (return nodata)
491 cluster_subset[:] = -9999
493 else:
494 # cluster_subset is empty after filtering -> return nodata
495 cluster_subset.loc[0] = [-9999] * len(cluster_subset.columns)
497 # get random sample while filling it with duplicates of the same sample when cluster has not enough spectra
498 random_samples[label] = np.array(cluster_subset.sample(samplesize, replace=True, random_state=20))
500 return random_samples
502 def get_purest_spectra_from_each_cluster(self, samplesize=50):
503 # type: (int) -> dict
504 """Return a given number of spectra directly surrounding the center of each cluster.
506 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on.
508 :param samplesize: number of spectra to be selected from each cluster
509 :return:
510 """
511 # get DataFrame with columns [cluster_label, B1, B2, B3, ...]
512 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], )
513 df.insert(0, 'cluster_label', self.clusters.labels_)
514 df.insert(1, 'spectral_distance', self.spectral_distances)
516 # get random sample from each cluster and generate a dict like {cluster_label: random_sample}
517 random_samples = dict()
518 for label in range(self.n_clusters):
519 cluster_subset = df[df.cluster_label == label].loc[:, 'spectral_distance':]
521 # catch the case that the cluster does not contain enough spectra (duplicate existing ones)
522 if len(cluster_subset.index) < samplesize:
523 cluster_subset = cluster_subset.sample(samplesize, replace=True, random_state=20)
525 cluster_subset_sorted = cluster_subset.sort_values(by=['spectral_distance'], ascending=True)
526 random_samples[label] = np.array(cluster_subset_sorted.loc[:, 'B1':][:samplesize])
528 return random_samples