Coverage for spechomo/clustering.py: 81%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- coding: utf-8 -*-

3# spechomo, Spectral homogenization of multispectral satellite data

6# - Daniel Scheffler (GFZ Potsdam, daniel.scheffler@gfz-potsdam.de)

7# - Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences Potsdam,

8# Germany (https://www.gfz-potsdam.de/)

10# This software was developed within the context of the GeoMultiSens project funded

11# by the German Federal Ministry of Education and Research

12# (project grant code: 01 IS 14 010 A-C).

13#

14# Licensed under the Apache License, Version 2.0 (the "License");

15# you may not use this file except in compliance with the License.

16# You may obtain a copy of the License at

17#

18# http://www.apache.org/licenses/LICENSE-2.0

19#

20# Please note the following exception: `spechomo` depends on tqdm, which is

21# distributed under the Mozilla Public Licence (MPL) v2.0 except for the files

22# "tqdm/_tqdm.py", "setup.py", "README.rst", "MANIFEST.in" and ".gitignore".

23# Details can be found here: https://github.com/tqdm/tqdm/blob/master/LICENCE.

24#

25# Unless required by applicable law or agreed to in writing, software

26# distributed under the License is distributed on an "AS IS" BASIS,

27# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

28# See the License for the specific language governing permissions and

29# limitations under the License.

31from multiprocessing import cpu_count

32from typing import Union, TYPE_CHECKING # noqa F401 # flake8 issue

33import os

35if TYPE_CHECKING:

36 from sklearn.cluster import KMeans

38import dill

39import numpy as np

40from geoarray import GeoArray

41from pandas import DataFrame

42from specclassify import SAM_Classifier, classify_image

44from .utils import im2spectra

47class KMeansRSImage(object):

48 """Class for clustering a given input image by using K-Means algorithm.

50 NOTE: Based on the nodata value of the input GeoArray those pixels that have nodata values in some bands are

51 ignored when computing the cluster coefficients. Nodata values would affect clustering result otherwise.

52 """

54 def __init__(self, im, n_clusters, sam_classassignment=False, CPUs=1, v=False):

55 # type: (GeoArray, int, bool, Union[None, int], bool) -> None

57 # privates

58 self._clusters = None

59 self._clustermap = None

60 self._goodSpecMask = None

61 self._spectra = None

62 self._labels_with_nodata = None

63 self._spectral_distances = None

64 self._spectral_distances_with_nodata = None

65 self._spectral_angles = None

66 self._spectral_angles_with_nodata = None

68 self.im = im

69 self.n_clusters = n_clusters

70 self.sam_classassignment = sam_classassignment

71 self.CPUs = CPUs or cpu_count()

72 self.v = v

74 @classmethod

75 def from_disk(cls, path_clf, im):

76 # type: (str, GeoArray) -> KMeansRSImage

77 """Get an instance of KMeansRSImage from a previously saved classifier.

79 :param path_clf: path of serialzed classifier (dill file)

80 :param im: path of the image cube belonging to that classifier

81 :return: KMeansRSImage

82 """

83 with open(path_clf, 'rb') as inF:

84 undilled = dill.load(inF) # type: dict

86 KM = KMeansRSImage(im,

87 undilled['clusters'].n_clusters,

88 undilled['clusters'].n_jobs,

89 undilled['clusters'].verbose)

90 KM._clusters = undilled['clusters']

91 KM._goodSpecMask = undilled['_goodSpecMask']

92 KM._spectral_distances = undilled['_spectral_distances']

94 return KM

96 @property

97 def goodSpecMask(self):

98 if self._goodSpecMask is None:

99 if self.im.nodata is not None:

100 mask_nodata = im2spectra(self.im) == self.im.nodata

101 goodSpecMask = np.all(~mask_nodata, axis=1)

102

103 if True not in goodSpecMask:

104 raise RuntimeError('All spectra contain no data values in one or multiple bands and are therefore '

105 'not usable for clustering. Clustering failed.')

106

107 self._goodSpecMask = goodSpecMask

108 else:

109 self._goodSpecMask = np.ones((self.im.rows * self.im.cols), dtype=bool)

110

111 return self._goodSpecMask

112

113 @property

114 def spectra(self):

115 """Get spectra used for clustering (excluding spectra containing nodata values that would affect clustering)."""

116 if self._spectra is None:

117 self._spectra = im2spectra(self.im)[self.goodSpecMask, :]

118 return self._spectra

119

120 @property

121 def n_spectra(self):

122 """Get number of spectra used for clustering (excluding spectra containing nodata values)."""

123 return int(np.sum(self.goodSpecMask))

124

125 @property

126 def clusters(self):

127 # type: () -> KMeans

128 if not self._clusters:

129 self._clusters = self.compute_clusters()

130 return self._clusters

131

132 @clusters.setter

133 def clusters(self, clusters):

134 self._clusters = clusters

135

136 @property

137 def clustermap(self):

138 if self._clustermap is None:

139 self._clustermap = self.labels_with_nodata.reshape((self.im.rows, self.im.cols))

140

141 return self._clustermap

142

143 def compute_clusters(self, nmax_spectra=100000):

144 """Compute the cluster means and labels.

145

146 :param nmax_spectra: maximum number of spectra to be included (pseudo-randomly selected (reproducable))

147 :return:

148 """

149 from sklearn.cluster import KMeans # avoids static TLS ImportError here

150 from sklearn import __version__ as skver

151

152 _old_environ = dict(os.environ)

153

154 try:

155 kwargs_kmeans = dict(n_clusters=self.n_clusters, random_state=0, verbose=self.v)

156 # scikit-learn>0.23 uses all cores by default; number is adjustable via OMP_NUM_THREADS

157 if float('%s.%s' % tuple(skver.split('.')[:2])) < 0.23:

158 kwargs_kmeans['n_jobs'] = self.CPUs

159 else:

160 os.environ['OMP_NUM_THREADS '] = str(self.CPUs)

161

162 # data reduction in case we have too many spectra

163 if self.spectra.shape[0] > nmax_spectra:

164 if self.v:

165 print('Reducing data...')

166 idxs_specIncl = np.random.RandomState(seed=0).choice(range(self.n_spectra), nmax_spectra)

167 idxs_specNotIncl = np.array(range(self.n_spectra))[~np.in1d(range(self.n_spectra), idxs_specIncl)]

168 spectra_incl = self.spectra[idxs_specIncl, :]

169 spectra_notIncl = self.spectra[idxs_specNotIncl, :]

170

171 if self.v:

172 print('Fitting KMeans...')

173 kmeans = KMeans(**kwargs_kmeans)

174 distmatrix_incl = kmeans.fit_transform(spectra_incl)

175

176 if self.v:

177 print('Computing full resolution labels...')

178 labels = np.zeros((self.n_spectra,), dtype=kmeans.labels_.dtype)

179 distances = np.zeros((self.n_spectra,), dtype=distmatrix_incl.dtype)

180

181 labels[idxs_specIncl] = kmeans.labels_

182 distances[idxs_specIncl] = np.min(distmatrix_incl, axis=1)

183

184 if self.sam_classassignment:

185 # override cluster labels with labels computed via SAM (distances have be recomputed then)

186 print('Using SAM class assignment.')

187 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs)

188 im_sam_labels = SC.classify(self.im)

189 sam_labels = im_sam_labels.flatten()[self.goodSpecMask]

190 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask]

191

192 # update distances at those positions where SAM assigns different class labels

193 distsPos2update = labels != sam_labels

194 distsPos2update[idxs_specNotIncl] = True

195 distances[distsPos2update] = \

196 self.compute_euclidian_distance_for_labelled_spectra(

197 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_)

198

199 kmeans.labels_ = sam_labels

200

201 else:

202 distmatrix_specNotIncl = kmeans.transform(spectra_notIncl)

203 labels[idxs_specNotIncl] = np.argmin(distmatrix_specNotIncl, axis=1)

204 distances[idxs_specNotIncl] = np.min(distmatrix_specNotIncl, axis=1)

205

206 kmeans.labels_ = labels

207

208 else:

209 if self.v:

210 print('Fitting KMeans...')

211 kmeans = KMeans(**kwargs_kmeans)

212

213 distmatrix = kmeans.fit_transform(self.spectra)

214 kmeans_labels = kmeans.labels_

215 distances = np.min(distmatrix, axis=1)

216

217 if self.sam_classassignment:

218 # override cluster labels with labels computed via SAM (distances have be recomputed then)

219 print('Using SAM class assignment.')

220 SC = SAM_Classifier(kmeans.cluster_centers_, CPUs=self.CPUs)

221 im_sam_labels = SC.classify(self.im)

222 sam_labels = im_sam_labels.flatten()[self.goodSpecMask]

223 self._spectral_angles = SC.angles_deg.flatten()[self.goodSpecMask]

224

225 # update distances at those positions where SAM assigns different class labels

226 distsPos2update = kmeans_labels != sam_labels

227 distances[distsPos2update] = \

228 self.compute_euclidian_distance_for_labelled_spectra(

229 self.spectra[distsPos2update, :], sam_labels[distsPos2update], kmeans.cluster_centers_)

230 finally:

231 os.environ.clear()

232 os.environ.update(_old_environ)

233

234 self.clusters = kmeans

235 self._spectral_distances = distances

236

237 return self.clusters

238

239 def apply_clusters(self, image):

240 labels = self.clusters.predict(im2spectra(GeoArray(image)))

241 return labels

242

243 def compute_spectral_distances(self):

244 self._spectral_distances = np.min(self.clusters.transform(self.spectra), axis=1)

245 return self.spectral_distances

246

247 @staticmethod

248 def compute_euclidian_distance_2D(spectra, endmembers):

249 n_samples, n_features = endmembers.shape

250

251 if not spectra.shape[1] == endmembers.shape[1]:

252 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers '

253 'have %d.' % (spectra.shape[1], endmembers.shape[1]))

254

255 dists = np.zeros((spectra.shape[0], n_samples), np.float32)

256

257 # loop over all endmember spectra and compute spectral angle for each input spectrum

258 for n_sample in range(n_samples):

259 train_spectrum = endmembers[n_sample, :].astype(float)

260 diff = spectra - train_spectrum

261 dists[:, n_sample] = np.sqrt((diff ** 2).sum(axis=1))

262

263 return dists

264

265 @staticmethod

266 def compute_euclidian_distance_for_labelled_spectra(spectra, labels, endmembers):

267 if not spectra.shape[1] == endmembers.shape[1]:

268 raise RuntimeError('Matrix dimensions are not aligned. Input spectra have %d bands but endmembers '

269 'have %d.' % (spectra.shape[1], endmembers.shape[1]))

270

271 dists = np.zeros((spectra.shape[0]), np.float32)

272

273 # loop over all endmember spectra and compute spectral angle for each input spectrum

274 for lbl in np.unique(labels):

275 train_spectrum = endmembers[lbl, :].astype(float)

276 mask_curlbl = labels == lbl

277 spectra_curlbl = spectra[mask_curlbl, :]

278 diff_curlbl = spectra_curlbl - train_spectrum

279 dists[mask_curlbl] = np.sqrt((diff_curlbl ** 2).sum(axis=1))

280

281 return dists

282

283 def compute_spectral_angles(self):

284 spectral_angles = classify_image(self.im, self.clusters.cluster_centers_, list(range(self.n_clusters)),

285 'SAM', in_nodataVal=self.im.nodata, cmap_nodataVal=-9999,

286 tiledims=(400, 400), CPUs=self.CPUs, return_distance=True,

287 unclassified_pixVal=-1)[1]

288

289 self._spectral_angles = spectral_angles.flatten()[self.goodSpecMask]

290 return self.spectral_angles

291

292 @property

293 def labels(self):

294 """Get labels for all clustered spectra (excluding spectra that contain nodata values)."""

295 return self.clusters.labels_

296

297 @property

298 def labels_with_nodata(self):

299 """Get the labels for all pixels (including those containing nodata values)."""

300 if self._labels_with_nodata is None:

301 if self.n_spectra == (self.im.rows * self.im.cols):

302 self._labels_with_nodata = self.clusters.labels_

303 else:

304 labels = np.full_like(self.goodSpecMask, -9999, dtype=self.clusters.labels_.dtype)

305 labels[self.goodSpecMask] = self.clusters.labels_

306 self._labels_with_nodata = labels

307

308 return self._labels_with_nodata

309

310 @property

311 def spectral_distances(self):

312 """Get spectral distances for all pixels that don't contain nodata values."""

313 if self._spectral_distances is None:

314 self._spectral_distances = self.compute_spectral_distances()

315

316 return self._spectral_distances

317

318 @property

319 def spectral_distances_with_nodata(self):

320 if self._clusters is not None and self._spectral_distances_with_nodata is None:

321 if self.n_spectra == (self.im.rows * self.im.cols):

322 self._spectral_distances_with_nodata = self.spectral_distances

323 else:

324 dists = np.full_like(self.goodSpecMask, np.nan, dtype=float)

325 dists[self.goodSpecMask] = self.spectral_distances

326 self._spectral_distances_with_nodata = dists

327

328 return self._spectral_distances_with_nodata

329

330 @property

331 def spectral_angles(self):

332 """Get spectral angles in degrees for all pixels that don't contain nodata values."""

333 if self._spectral_angles is None:

334 self._spectral_angles = self.compute_spectral_angles()

335

336 return self._spectral_angles

337

338 @property

339 def spectral_angles_with_nodata(self):

340 if self._clusters is not None and self._spectral_angles_with_nodata is None:

341 if self.n_spectra == (self.im.rows * self.im.cols):

342 self._spectral_angles_with_nodata = self.spectral_angles

343 else:

344 angles = np.full_like(self.goodSpecMask, np.nan, dtype=float)

345 angles[self.goodSpecMask] = self.spectral_angles

346 self._spectral_angles_with_nodata = angles

347

348 return self._spectral_angles_with_nodata

349

350 def dump(self, path_out):

351 with open(path_out, 'wb') as outF:

352 dill.dump(dict(

353 clusters=self.clusters,

354 _goodSpecMask=self._goodSpecMask,

355 _spectral_distances=self._spectral_distances),

356 outF)

357

358 def save_clustermap(self, path_out, **kw_save):

359 GeoArray(self.clustermap, geotransform=self.im.gt, projection=self.im.prj, nodata=-9999)\

360 .save(path_out, **kw_save)

361

362 def plot_cluster_centers(self, figsize=(15, 5)):

363 # type: (tuple) -> None

364 """Show a plot of the cluster center signatures.

365

366 :param figsize: figure size (inches)

367 """

368 from matplotlib import pyplot as plt

369

370 plt.figure(figsize=figsize)

371 for i, center_signature in enumerate(self.clusters.cluster_centers_):

372 plt.plot(range(1, self.im.bands + 1), center_signature, label='Cluster #%s' % (i + 1))

373

374 plt.title('KMeans cluster centers for %s clusters' % self.n_clusters)

375 plt.xlabel('Spectral band')

376 plt.ylabel('Pixel values')

377 plt.legend(loc='upper right')

378

379 plt.show()

380

381 def plot_cluster_histogram(self, figsize=(15, 5)):

382 # type: (tuple) -> None

383 """Show a histogram indicating the proportion of each cluster label in percent.

384

385 :param figsize: figure size (inches)

386 """

387 from matplotlib import pyplot as plt

388

389 # grab the number of different clusters and create a histogram

390 # based on the number of pixels assigned to each cluster

391 numLabels = np.arange(0, len(np.unique(self.clusters.labels_)) + 1)

392 hist, bins = np.histogram(self.clusters.labels_, bins=numLabels)

393

394 # normalize the histogram, such that it sums to 100

395 hist = hist.astype("float")

396 hist /= hist.sum() / 100

397

398 # plot the histogram as bar plot

399 plt.figure(figsize=figsize)

400

401 plt.bar(bins[:-1], hist, width=1)

402

403 plt.title('Proportion of cluster labels (%s clusters)' % self.n_clusters)

404 plt.xlabel('# Cluster')

405 plt.ylabel('Proportion [%]')

406

407 plt.show()

408

409 def plot_clustermap(self, figsize=None):

410 # type: (tuple) -> None

411 """Show a the clustered image.

412

413 :param figsize: figure size (inches)

414 """

415 from matplotlib import pyplot as plt

416

417 plt.figure(figsize=figsize)

418 rows, cols = self.clustermap.shape[:2]

419

420 image2plot = self.clustermap if self.im.nodata is None else np.ma.masked_equal(self.clustermap, self.im.nodata)

421

422 plt.imshow(image2plot, plt.get_cmap('prism'), interpolation='none', extent=(0, cols, rows, 0))

423 plt.show()

424

425 def get_random_spectra_from_each_cluster(self, samplesize=50, max_distance=None, max_angle=None,

426 nmin_unique_spectra=50):

427 # type: (int, Union[int, float, str], Union[int, float, str], int) -> dict

428 """Return a given number of spectra randomly selected within each cluster.

429

430 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on.

431

432 :param samplesize: number of spectra to be randomly selected from each cluster

433 :param max_distance: spectra with a larger spectral distance than the given value will be excluded from

434 random sampling.

435 - if given as string like '20%', the maximum spectral distance is computed as 20%

436 percentile within each cluster

437 :param max_angle: spectra with a larger spectral angle than the given value will be excluded from

438 random sampling.

439 - if given as string like '20%', the maximum spectral angle is computed as 20%

440 percentile within each cluster

441 :param nmin_unique_spectra: in case a cluster has less than the given number, do not use its spectra

442 (return missing values)

443 :return:

444 """

445 # get DataFrame with columns [cluster_label, B1, B2, B3, ...]

446 # NOTE: spectra with nodata values are already excluded here in the self.spectra getter

447 # which uses self.goodSpecMask (excludes spectra with nodata in ANY band)

448 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], )

449 df.insert(0, 'cluster_label', self.clusters.labels_)

450

451 if max_angle is not None:

452 if not (isinstance(max_angle, (int, float)) and max_angle > 0) and \

453 not (isinstance(max_angle, str) and max_angle.endswith('%')):

454 raise ValueError(max_angle)

455 if isinstance(max_angle, str):

456 max_angle = np.percentile(self.spectral_angles, float(max_angle.split('%')[0].strip()))

457 df.insert(1, 'spectral_angle', self.spectral_angles)

458

459 if max_distance is not None:

460 if not (isinstance(max_distance, (int, float)) and 0 < max_distance < 100) and \

461 not (isinstance(max_distance, str) and max_distance.endswith('%')):

462 raise ValueError(max_distance)

463 if isinstance(max_distance, str):

464 max_distance = np.percentile(self.spectral_distances, float(max_distance.split('%')[0].strip()))

465 df.insert(1, 'spectral_distance', self.spectral_distances)

466

467 # get random sample from each cluster and generate a dict like {cluster_label: random_sample}

468 # NOTE: nodata label is skipped

469 random_samples = dict()

470 for label in range(self.n_clusters):

471 if max_angle is None and max_distance is None:

472 cluster_subset = df[df.cluster_label == label].loc[:, 'B1':]

473 else:

474 cluster_subset = df[df.cluster_label == label]

475

476 # if self.n_clusters > 1:

477 # filter by spectral angle

478 if len(cluster_subset.index) >= nmin_unique_spectra and max_angle is not None:

479 cluster_subset = cluster_subset[cluster_subset.spectral_angle < max_angle]

480

481 # filter by spectral distance

482 if len(cluster_subset.index) >= nmin_unique_spectra and max_distance is not None:

483 cluster_subset = cluster_subset[cluster_subset.spectral_distance < max_distance]

484

485 cluster_subset = cluster_subset.loc[:, 'B1':]

486

487 # handle clusters with less than nmin_unique_spectra in there

488 if len(cluster_subset.index) < nmin_unique_spectra:

489 if len(cluster_subset.index) > 0:

490 # don't use the cluster (return nodata)

491 cluster_subset[:] = -9999

492

493 else:

494 # cluster_subset is empty after filtering -> return nodata

495 cluster_subset.loc[0] = [-9999] * len(cluster_subset.columns)

496

497 # get random sample while filling it with duplicates of the same sample when cluster has not enough spectra

498 random_samples[label] = np.array(cluster_subset.sample(samplesize, replace=True, random_state=20))

499

500 return random_samples

501

502 def get_purest_spectra_from_each_cluster(self, samplesize=50):

503 # type: (int) -> dict

504 """Return a given number of spectra directly surrounding the center of each cluster.

505

506 E.g., 50 spectra belonging to cluster 1, 50 spectra belonging to cluster 2 and so on.

507

508 :param samplesize: number of spectra to be selected from each cluster

509 :return:

510 """

511 # get DataFrame with columns [cluster_label, B1, B2, B3, ...]

512 df = DataFrame(self.spectra, columns=['B%s' % band for band in range(1, self.im.bands + 1)], )

513 df.insert(0, 'cluster_label', self.clusters.labels_)

514 df.insert(1, 'spectral_distance', self.spectral_distances)

515

516 # get random sample from each cluster and generate a dict like {cluster_label: random_sample}

517 random_samples = dict()

518 for label in range(self.n_clusters):

519 cluster_subset = df[df.cluster_label == label].loc[:, 'spectral_distance':]

520

521 # catch the case that the cluster does not contain enough spectra (duplicate existing ones)

522 if len(cluster_subset.index) < samplesize:

523 cluster_subset = cluster_subset.sample(samplesize, replace=True, random_state=20)

524

525 cluster_subset_sorted = cluster_subset.sort_values(by=['spectral_distance'], ascending=True)

526 random_samples[label] = np.array(cluster_subset_sorted.loc[:, 'B1':][:samplesize])

527

528 return random_samples

Coverage for spechomo/clustering.py : 81%

277 statements

Coverage for spechomo/clustering.py : 81%

277 statements 224 run 53 missing 0 excluded

277 statements