SemiDiscreteOT 1.0
Semi-Discrete Optimal Transport Library
Loading...
Searching...
No Matches
dkm.hpp
Go to the documentation of this file.
1#pragma once
2
3// only included in case there's a C++11 compiler out there that doesn't support `#pragma once`
4#ifndef DKM_KMEANS_H
5#define DKM_KMEANS_H
6
7#include <algorithm>
8#include <array>
9#include <cassert>
10#include <cstddef>
11#include <cstdint>
12#include <random>
13#include <tuple>
14#include <type_traits>
15#include <vector>
16
17/*
18DKM - A k-means implementation that is generic across variable data dimensions.
19*/
20namespace dkm {
21
22/*
23These functions are all private implementation details and shouldn't be referenced outside of this
24file.
25*/
26namespace details {
27
28/*
29Calculate the square of the distance between two points.
30*/
31template <typename T, size_t N>
32T distance_squared(const std::array<T, N>& point_a, const std::array<T, N>& point_b) {
33 T d_squared = T();
34 for (typename std::array<T, N>::size_type i = 0; i < N; ++i) {
35 auto delta = point_a[i] - point_b[i];
36 d_squared += delta * delta;
37 }
38 return d_squared;
39}
40
41template <typename T, size_t N>
42T distance(const std::array<T, N>& point_a, const std::array<T, N>& point_b) {
43 return std::sqrt(distance_squared(point_a, point_b));
44}
45
46/*
47Calculate the smallest distance between each of the data points and any of the input means.
48*/
49template <typename T, size_t N>
50std::vector<T> closest_distance(
51 const std::vector<std::array<T, N>>& means, const std::vector<std::array<T, N>>& data) {
52 std::vector<T> distances;
53 distances.reserve(data.size());
54 for (auto& d : data) {
55 T closest = distance_squared(d, means[0]);
56 for (auto& m : means) {
57 T distance = distance_squared(d, m);
58 if (distance < closest)
59 closest = distance;
60 }
61 distances.push_back(closest);
62 }
63 return distances;
64}
65
66/*
67This is an alternate initialization method based on the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
68initialization algorithm.
69*/
70template <typename T, size_t N>
71std::vector<std::array<T, N>> random_plusplus(const std::vector<std::array<T, N>>& data, uint32_t k, uint64_t seed) {
72 assert(k > 0);
73 assert(data.size() > 0);
74 using input_size_t = typename std::array<T, N>::size_type;
75 std::vector<std::array<T, N>> means;
76 // Using a very simple PRBS generator, parameters selected according to
77 // https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use
78 std::linear_congruential_engine<uint64_t, 6364136223846793005, 1442695040888963407, UINT64_MAX> rand_engine(seed);
79
80 // Select first mean at random from the set
81 {
82 std::uniform_int_distribution<input_size_t> uniform_generator(0, data.size() - 1);
83 means.push_back(data[uniform_generator(rand_engine)]);
84 }
85
86 for (uint32_t count = 1; count < k; ++count) {
87 // Calculate the distance to the closest mean for each data point
88 auto distances = details::closest_distance(means, data);
89 // Pick a random point weighted by the distance from existing means
90 // TODO: This might convert floating point weights to ints, distorting the distribution for small weights
91#if !defined(_MSC_VER) || _MSC_VER >= 1900
92 std::discrete_distribution<input_size_t> generator(distances.begin(), distances.end());
93#else // MSVC++ older than 14.0
94 input_size_t i = 0;
95 std::discrete_distribution<input_size_t> generator(distances.size(), 0.0, 0.0, [&distances, &i](double) { return distances[i++]; });
96#endif
97 means.push_back(data[generator(rand_engine)]);
98 }
99 return means;
100}
101
102/*
103Calculate the index of the mean a particular data point is closest to (euclidean distance)
104*/
105template <typename T, size_t N>
106uint32_t closest_mean(const std::array<T, N>& point, const std::vector<std::array<T, N>>& means) {
107 assert(!means.empty());
108 T smallest_distance = distance_squared(point, means[0]);
109 typename std::array<T, N>::size_type index = 0;
110 T distance;
111 for (size_t i = 1; i < means.size(); ++i) {
112 distance = distance_squared(point, means[i]);
113 if (distance < smallest_distance) {
114 smallest_distance = distance;
115 index = i;
116 }
117 }
118 return index;
119}
120
121/*
122Calculate the index of the mean each data point is closest to (euclidean distance).
123*/
124template <typename T, size_t N>
125std::vector<uint32_t> calculate_clusters(
126 const std::vector<std::array<T, N>>& data, const std::vector<std::array<T, N>>& means) {
127 std::vector<uint32_t> clusters;
128 for (auto& point : data) {
129 clusters.push_back(closest_mean(point, means));
130 }
131 return clusters;
132}
133
134/*
135Calculate means based on data points and their cluster assignments.
136*/
137template <typename T, size_t N>
138std::vector<std::array<T, N>> calculate_means(const std::vector<std::array<T, N>>& data,
139 const std::vector<uint32_t>& clusters,
140 const std::vector<std::array<T, N>>& old_means,
141 uint32_t k) {
142 std::vector<std::array<T, N>> means(k);
143 std::vector<T> count(k, T());
144 for (size_t i = 0; i < std::min(clusters.size(), data.size()); ++i) {
145 auto& mean = means[clusters[i]];
146 count[clusters[i]] += 1;
147 for (size_t j = 0; j < std::min(data[i].size(), mean.size()); ++j) {
148 mean[j] += data[i][j];
149 }
150 }
151 for (size_t i = 0; i < k; ++i) {
152 if (count[i] == 0) {
153 means[i] = old_means[i];
154 } else {
155 for (size_t j = 0; j < means[i].size(); ++j) {
156 means[i][j] /= count[i];
157 }
158 }
159 }
160 return means;
161}
162
163template <typename T, size_t N>
164std::vector<T> deltas(
165 const std::vector<std::array<T, N>>& old_means, const std::vector<std::array<T, N>>& means)
166{
167 std::vector<T> distances;
168 distances.reserve(means.size());
169 assert(old_means.size() == means.size());
170 for (size_t i = 0; i < means.size(); ++i) {
171 distances.push_back(distance(means[i], old_means[i]));
172 }
173 return distances;
174}
175
176template <typename T>
177bool deltas_below_limit(const std::vector<T>& deltas, T min_delta) {
178 for (T d : deltas) {
179 if (d > min_delta) {
180 return false;
181 }
182 }
183 return true;
184}
185
186} // namespace details
187
188/*
189clustering_parameters is the configuration used for running the kmeans_lloyd algorithm.
190
191It requires a k value for initialization, and can subsequently be configured with your choice
192of optional parameters, including:
193* Maximum iteration count; the algorithm will terminate if it reaches this iteration count
194 before converging on a solution. The results returned are the means and cluster assignments
195 calculated in the last iteration before termination.
196* Minimum delta; the algorithm will terminate if the change in position of all means is
197 smaller than the specified distance.
198* Random seed; if present, this will be used in place of `std::random_device` for kmeans++
199 initialization. This can be used to ensure reproducible/deterministic behavior.
200*/
201template <typename T>
203public:
204 explicit clustering_parameters(uint32_t k) :
205 _k(k),
206 _has_max_iter(false), _max_iter(),
207 _has_min_delta(false), _min_delta(),
208 _has_rand_seed(false), _rand_seed()
209 {}
210
211 void set_max_iteration(uint64_t max_iter)
212 {
213 _max_iter = max_iter;
214 _has_max_iter = true;
215 }
216
217 void set_min_delta(T min_delta)
218 {
219 _min_delta = min_delta;
220 _has_min_delta = true;
221 }
222
223 void set_random_seed(uint64_t rand_seed)
224 {
225 _rand_seed = rand_seed;
226 _has_rand_seed = true;
227 }
228
229 bool has_max_iteration() const { return _has_max_iter; }
230 bool has_min_delta() const { return _has_min_delta; }
231 bool has_random_seed() const { return _has_rand_seed; }
232
233 uint32_t get_k() const { return _k; };
234 uint64_t get_max_iteration() const { return _max_iter; }
235 T get_min_delta() const { return _min_delta; }
236 uint64_t get_random_seed() const { return _rand_seed; }
237
238private:
239 uint32_t _k;
241 uint64_t _max_iter;
245 uint64_t _rand_seed;
246};
247
248/*
249Implementation of k-means generic across the data type and the dimension of each data item. Expects
250the data to be a vector of fixed-size arrays. Generic parameters are the type of the base data (T)
251and the dimensionality of each data point (N). All points must have the same dimensionality.
252
253e.g. points of the form (X, Y, Z) would be N = 3.
254
255Takes a `clustering_parameters` struct for algorithm configuration. See the comments for the
256`clustering_parameters` struct for more information about the configuration values and how they
257affect the algorithm.
258
259Returns a std::tuple containing:
260 0: A vector holding the means for each cluster from 0 to k-1.
261 1: A vector containing the cluster number (0 to k-1) for each corresponding element of the input
262 data vector.
263
264Implementation details:
265This implementation of k-means uses [Lloyd's Algorithm](https://en.wikipedia.org/wiki/Lloyd%27s_algorithm)
266with the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
267used for initializing the means.
268
269*/
270template <typename T, size_t N>
271std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd(
272 const std::vector<std::array<T, N>>& data, const clustering_parameters<T>& parameters) {
273 static_assert(std::is_arithmetic<T>::value && std::is_signed<T>::value,
274 "kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)");
275 assert(parameters.get_k() > 0); // k must be greater than zero
276 assert(data.size() >= parameters.get_k()); // there must be at least k data points
277 std::random_device rand_device;
278 uint64_t seed = parameters.has_random_seed() ? parameters.get_random_seed() : rand_device();
279 std::vector<std::array<T, N>> means = details::random_plusplus(data, parameters.get_k(), seed);
280
281 std::vector<std::array<T, N>> old_means;
282 std::vector<std::array<T, N>> old_old_means;
283 std::vector<uint32_t> clusters;
284 // Calculate new means until convergence is reached or we hit the maximum iteration count
285 uint64_t count = 0;
286 do {
287 clusters = details::calculate_clusters(data, means);
288 old_old_means = old_means;
289 old_means = means;
290 means = details::calculate_means(data, clusters, old_means, parameters.get_k());
291 ++count;
292 } while (means != old_means && means != old_old_means
293 && !(parameters.has_max_iteration() && count == parameters.get_max_iteration())
294 && !(parameters.has_min_delta() && details::deltas_below_limit(details::deltas(old_means, means), parameters.get_min_delta())));
295
296 return std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>(means, clusters);
297}
298
299/*
300This overload exists to support legacy code which uses this signature of the kmeans_lloyd function.
301Any code still using this signature should move to the version of this function that uses a
302`clustering_parameters` struct for configuration.
303*/
304template <typename T, size_t N>
305std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd(
306 const std::vector<std::array<T, N>>& data, uint32_t k,
307 uint64_t max_iter = 0, T min_delta = -1.0) {
308 clustering_parameters<T> parameters(k);
309 if (max_iter != 0) {
310 parameters.set_max_iteration(max_iter);
311 }
312 if (min_delta != 0) {
313 parameters.set_min_delta(min_delta);
314 }
315 return kmeans_lloyd(data, parameters);
316}
317
318} // namespace dkm
319
320#endif /* DKM_KMEANS_H */
bool has_random_seed() const
Definition dkm.hpp:231
uint64_t get_max_iteration() const
Definition dkm.hpp:234
clustering_parameters(uint32_t k)
Definition dkm.hpp:204
uint32_t get_k() const
Definition dkm.hpp:233
void set_random_seed(uint64_t rand_seed)
Definition dkm.hpp:223
bool has_min_delta() const
Definition dkm.hpp:230
bool has_max_iteration() const
Definition dkm.hpp:229
void set_min_delta(T min_delta)
Definition dkm.hpp:217
void set_max_iteration(uint64_t max_iter)
Definition dkm.hpp:211
uint64_t get_random_seed() const
Definition dkm.hpp:236
std::vector< uint32_t > calculate_clusters(const std::vector< std::array< T, N > > &data, const std::vector< std::array< T, N > > &means)
Definition dkm.hpp:125
std::vector< T > deltas(const std::vector< std::array< T, N > > &old_means, const std::vector< std::array< T, N > > &means)
Definition dkm.hpp:164
T distance_squared(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
Definition dkm.hpp:32
bool deltas_below_limit(const std::vector< T > &deltas, T min_delta)
Definition dkm.hpp:177
std::vector< std::array< T, N > > random_plusplus(const std::vector< std::array< T, N > > &data, uint32_t k, uint64_t seed)
Definition dkm.hpp:71
uint32_t closest_mean(const std::array< T, N > &point, const std::vector< std::array< T, N > > &means)
Definition dkm.hpp:106
T distance(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
Definition dkm.hpp:42
std::vector< T > closest_distance(const std::vector< std::array< T, N > > &means, const std::vector< std::array< T, N > > &data)
Definition dkm.hpp:50
std::vector< std::array< T, N > > calculate_means(const std::vector< std::array< T, N > > &data, const std::vector< uint32_t > &clusters, const std::vector< std::array< T, N > > &old_means, uint32_t k)
Definition dkm.hpp:138
Definition dkm.hpp:20
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > kmeans_lloyd(const std::vector< std::array< T, N > > &data, const clustering_parameters< T > &parameters)
Definition dkm.hpp:271