SemiDiscreteOT 1.0
Semi-Discrete Optimal Transport Library
Loading...
Searching...
No Matches
dkm_parallel.hpp
Go to the documentation of this file.
1#pragma once
2
3// only included in case there's a C++11 compiler out there that doesn't support `#pragma once`
4#ifndef DKM_PARALLEL_KMEANS_H
5#define DKM_PARALLEL_KMEANS_H
6
7#include <algorithm>
8#include <array>
9#include <cassert>
10#include <cstddef>
11#include <cstdint>
12#include <random>
13#include <tuple>
14#include <type_traits>
15#include <vector>
16
17#include "dkm.hpp"
18
19/*
20DKM - A k-means implementation that is generic across variable data dimensions.
21*/
22namespace dkm {
23
24/*
25These functions are all private implementation details and shouldn't be referenced outside of this
26file.
27*/
28namespace details {
29
30/*
31Calculate the smallest distance between each of the data points and any of the input means.
32*/
33template <typename T, size_t N>
35 const std::vector<std::array<T, N>>& means, const std::vector<std::array<T, N>>& data) {
36 std::vector<T> distances(data.size(), T());
37 #pragma omp parallel for
38 for (size_t i = 0; i < data.size(); ++i) {
39 T closest = distance_squared(data[i], means[0]);
40 for (const auto& m : means) {
41 T distance = distance_squared(data[i], m);
42 if (distance < closest)
43 closest = distance;
44 }
45 distances[i] = closest;
46 }
47 return distances;
48}
49
50/*
51This is an alternate initialization method based on the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
52initialization algorithm.
53*/
54template <typename T, size_t N>
55std::vector<std::array<T, N>> random_plusplus_parallel(const std::vector<std::array<T, N>>& data, uint32_t k, uint64_t seed) {
56 assert(k > 0);
57 assert(data.size() > 0);
58 using input_size_t = typename std::array<T, N>::size_type;
59 std::vector<std::array<T, N>> means;
60 // Using a very simple PRBS generator, parameters selected according to
61 // https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use
62 std::linear_congruential_engine<uint64_t, 6364136223846793005, 1442695040888963407, UINT64_MAX> rand_engine(seed);
63
64 // Select first mean at random from the set
65 {
66 std::uniform_int_distribution<input_size_t> uniform_generator(0, data.size() - 1);
67 means.push_back(data[uniform_generator(rand_engine)]);
68 }
69
70 for (uint32_t count = 1; count < k; ++count) {
71 // Calculate the distance to the closest mean for each data point
72 auto distances = details::closest_distance_parallel(means, data);
73 // Pick a random point weighted by the distance from existing means
74 // TODO: This might convert floating point weights to ints, distorting the distribution for small weights
75#if !defined(_MSC_VER) || _MSC_VER >= 1900
76 std::discrete_distribution<input_size_t> generator(distances.begin(), distances.end());
77#else // MSVC++ older than 14.0
78 input_size_t i = 0;
79 std::discrete_distribution<input_size_t> generator(distances.size(), 0.0, 0.0, [&distances, &i](double) { return distances[i++]; });
80#endif
81 means.push_back(data[generator(rand_engine)]);
82 }
83 return means;
84}
85
86/*
87Calculate the index of the mean each data point is closest to (euclidean distance).
88*/
89template <typename T, size_t N>
90std::vector<uint32_t> calculate_clusters_parallel(
91 const std::vector<std::array<T, N>>& data, const std::vector<std::array<T, N>>& means) {
92 std::vector<uint32_t> clusters(data.size(), 0);
93 #pragma omp parallel for
94 for (size_t i = 0; i < data.size(); ++i) {
95 clusters[i] = closest_mean(data[i], means);
96 }
97 return clusters;
98}
99
100} // namespace details
101
102
103/*
104Implementation of k-means generic across the data type and the dimension of each data item. Expects
105the data to be a vector of fixed-size arrays. Generic parameters are the type of the base data (T)
106and the dimensionality of each data point (N). All points must have the same dimensionality.
107
108e.g. points of the form (X, Y, Z) would be N = 3.
109
110Returns a std::tuple containing:
111 0: A vector holding the means for each cluster from 0 to k-1.
112 1: A vector containing the cluster number (0 to k-1) for each corresponding element of the input
113 data vector.
114
115Implementation details:
116This implementation of k-means uses [Lloyd's Algorithm](https://en.wikipedia.org/wiki/Lloyd%27s_algorithm)
117with the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B)
118used for initializing the means.
119*/
120template <typename T, size_t N>
121std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd_parallel(
122 const std::vector<std::array<T, N>>& data, const clustering_parameters<T>& parameters) {
123 static_assert(std::is_arithmetic<T>::value && std::is_signed<T>::value,
124 "kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)");
125 assert(parameters.get_k() > 0); // k must be greater than zero
126 assert(data.size() >= parameters.get_k()); // there must be at least k data points
127 std::random_device rand_device;
128 uint64_t seed = parameters.has_random_seed() ? parameters.get_random_seed() : rand_device();
129 std::vector<std::array<T, N>> means = details::random_plusplus_parallel(data, parameters.get_k(), seed);
130
131 std::vector<std::array<T, N>> old_means;
132 std::vector<std::array<T, N>> old_old_means;
133 std::vector<uint32_t> clusters;
134 // Calculate new means until convergence is reached or we hit the maximum iteration count
135 uint64_t count = 0;
136 do {
137 clusters = details::calculate_clusters_parallel(data, means);
138 old_old_means = old_means;
139 old_means = means;
140 means = details::calculate_means(data, clusters, old_means, parameters.get_k());
141 ++count;
142 } while ((means != old_means && means != old_old_means)
143 && !(parameters.has_max_iteration() && count == parameters.get_max_iteration())
144 && !(parameters.has_min_delta() && details::deltas_below_limit(details::deltas(old_means, means), parameters.get_min_delta())));
145
146 return std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>(means, clusters);
147}
148
149/*
150This overload exists to support legacy code which uses this signature of the kmeans_lloyd function.
151Any code still using this signature should move to the version of this function that uses a
152`clustering_parameters` struct for configuration.
153*/
154template <typename T, size_t N>
155std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>> kmeans_lloyd_parallel(
156 const std::vector<std::array<T, N>>& data, uint32_t k,
157 uint64_t max_iter = 0, T min_delta = -1.0) {
158 clustering_parameters<T> parameters(k);
159 if (max_iter != 0) {
160 parameters.set_max_iteration(max_iter);
161 }
162 if (min_delta != 0) {
163 parameters.set_min_delta(min_delta);
164 }
165 return kmeans_lloyd_parallel(data, parameters);
166}
167
168} // namespace dkm
169
170#endif /* DKM_PARALLEL_KMEANS_H */
bool has_random_seed() const
Definition dkm.hpp:231
uint64_t get_max_iteration() const
Definition dkm.hpp:234
uint32_t get_k() const
Definition dkm.hpp:233
bool has_min_delta() const
Definition dkm.hpp:230
bool has_max_iteration() const
Definition dkm.hpp:229
void set_min_delta(T min_delta)
Definition dkm.hpp:217
void set_max_iteration(uint64_t max_iter)
Definition dkm.hpp:211
uint64_t get_random_seed() const
Definition dkm.hpp:236
std::vector< uint32_t > calculate_clusters_parallel(const std::vector< std::array< T, N > > &data, const std::vector< std::array< T, N > > &means)
std::vector< T > deltas(const std::vector< std::array< T, N > > &old_means, const std::vector< std::array< T, N > > &means)
Definition dkm.hpp:164
T distance_squared(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
Definition dkm.hpp:32
bool deltas_below_limit(const std::vector< T > &deltas, T min_delta)
Definition dkm.hpp:177
std::vector< std::array< T, N > > random_plusplus_parallel(const std::vector< std::array< T, N > > &data, uint32_t k, uint64_t seed)
uint32_t closest_mean(const std::array< T, N > &point, const std::vector< std::array< T, N > > &means)
Definition dkm.hpp:106
T distance(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
Definition dkm.hpp:42
std::vector< std::array< T, N > > calculate_means(const std::vector< std::array< T, N > > &data, const std::vector< uint32_t > &clusters, const std::vector< std::array< T, N > > &old_means, uint32_t k)
Definition dkm.hpp:138
std::vector< T > closest_distance_parallel(const std::vector< std::array< T, N > > &means, const std::vector< std::array< T, N > > &data)
Definition dkm.hpp:20
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > kmeans_lloyd_parallel(const std::vector< std::array< T, N > > &data, const clustering_parameters< T > &parameters)