4#ifndef DKM_PARALLEL_KMEANS_H
5#define DKM_PARALLEL_KMEANS_H
33template <
typename T,
size_t N>
35 const std::vector<std::array<T, N>>& means,
const std::vector<std::array<T, N>>& data) {
36 std::vector<T> distances(data.size(), T());
37 #pragma omp parallel for
38 for (
size_t i = 0; i < data.size(); ++i) {
40 for (
const auto& m : means) {
45 distances[i] = closest;
54template <
typename T,
size_t N>
57 assert(data.size() > 0);
58 using input_size_t =
typename std::array<T, N>::size_type;
59 std::vector<std::array<T, N>> means;
62 std::linear_congruential_engine<uint64_t, 6364136223846793005, 1442695040888963407, UINT64_MAX> rand_engine(seed);
66 std::uniform_int_distribution<input_size_t> uniform_generator(0, data.size() - 1);
67 means.push_back(data[uniform_generator(rand_engine)]);
70 for (uint32_t count = 1; count < k; ++count) {
75#if !defined(_MSC_VER) || _MSC_VER >= 1900
76 std::discrete_distribution<input_size_t> generator(distances.begin(), distances.end());
79 std::discrete_distribution<input_size_t> generator(distances.size(), 0.0, 0.0, [&distances, &i](
double) { return distances[i++]; });
81 means.push_back(data[generator(rand_engine)]);
89template <
typename T,
size_t N>
91 const std::vector<std::array<T, N>>& data,
const std::vector<std::array<T, N>>& means) {
92 std::vector<uint32_t> clusters(data.size(), 0);
93 #pragma omp parallel for
94 for (
size_t i = 0; i < data.size(); ++i) {
120template <
typename T,
size_t N>
123 static_assert(std::is_arithmetic<T>::value && std::is_signed<T>::value,
124 "kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)");
125 assert(parameters.
get_k() > 0);
126 assert(data.size() >= parameters.
get_k());
127 std::random_device rand_device;
131 std::vector<std::array<T, N>> old_means;
132 std::vector<std::array<T, N>> old_old_means;
133 std::vector<uint32_t> clusters;
138 old_old_means = old_means;
142 }
while ((means != old_means && means != old_old_means)
146 return std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>(means, clusters);
154template <
typename T,
size_t N>
156 const std::vector<std::array<T, N>>& data, uint32_t k,
157 uint64_t max_iter = 0, T min_delta = -1.0) {
162 if (min_delta != 0) {
bool has_random_seed() const
uint64_t get_max_iteration() const
bool has_min_delta() const
bool has_max_iteration() const
void set_min_delta(T min_delta)
void set_max_iteration(uint64_t max_iter)
uint64_t get_random_seed() const
std::vector< uint32_t > calculate_clusters_parallel(const std::vector< std::array< T, N > > &data, const std::vector< std::array< T, N > > &means)
std::vector< T > deltas(const std::vector< std::array< T, N > > &old_means, const std::vector< std::array< T, N > > &means)
T distance_squared(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
bool deltas_below_limit(const std::vector< T > &deltas, T min_delta)
std::vector< std::array< T, N > > random_plusplus_parallel(const std::vector< std::array< T, N > > &data, uint32_t k, uint64_t seed)
uint32_t closest_mean(const std::array< T, N > &point, const std::vector< std::array< T, N > > &means)
T distance(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
std::vector< std::array< T, N > > calculate_means(const std::vector< std::array< T, N > > &data, const std::vector< uint32_t > &clusters, const std::vector< std::array< T, N > > &old_means, uint32_t k)
std::vector< T > closest_distance_parallel(const std::vector< std::array< T, N > > &means, const std::vector< std::array< T, N > > &data)
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > kmeans_lloyd_parallel(const std::vector< std::array< T, N > > &data, const clustering_parameters< T > ¶meters)