31template <
typename T,
size_t N>
34 for (
typename std::array<T, N>::size_type i = 0; i < N; ++i) {
35 auto delta = point_a[i] - point_b[i];
36 d_squared += delta * delta;
41template <
typename T,
size_t N>
42T
distance(
const std::array<T, N>& point_a,
const std::array<T, N>& point_b) {
49template <
typename T,
size_t N>
51 const std::vector<std::array<T, N>>& means,
const std::vector<std::array<T, N>>& data) {
52 std::vector<T> distances;
53 distances.reserve(data.size());
54 for (
auto& d : data) {
56 for (
auto& m : means) {
61 distances.push_back(closest);
70template <
typename T,
size_t N>
71std::vector<std::array<T, N>>
random_plusplus(
const std::vector<std::array<T, N>>& data, uint32_t k, uint64_t seed) {
73 assert(data.size() > 0);
74 using input_size_t =
typename std::array<T, N>::size_type;
75 std::vector<std::array<T, N>> means;
78 std::linear_congruential_engine<uint64_t, 6364136223846793005, 1442695040888963407, UINT64_MAX> rand_engine(seed);
82 std::uniform_int_distribution<input_size_t> uniform_generator(0, data.size() - 1);
83 means.push_back(data[uniform_generator(rand_engine)]);
86 for (uint32_t count = 1; count < k; ++count) {
91#if !defined(_MSC_VER) || _MSC_VER >= 1900
92 std::discrete_distribution<input_size_t> generator(distances.begin(), distances.end());
95 std::discrete_distribution<input_size_t> generator(distances.size(), 0.0, 0.0, [&distances, &i](
double) { return distances[i++]; });
97 means.push_back(data[generator(rand_engine)]);
105template <
typename T,
size_t N>
106uint32_t
closest_mean(
const std::array<T, N>& point,
const std::vector<std::array<T, N>>& means) {
107 assert(!means.empty());
109 typename std::array<T, N>::size_type index = 0;
111 for (
size_t i = 1; i < means.size(); ++i) {
124template <
typename T,
size_t N>
126 const std::vector<std::array<T, N>>& data,
const std::vector<std::array<T, N>>& means) {
127 std::vector<uint32_t> clusters;
128 for (
auto& point : data) {
137template <
typename T,
size_t N>
138std::vector<std::array<T, N>>
calculate_means(
const std::vector<std::array<T, N>>& data,
139 const std::vector<uint32_t>& clusters,
140 const std::vector<std::array<T, N>>& old_means,
142 std::vector<std::array<T, N>> means(k);
143 std::vector<T> count(k, T());
144 for (
size_t i = 0; i < std::min(clusters.size(), data.size()); ++i) {
145 auto& mean = means[clusters[i]];
146 count[clusters[i]] += 1;
147 for (
size_t j = 0; j < std::min(data[i].size(), mean.size()); ++j) {
148 mean[j] += data[i][j];
151 for (
size_t i = 0; i < k; ++i) {
153 means[i] = old_means[i];
155 for (
size_t j = 0; j < means[i].size(); ++j) {
156 means[i][j] /= count[i];
163template <
typename T,
size_t N>
165 const std::vector<std::array<T, N>>& old_means,
const std::vector<std::array<T, N>>& means)
167 std::vector<T> distances;
168 distances.reserve(means.size());
169 assert(old_means.size() == means.size());
170 for (
size_t i = 0; i < means.size(); ++i) {
171 distances.push_back(
distance(means[i], old_means[i]));
270template <
typename T,
size_t N>
271std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>
kmeans_lloyd(
273 static_assert(std::is_arithmetic<T>::value && std::is_signed<T>::value,
274 "kmeans_lloyd requires the template parameter T to be a signed arithmetic type (e.g. float, double, int)");
275 assert(parameters.
get_k() > 0);
276 assert(data.size() >= parameters.
get_k());
277 std::random_device rand_device;
281 std::vector<std::array<T, N>> old_means;
282 std::vector<std::array<T, N>> old_old_means;
283 std::vector<uint32_t> clusters;
288 old_old_means = old_means;
292 }
while (means != old_means && means != old_old_means
296 return std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>(means, clusters);
304template <
typename T,
size_t N>
305std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>
kmeans_lloyd(
306 const std::vector<std::array<T, N>>& data, uint32_t k,
307 uint64_t max_iter = 0, T min_delta = -1.0) {
312 if (min_delta != 0) {
bool has_random_seed() const
uint64_t get_max_iteration() const
clustering_parameters(uint32_t k)
void set_random_seed(uint64_t rand_seed)
bool has_min_delta() const
bool has_max_iteration() const
void set_min_delta(T min_delta)
void set_max_iteration(uint64_t max_iter)
uint64_t get_random_seed() const
std::vector< uint32_t > calculate_clusters(const std::vector< std::array< T, N > > &data, const std::vector< std::array< T, N > > &means)
std::vector< T > deltas(const std::vector< std::array< T, N > > &old_means, const std::vector< std::array< T, N > > &means)
T distance_squared(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
bool deltas_below_limit(const std::vector< T > &deltas, T min_delta)
std::vector< std::array< T, N > > random_plusplus(const std::vector< std::array< T, N > > &data, uint32_t k, uint64_t seed)
uint32_t closest_mean(const std::array< T, N > &point, const std::vector< std::array< T, N > > &means)
T distance(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
std::vector< T > closest_distance(const std::vector< std::array< T, N > > &means, const std::vector< std::array< T, N > > &data)
std::vector< std::array< T, N > > calculate_means(const std::vector< std::array< T, N > > &data, const std::vector< uint32_t > &clusters, const std::vector< std::array< T, N > > &old_means, uint32_t k)
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > kmeans_lloyd(const std::vector< std::array< T, N > > &data, const clustering_parameters< T > ¶meters)