18inline std::vector<std::string>
split_commas(
const std::string& line) {
19 std::vector<std::string> split;
21 std::copy(std::sregex_token_iterator(line.begin(), line.end(), reg, -1),
22 std::sregex_token_iterator(),
23 std::back_inserter(split));
38template <
typename T,
size_t N>
39std::vector<T>
dist_to_center(
const std::vector<std::array<T, N>>& points,
const std::array<T, N>& center) {
40 std::vector<T> result(points.size());
41 std::transform(points.begin(), points.end(), result.begin(), [&center](
const std::array<T, N>& p) {
42 return details::distance(p, center);
56template <
typename T,
size_t N>
57T
sum_dist(
const std::vector<std::array<T, N>>& points,
const std::array<T, N>& center) {
59 return std::accumulate(distances.begin(), distances.end(), T());
73template <
typename T,
size_t N>
75 const std::vector<std::array<T, N>>& points,
const std::vector<uint32_t>& labels,
const uint32_t label) {
76 assert(points.size() == labels.size() &&
"Points and labels have different sizes");
78 std::vector<std::array<T, N>> cluster;
79 for (
size_t point_index = 0; point_index < points.size(); ++point_index) {
80 if (labels[point_index] == label) {
81 cluster.push_back(points[point_index]);
98template <
typename T,
size_t N>
100 const std::tuple<std::vector<std::array<T, N>>, std::vector<uint32_t>>& means,
102 std::vector<std::array<T, N>> centroids;
103 std::vector<uint32_t> labels;
104 std::tie(centroids, labels) = means;
107 for (uint32_t i = 0; i < k; ++i) {
109 inertia +=
sum_dist(cluster, centroids[i]);
125template <
typename T,
size_t N>
127 const std::vector<std::array<T, N>>& points, uint32_t k, uint32_t n_init = 10) {
131 for (uint32_t i = 0; i < n_init - 1; ++i) {
134 if (curr_inertia < best_inertia) {
135 best_inertia = curr_inertia;
136 best_means = curr_means;
149template <
typename T,
size_t N>
150size_t predict(
const std::vector<std::array<T, N>>& centroids,
const std::array<T, N>& query) {
153 for(
size_t i = 1; i < centroids.size(); i++) {
168template <
typename T,
size_t N>
169std::vector<std::array<T, N>>
load_csv(
const std::string& path) {
170 std::ifstream file(path);
171 std::vector<std::array<T, N>> data;
172 for (
auto it = std::istream_iterator<std::string>(file); it != std::istream_iterator<std::string>(); ++it) {
174 assert(split.size() == N);
175 std::array<T, N> row;
176 std::transform(split.begin(), split.end(), row.begin(), [](
const std::string& in) -> T {
177 return static_cast<T>(std::stod(in));
std::vector< std::string > split_commas(const std::string &line)
T distance(const std::array< T, N > &point_a, const std::array< T, N > &point_b)
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > get_best_means(const std::vector< std::array< T, N > > &points, uint32_t k, uint32_t n_init=10)
std::vector< std::array< T, N > > get_cluster(const std::vector< std::array< T, N > > &points, const std::vector< uint32_t > &labels, const uint32_t label)
T sum_dist(const std::vector< std::array< T, N > > &points, const std::array< T, N > &center)
std::vector< T > dist_to_center(const std::vector< std::array< T, N > > &points, const std::array< T, N > &center)
T means_inertia(const std::vector< std::array< T, N > > &points, const std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > &means, uint32_t k)
std::vector< std::array< T, N > > load_csv(const std::string &path)
size_t predict(const std::vector< std::array< T, N > > &centroids, const std::array< T, N > &query)
std::tuple< std::vector< std::array< T, N > >, std::vector< uint32_t > > kmeans_lloyd(const std::vector< std::array< T, N > > &data, const clustering_parameters< T > &parameters)