Deep Learning Algorithm Implementations 1.0.0
C++ implementations of fundamental deep learning algorithms
Loading...
Searching...
No Matches
kmeans.cpp
Go to the documentation of this file.
1#include "ml/kmeans.hpp"
2#include <algorithm>
3#include <cmath>
4#include <limits>
5#include <random>
6#include <stdexcept>
7
8namespace ml {
9
10 template<typename T>
11 KMeans<T>::KMeans(size_t k, size_t max_iters, T tol, int random_state) :
12 k_(k), max_iters_(max_iters), tol_(tol), random_state_(random_state), inertia_(0), n_iter_(0),
13 is_fitted_(false) {
14 if (k == 0) {
15 throw std::invalid_argument("Number of clusters must be greater than 0");
16 }
17 }
18
19 template<typename T>
20 void KMeans<T>::fit(const Matrix<T> &data) {
21 if (data.rows() < k_) {
22 throw std::invalid_argument("Number of samples must be at least k");
23 }
24
25 size_t n_samples = data.rows();
26 size_t n_features = data.cols();
27
28 // Initialize centroids
29 init_centroids(data);
30
31 std::vector<int> labels(n_samples);
32 std::vector<int> prev_labels(n_samples, -1);
33
34 for (n_iter_ = 0; n_iter_ < max_iters_; ++n_iter_) {
35 // Assign points to clusters
36 labels = assign_clusters(data);
37
38 // Check for convergence
39 bool converged = true;
40 for (size_t i = 0; i < n_samples; ++i) {
41 if (labels[i] != prev_labels[i]) {
42 converged = false;
43 break;
44 }
45 }
46
47 if (converged) {
48 break;
49 }
50
51 // Update centroids
52 update_centroids(data, labels);
54 }
55
56 // Calculate final inertia
57 inertia_ = 0;
58 for (size_t i = 0; i < n_samples; ++i) {
59 std::vector<T> point(n_features);
60 for (size_t j = 0; j < n_features; ++j) {
61 point[j] = data(i, j);
62 }
63 std::vector<T> centroid(n_features);
64 for (size_t j = 0; j < n_features; ++j) {
65 centroid[j] = centroids_(labels[i], j);
66 }
67 inertia_ += squared_distance(point, centroid);
68 }
69
70 is_fitted_ = true;
71 }
72
73 template<typename T>
74 std::vector<int> KMeans<T>::predict(const Matrix<T> &data) const {
75 if (!is_fitted_) {
76 throw std::runtime_error("Model must be fitted before prediction");
77 }
78 return assign_clusters(data);
79 }
80
81 template<typename T>
82 std::vector<int> KMeans<T>::fit_predict(const Matrix<T> &data) {
83 fit(data);
84 return predict(data);
85 }
86
87 template<typename T>
89 if (!is_fitted_) {
90 throw std::runtime_error("Model must be fitted before accessing cluster centers");
91 }
92 return centroids_;
93 }
94
95 template<typename T>
97 if (!is_fitted_) {
98 throw std::runtime_error("Model must be fitted before accessing inertia");
99 }
100 return inertia_;
101 }
102
103 template<typename T>
104 size_t KMeans<T>::n_iter() const {
105 return n_iter_;
106 }
107
108 template<typename T>
109 void KMeans<T>::init_centroids(const Matrix<T> &data) {
110 size_t n_samples = data.rows();
111 size_t n_features = data.cols();
112
113 centroids_ = Matrix<T>(k_, n_features);
114
115 // Use K-means++ initialization for better results
116 std::mt19937 rng(random_state_ >= 0 ? random_state_ : std::random_device{}());
117 std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
118
119 // Choose first centroid randomly
120 size_t first_idx = dist(rng);
121 for (size_t j = 0; j < n_features; ++j) {
122 centroids_(0, j) = data(first_idx, j);
123 }
124
125 // Choose remaining centroids using K-means++
126 for (size_t c = 1; c < k_; ++c) {
127 std::vector<T> distances(n_samples);
128 T total_distance = 0;
129
130 // Calculate distances to nearest centroid
131 for (size_t i = 0; i < n_samples; ++i) {
132 T min_dist = std::numeric_limits<T>::max();
133 for (size_t prev_c = 0; prev_c < c; ++prev_c) {
134 std::vector<T> point(n_features), centroid(n_features);
135 for (size_t j = 0; j < n_features; ++j) {
136 point[j] = data(i, j);
137 centroid[j] = centroids_(prev_c, j);
138 }
139 T dist = squared_distance(point, centroid);
140 min_dist = std::min(min_dist, dist);
141 }
142 distances[i] = min_dist;
143 total_distance += min_dist;
144 }
145
146 // Choose next centroid with probability proportional to squared distance
147 std::uniform_real_distribution<T> real_dist(0, total_distance);
148 T target = real_dist(rng);
149 T cumsum = 0;
150 for (size_t i = 0; i < n_samples; ++i) {
151 cumsum += distances[i];
152 if (cumsum >= target) {
153 for (size_t j = 0; j < n_features; ++j) {
154 centroids_(c, j) = data(i, j);
155 }
156 break;
157 }
158 }
159 }
160 }
161
162 template<typename T>
163 std::vector<int> KMeans<T>::assign_clusters(const Matrix<T> &data) const {
164 size_t n_samples = data.rows();
165 size_t n_features = data.cols();
166 std::vector<int> labels(n_samples);
167
168 for (size_t i = 0; i < n_samples; ++i) {
169 T min_distance = std::numeric_limits<T>::max();
170 int best_cluster = 0;
171
172 for (size_t c = 0; c < k_; ++c) {
173 std::vector<T> point(n_features), centroid(n_features);
174 for (size_t j = 0; j < n_features; ++j) {
175 point[j] = data(i, j);
176 centroid[j] = centroids_(c, j);
177 }
178 T distance = squared_distance(point, centroid);
179 if (distance < min_distance) {
180 min_distance = distance;
181 best_cluster = c;
182 }
183 }
184 labels[i] = best_cluster;
185 }
186
187 return labels;
188 }
189
190 template<typename T>
191 void KMeans<T>::update_centroids(const Matrix<T> &data, const std::vector<int> &labels) {
192 size_t n_samples = data.rows();
193 size_t n_features = data.cols();
194
195 // Reset centroids
196 centroids_ = Matrix<T>::zeros(k_, n_features);
197 std::vector<size_t> counts(k_, 0);
198
199 // Sum points for each cluster
200 for (size_t i = 0; i < n_samples; ++i) {
201 int cluster = labels[i];
202 counts[cluster]++;
203 for (size_t j = 0; j < n_features; ++j) {
204 centroids_(cluster, j) += data(i, j);
205 }
206 }
207
208 // Average to get new centroids
209 for (size_t c = 0; c < k_; ++c) {
210 if (counts[c] > 0) {
211 for (size_t j = 0; j < n_features; ++j) {
212 centroids_(c, j) /= counts[c];
213 }
214 }
215 }
216 }
217
218 template<typename T>
219 T KMeans<T>::squared_distance(const std::vector<T> &point1, const std::vector<T> &point2) const {
220 T distance = 0;
221 for (size_t i = 0; i < point1.size(); ++i) {
222 T diff = point1[i] - point2[i];
223 distance += diff * diff;
224 }
225 return distance;
226 }
227
228 // Explicit template instantiation
229 template class KMeans<float>;
230 template class KMeans<double>;
231
232} // namespace ml
T inertia() const
Get the within-cluster sum of squares (inertia)
Definition kmeans.cpp:96
KMeans(size_t k, size_t max_iters=300, T tol=1e-4, int random_state=-1)
Constructor with number of clusters.
Definition kmeans.cpp:11
std::vector< int > fit_predict(const Matrix< T > &data)
Fit the model and predict cluster labels.
Definition kmeans.cpp:82
std::vector< int > predict(const Matrix< T > &data) const
Predict cluster labels for the given data.
Definition kmeans.cpp:74
Matrix< T > cluster_centers() const
Get the cluster centroids.
Definition kmeans.cpp:88
size_t n_iter() const
Get the number of iterations performed.
Definition kmeans.cpp:104
void fit(const Matrix< T > &data)
Fit the K-Means model to the data.
Definition kmeans.cpp:20
static Matrix zeros(size_t rows, size_t cols)
Create a matrix filled with zeros.
Definition matrix.cpp:121
K-Means clustering algorithm implementation.
Namespace containing traditional machine learning algorithms.
Definition kmeans.hpp:15