15 throw std::invalid_argument(
"Number of clusters must be greater than 0");
21 if (data.rows() < k_) {
22 throw std::invalid_argument(
"Number of samples must be at least k");
34 for (n_iter_ = 0; n_iter_ < max_iters_; ++n_iter_) {
36 labels = assign_clusters(data);
52 update_centroids(data,
labels);
76 throw std::runtime_error(
"Model must be fitted before prediction");
78 return assign_clusters(data);
90 throw std::runtime_error(
"Model must be fitted before accessing cluster centers");
98 throw std::runtime_error(
"Model must be fitted before accessing inertia");
116 std::mt19937
rng(random_state_ >= 0 ? random_state_ : std::
random_device{}());
117 std::uniform_int_distribution<size_t>
dist(0,
n_samples - 1);
126 for (
size_t c = 1; c < k_; ++c) {
127 std::vector<T> distances(n_samples);
128 T total_distance = 0;
131 for (
size_t i = 0; i < n_samples; ++i) {
132 T min_dist = std::numeric_limits<T>::max();
133 for (
size_t prev_c = 0; prev_c < c; ++prev_c) {
134 std::vector<T> point(n_features), centroid(n_features);
135 for (
size_t j = 0; j < n_features; ++j) {
136 point[j] = data(i, j);
137 centroid[j] = centroids_(prev_c, j);
139 T dist = squared_distance(point, centroid);
140 min_dist = std::min(min_dist, dist);
142 distances[i] = min_dist;
143 total_distance += min_dist;
147 std::uniform_real_distribution<T> real_dist(0, total_distance);
148 T target = real_dist(rng);
150 for (
size_t i = 0; i < n_samples; ++i) {
151 cumsum += distances[i];
152 if (cumsum >= target) {
153 for (
size_t j = 0; j < n_features; ++j) {
154 centroids_(c, j) = data(i, j);
163 std::vector<int> KMeans<T>::assign_clusters(
const Matrix<T> &data)
const {
164 size_t n_samples = data.rows();
165 size_t n_features = data.cols();
166 std::vector<int> labels(n_samples);
168 for (
size_t i = 0; i < n_samples; ++i) {
169 T min_distance = std::numeric_limits<T>::max();
170 int best_cluster = 0;
172 for (
size_t c = 0; c < k_; ++c) {
173 std::vector<T> point(n_features), centroid(n_features);
174 for (
size_t j = 0; j < n_features; ++j) {
175 point[j] = data(i, j);
176 centroid[j] = centroids_(c, j);
178 T distance = squared_distance(point, centroid);
179 if (distance < min_distance) {
180 min_distance = distance;
184 labels[i] = best_cluster;
191 void KMeans<T>::update_centroids(
const Matrix<T> &data,
const std::vector<int> &labels) {
192 size_t n_samples = data.rows();
193 size_t n_features = data.cols();
197 std::vector<size_t> counts(k_, 0);
200 for (
size_t i = 0; i < n_samples; ++i) {
201 int cluster = labels[i];
203 for (
size_t j = 0; j < n_features; ++j) {
204 centroids_(cluster, j) += data(i, j);
209 for (
size_t c = 0; c < k_; ++c) {
211 for (
size_t j = 0; j < n_features; ++j) {
212 centroids_(c, j) /= counts[c];
219 T KMeans<T>::squared_distance(
const std::vector<T> &point1,
const std::vector<T> &point2)
const {
221 for (
size_t i = 0; i < point1.size(); ++i) {
222 T diff = point1[i] - point2[i];
223 distance += diff * diff;
229 template class KMeans<float>;
230 template class KMeans<double>;
T inertia() const
Get the within-cluster sum of squares (inertia)
KMeans(size_t k, size_t max_iters=300, T tol=1e-4, int random_state=-1)
Constructor with number of clusters.
std::vector< int > fit_predict(const Matrix< T > &data)
Fit the model and predict cluster labels.
std::vector< int > predict(const Matrix< T > &data) const
Predict cluster labels for the given data.
Matrix< T > cluster_centers() const
Get the cluster centroids.
size_t n_iter() const
Get the number of iterations performed.
void fit(const Matrix< T > &data)
Fit the K-Means model to the data.
static Matrix zeros(size_t rows, size_t cols)
Create a matrix filled with zeros.
K-Means clustering algorithm implementation.
Namespace containing traditional machine learning algorithms.