Deep Learning Algorithm Implementations 1.0.0
C++ implementations of fundamental deep learning algorithms
Loading...
Searching...
No Matches
pca.cpp
Go to the documentation of this file.
1#include "ml/pca.hpp"
2#include <algorithm>
3#include <numeric>
4#include <stdexcept>
5#include <xtensor-blas/xlinalg.hpp>
6
7namespace ml {
8
9 template<typename T>
10 void PCA<T>::fit(const Matrix<T> &data, bool center, bool scale) {
11 if (data.rows() < 2 || data.cols() < 2) {
12 throw std::invalid_argument("Data matrix must have at least 2 rows and 2 columns");
13 }
14
15 // Get data dimensions
16 size_t n_samples = data.rows();
17 size_t n_features = data.cols();
18
19 // Center the data (subtract mean)
20 mean_.resize(n_features);
22
23 if (center) {
24 // Compute mean for each feature
25 for (size_t j = 0; j < n_features; ++j) {
26 T sum = 0;
27 for (size_t i = 0; i < n_samples; ++i) {
28 sum += data(i, j);
29 }
30 mean_[j] = sum / static_cast<T>(n_samples);
31
32 // Center the data
33 for (size_t i = 0; i < n_samples; ++i) {
34 centered_data(i, j) -= mean_[j];
35 }
36 }
37 } else {
38 // If not centering, set mean to zero
39 std::fill(mean_.begin(), mean_.end(), 0);
40 }
41
42 // Scale the data (divide by standard deviation)
43 scale_.resize(n_features, 1.0);
44
45 if (scale) {
46 // Compute standard deviation for each feature
47 for (size_t j = 0; j < n_features; ++j) {
48 T sum_sq = 0;
49 for (size_t i = 0; i < n_samples; ++i) {
50 T diff = centered_data(i, j);
51 sum_sq += diff * diff;
52 }
53 T variance = sum_sq / static_cast<T>(n_samples);
54 scale_[j] = std::sqrt(variance);
55
56 // Avoid division by zero
57 if (scale_[j] < 1e-10) {
58 scale_[j] = 1.0;
59 }
60
61 // Scale the data
62 for (size_t i = 0; i < n_samples; ++i) {
63 centered_data(i, j) /= scale_[j];
64 }
65 }
66 }
67
68 // Compute SVD using xtensor-blas
69 // X = U * S * V^T where V contains the principal components
70 auto X = centered_data.data(); // Get xtensor array from Matrix
71
72 // Compute covariance matrix (X^T * X) / (n_samples - 1)
73 auto cov = xt::linalg::dot(xt::transpose(X), X) / static_cast<T>(n_samples - 1);
74
75 // Compute eigendecomposition
76 auto eigen_result = xt::linalg::eigh(cov);
77 auto eigenvalues = std::get<0>(eigen_result);
78 auto eigenvectors = std::get<1>(eigen_result);
79
80 // Sort eigenvalues and eigenvectors in descending order
81 std::vector<size_t> indices(eigenvalues.size());
82 std::iota(indices.begin(), indices.end(), 0);
83 std::sort(indices.begin(), indices.end(),
84 [&eigenvalues](size_t i1, size_t i2) { return eigenvalues(i1) > eigenvalues(i2); });
85
86 // Reorder eigenvalues and eigenvectors
87 singular_values_.resize(n_features);
88 explained_variance_.resize(n_features);
89 explained_variance_ratio_.resize(n_features);
90
91 T total_variance = 0;
92 for (size_t i = 0; i < n_features; ++i) {
93 explained_variance_[i] = eigenvalues(indices[i]);
94 total_variance += explained_variance_[i];
95 }
96
97 // Create components matrix
98 components_ = Matrix<T>(n_features, n_features);
99
100 for (size_t i = 0; i < n_features; ++i) {
101 // Compute singular values (sqrt of eigenvalues)
102 singular_values_[i] = std::sqrt(explained_variance_[i]);
103
104 // Compute explained variance ratio
105 explained_variance_ratio_[i] = explained_variance_[i] / total_variance;
106
107 // Store eigenvectors as columns in components matrix
108 for (size_t j = 0; j < n_features; ++j) {
109 components_(j, i) = eigenvectors(j, indices[i]);
110 }
111 }
112
113 is_fitted_ = true;
114 }
115
116 template<typename T>
118 if (!is_fitted_) {
119 throw std::runtime_error("PCA model has not been fitted");
120 }
121
122 size_t n_samples = data.rows();
123 size_t n_features = data.cols();
124
125 if (n_features != components_.rows()) {
126 throw std::invalid_argument("Data has wrong number of features");
127 }
128
129 // If n_components is 0 or greater than available components, use all components
130 if (n_components == 0 || n_components > components_.cols()) {
131 n_components = components_.cols();
132 }
133
134 // Center and scale the data
136
137 // Apply centering and scaling
138 for (size_t i = 0; i < n_samples; ++i) {
139 for (size_t j = 0; j < n_features; ++j) {
140 processed_data(i, j) = (processed_data(i, j) - mean_[j]) / scale_[j];
141 }
142 }
143
144 // Project data onto principal components
145 // X_transformed = X * V[:, :n_components]
147 for (size_t i = 0; i < n_features; ++i) {
148 for (size_t j = 0; j < n_components; ++j) {
149 components_subset(i, j) = components_(i, j);
150 }
151 }
152
154 }
155
156 template<typename T>
158 fit(data, center, scale);
159 return transform(data, n_components);
160 }
161
162 template<typename T>
163 std::vector<T> PCA<T>::explained_variance_ratio() const {
164 if (!is_fitted_) {
165 throw std::runtime_error("PCA model has not been fitted");
166 }
167 return explained_variance_ratio_;
168 }
169
170 template<typename T>
172 if (!is_fitted_) {
173 throw std::runtime_error("PCA model has not been fitted");
174 }
175 return components_;
176 }
177
178 template<typename T>
179 std::vector<T> PCA<T>::singular_values() const {
180 if (!is_fitted_) {
181 throw std::runtime_error("PCA model has not been fitted");
182 }
183 return singular_values_;
184 }
185
186 // Explicit template instantiations
187 template class PCA<float>;
188 template class PCA<double>;
189
190} // namespace ml
void fit(const Matrix< T > &data)
Fit the K-Means model to the data.
Definition kmeans.cpp:20
void fit(const Matrix< T > &data, bool center=true, bool scale=false)
Fit the PCA model to the data.
Definition pca.cpp:10
std::vector< T > singular_values() const
Get the singular values (square roots of eigenvalues)
Definition pca.cpp:179
Matrix< T > components() const
Get the principal components (eigenvectors)
Definition pca.cpp:171
std::vector< T > explained_variance_ratio() const
Get the explained variance ratio for each component.
Definition pca.cpp:163
Matrix< T > transform(const Matrix< T > &data, size_t n_components=0) const
Transform data to the principal component space.
Definition pca.cpp:117
Matrix< T > fit_transform(const Matrix< T > &data, size_t n_components=0, bool center=true, bool scale=false)
Fit the model and transform the data in one step.
Definition pca.cpp:157
Namespace containing traditional machine learning algorithms.
Definition kmeans.hpp:15
T sum(const Matrix< T > &matrix)
Calculate sum of all matrix elements.
Definition matrix.cpp:166
Principal Component Analysis implementation.