Deep Learning Algorithm Implementations 1.0.0
C++ implementations of fundamental deep learning algorithms
svm.cpp
1#include "ml/svm.hpp"
2#include <algorithm>
3#include <cmath>
4#include <random>
5#include <set>
6#include <stdexcept>
7#include <iostream>
8
9namespace ml {
10
11 template<typename T>
12 SVM<T>::SVM(KernelType kernel_type, T C, T gamma, int degree, T coef0,
13 T tol, size_t max_iter, T learning_rate) :
14 kernel_type_(kernel_type), C_(C), gamma_(gamma), degree_(degree), coef0_(coef0),
15 tol_(tol), max_iter_(max_iter), learning_rate_(learning_rate), is_fitted_(false),
16 weights_(Tensor<T>(1, 1, 0.0), true), // Initialize with dummy data, will be resized in fit
17 bias_(Tensor<T>(1, 1, 0.0), true) { // Initialize with dummy data
18
19 if (C <= 0) {
20 throw std::invalid_argument("C must be positive");
21 }
22 if (gamma <= 0) {
23 throw std::invalid_argument("gamma must be positive");
24 }
25 if (learning_rate <= 0) {
26 throw std::invalid_argument("learning_rate must be positive");
27 }
28 }
29
30 template<typename T>
31 void SVM<T>::fit(const Tensor<T> &X, const std::vector<int> &y) {
32 if (X.rows() != y.size()) {
33 throw std::invalid_argument("Number of samples in X and y must match");
34 }
35
36 // Get unique classes
37 std::set<int> unique_classes(y.begin(), y.end());
38 classes_.assign(unique_classes.begin(), unique_classes.end());
39
40 if (classes_.size() != 2) {
41 throw std::invalid_argument("Currently only binary classification is supported");
42 }
43
44 // Convert labels to -1, +1
45 std::vector<int> binary_labels(y.size());
46 for (size_t i = 0; i < y.size(); ++i) {
47 binary_labels[i] = (y[i] == classes_[0]) ? -1 : 1;
48 }
49
50 // Initialize parameters with autograd
51 // For a linear SVM we optimize the weight vector directly; for kernel SVMs we use dual variables
52 if (kernel_type_ == KernelType::LINEAR) {
53 // Initialize weights randomly
54 Tensor<T> w_init = Tensor<T>::random(X.cols(), 1, -0.1, 0.1);
55 weights_ = Variable<T>(w_init, true); // requires_grad = true
56
57 Tensor<T> b_init(1, 1, 0.0);
58 bias_ = Variable<T>(b_init, true);
59 } else {
60 // For kernel methods, initialize dual variables
61 Tensor<T> alpha_init = Tensor<T>::random(X.rows(), 1, 0.0, 0.1);
62 weights_ = Variable<T>(alpha_init, true);
63
64 Tensor<T> b_init(1, 1, 0.0);
65 bias_ = Variable<T>(b_init, true);
66 }
67
68 loss_history_.clear();
69
70 // Training loop with gradient descent
71 std::cout << "Training SVM with automatic differentiation..." << std::endl;
72
73 for (size_t iter = 0; iter < max_iter_; ++iter) {
74 T loss = gradient_step(X, binary_labels);
75 loss_history_.push_back(loss);
76
77 if (iter % 100 == 0) {
78 std::cout << "Iteration " << iter << ", Loss: " << loss << std::endl;
79 }
80
81 // Check convergence
82 if (loss_history_.size() > 1) {
83 T loss_diff = std::abs(loss_history_[loss_history_.size()-1] -
84 loss_history_[loss_history_.size()-2]);
85 if (loss_diff < tol_) {
86 std::cout << "Converged at iteration " << iter << std::endl;
87 break;
88 }
89 }
90 }
91
92 is_fitted_ = true;
93 std::cout << "Training completed." << std::endl;
94 }
95
96 template<typename T>
97 T SVM<T>::gradient_step(const Tensor<T> &X, const std::vector<int> &y) {
98 // Zero gradients
99 weights_.zero_grad();
100 bias_.zero_grad();
101
102 // Convert input to Variable
103 Variable<T> X_var = to_variable(X, false);
104
105 // Compute loss
106 Variable<T> loss = compute_loss(X_var, y);
107
108 // Backward pass - compute gradients
109 loss.backward();
110
111 // Update parameters using gradient descent
112 // weights_ = weights_ - learning_rate * weights_.grad()
113 Tensor<T> weight_grad_scaled(weights_.grad().rows(), weights_.grad().cols());
114 for (size_t i = 0; i < weights_.grad().rows(); ++i) {
115 for (size_t j = 0; j < weights_.grad().cols(); ++j) {
116 weight_grad_scaled(i, j) = weights_.grad()(i, j) * (-learning_rate_);
117 }
118 }
119 weights_ = Variable<T>(weights_.data() + weight_grad_scaled, true);
120
121 Tensor<T> bias_grad_scaled(bias_.grad().rows(), bias_.grad().cols());
122 for (size_t i = 0; i < bias_.grad().rows(); ++i) {
123 for (size_t j = 0; j < bias_.grad().cols(); ++j) {
124 bias_grad_scaled(i, j) = bias_.grad()(i, j) * (-learning_rate_);
125 }
126 }
127 bias_ = Variable<T>(bias_.data() + bias_grad_scaled, true);
128
129 return loss.data()(0, 0);
130 }
131
132 template<typename T>
133 Variable<T> SVM<T>::compute_loss(const Variable<T> &X, const std::vector<int> &y) const {
134 size_t n_samples = X.rows();
135
136 if (kernel_type_ == KernelType::LINEAR) {
137 // Linear SVM loss: L = (1/2) * ||w||^2 + C * sum(max(0, 1 - y_i * (w^T * x_i + b)))
138
139 // Regularization term: (1/2) * ||w||^2
140 Variable<T> w_squared = weights_.transpose().dot(weights_);
141 Variable<T> reg_term = w_squared * Variable<T>(Tensor<T>(1, 1, 0.5), false);
142
143 // Hinge loss term
144 Variable<T> hinge_loss_sum = Variable<T>(Tensor<T>(1, 1, 0.0), false);
145
146 for (size_t i = 0; i < n_samples; ++i) {
147 // Extract sample x_i
148 Tensor<T> x_i_data(1, X.cols());
149 for (size_t j = 0; j < X.cols(); ++j) {
150 x_i_data(0, j) = X.data()(i, j);
151 }
152 Variable<T> x_i = Variable<T>(x_i_data, false);
153
154 // Compute decision function: w^T * x_i + b
155 // weights_ is (n_features, 1), x_i is (1, n_features)
156 // x_i.dot(weights_) should give (1, 1) result
157 Variable<T> decision = x_i.dot(weights_) + bias_;
158
159 // Compute margin: y_i * decision
160 T y_i = static_cast<T>(y[i]);
161 Variable<T> margin = decision * Variable<T>(Tensor<T>(1, 1, y_i), false);
162
163 // Hinge loss: max(0, 1 - margin)
164 T margin_val = margin.data()(0, 0);
165 if (margin_val < 1.0) {
166 Variable<T> hinge = Variable<T>(Tensor<T>(1, 1, 1.0), false) - margin;
167 hinge_loss_sum = hinge_loss_sum + hinge;
168 }
169 }
170
171 // Total loss: regularization + C * hinge_loss
172 Variable<T> C_var = Variable<T>(Tensor<T>(1, 1, C_), false);
173 return reg_term + C_var * hinge_loss_sum;
174
175 } else {
176 // Kernel SVM - dual formulation
177 // This is more complex and would require implementing the full dual optimization
178 // For now, we'll use a simplified approach
179
180 Variable<T> loss_sum = Variable<T>(Tensor<T>(1, 1, 0.0), false);
181
182 for (size_t i = 0; i < n_samples; ++i) {
183 // Extract sample x_i
184 Tensor<T> x_i_data(1, X.cols());
185 for (size_t j = 0; j < X.cols(); ++j) {
186 x_i_data(0, j) = X.data()(i, j);
187 }
188 Variable<T> x_i = Variable<T>(x_i_data, false);
189
190 // Simplified kernel-based decision function
191 Variable<T> decision = bias_;
192
193 // Add contributions from at most the first 10 training samples (simplified)
194 for (size_t k = 0; k < std::min(n_samples, static_cast<size_t>(10)); ++k) {
195 Tensor<T> x_k_data(1, X.cols());
196 for (size_t j = 0; j < X.cols(); ++j) {
197 x_k_data(0, j) = X.data()(k, j);
198 }
199 Variable<T> x_k = Variable<T>(x_k_data, false);
200
201 Variable<T> kernel_val = kernel(x_i, x_k);
202
203 // Use a subset of weights as dual variables
204 if (k < weights_.rows()) {
205 Tensor<T> alpha_k(1, 1, weights_.data()(k, 0));
206 Variable<T> alpha_k_var = Variable<T>(alpha_k, false);
207 decision = decision + alpha_k_var * kernel_val * Variable<T>(Tensor<T>(1, 1, static_cast<T>(y[k])), false);
208 }
209 }
210
211 // Hinge loss
212 T y_i = static_cast<T>(y[i]);
213 Variable<T> margin = decision * Variable<T>(Tensor<T>(1, 1, y_i), false);
214
215 T margin_val = margin.data()(0, 0);
216 if (margin_val < 1.0) {
217 Variable<T> hinge = Variable<T>(Tensor<T>(1, 1, 1.0), false) - margin;
218 loss_sum = loss_sum + hinge;
219 }
220 }
221
222 return loss_sum;
223 }
224 }
225
226 template<typename T>
227 Variable<T> SVM<T>::kernel(const Variable<T> &x1, const Variable<T> &x2) const {
228 switch (kernel_type_) {
229 case KernelType::LINEAR:
230 return x1.dot(x2.transpose());
231
232 case KernelType::RBF: {
233 // RBF kernel: exp(-gamma * ||x1 - x2||^2)
234 Variable<T> diff = x1 - x2;
235 Variable<T> squared_norm = diff.dot(diff.transpose());
236 Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, -gamma_), false);
237 return (gamma_var * squared_norm).exp();
238 }
239
240 case KernelType::POLYNOMIAL: {
241 // Polynomial kernel: (gamma * x1^T * x2 + coef0)^degree
242 Variable<T> dot_product = x1.dot(x2.transpose());
243 Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, gamma_), false);
244 Variable<T> coef0_var = Variable<T>(Tensor<T>(1, 1, coef0_), false);
245 Variable<T> base = gamma_var * dot_product + coef0_var;
246
247 // Integer power by repeated multiplication: base^degree
248 Variable<T> result = base;
249 for (int d = 1; d < degree_; ++d) {
250 result = result * base;
251 }
252 return result;
253 }
254
255 case KernelType::SIGMOID: {
256 // Sigmoid kernel: tanh(gamma * x1^T * x2 + coef0)
257 Variable<T> dot_product = x1.dot(x2.transpose());
258 Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, gamma_), false);
259 Variable<T> coef0_var = Variable<T>(Tensor<T>(1, 1, coef0_), false);
260 return (gamma_var * dot_product + coef0_var).tanh();
261 }
262
263 default:
264 return x1.dot(x2.transpose());
265 }
266 }
267
268 template<typename T>
269 Variable<T> SVM<T>::to_variable(const Tensor<T> &matrix, bool requires_grad) const {
270 return Variable<T>(matrix, requires_grad);
271 }
272
273 template<typename T>
274 std::vector<int> SVM<T>::predict(const Tensor<T> &X) const {
275 if (!is_fitted_) {
276 throw std::runtime_error("Model must be fitted before prediction");
277 }
278
279 std::vector<T> decision_values = decision_function(X);
280 std::vector<int> predictions(decision_values.size());
281
282 for (size_t i = 0; i < decision_values.size(); ++i) {
283 predictions[i] = (decision_values[i] >= 0) ? classes_[1] : classes_[0];
284 }
285
286 return predictions;
287 }
288
289 template<typename T>
290 std::vector<T> SVM<T>::decision_function(const Tensor<T> &X) const {
291 if (!is_fitted_) {
292 throw std::runtime_error("Model must be fitted before prediction");
293 }
294
295 std::vector<T> decisions(X.rows());
296
297 for (size_t i = 0; i < X.rows(); ++i) {
298 if (kernel_type_ == KernelType::LINEAR) {
299 // Linear decision function: w^T * x + b
300 T decision = bias_.data()(0, 0);
301 for (size_t j = 0; j < X.cols(); ++j) {
302 decision += weights_.data()(j, 0) * X(i, j);
303 }
304 decisions[i] = decision;
305 } else {
306 // Kernel-based decision function (simplified)
307 decisions[i] = bias_.data()(0, 0);
308 // This would require storing support vectors and computing kernel values
309 // For now, using a simplified approach
310 }
311 }
312
313 return decisions;
314 }
315
316 template<typename T>
317 Tensor<T> SVM<T>::predict_proba(const Tensor<T> &X) const {
318 std::vector<T> decision_values = decision_function(X);
319 Tensor<T> probabilities(X.rows(), 2);
320
321 for (size_t i = 0; i < decision_values.size(); ++i) {
322 // Convert decision function to probability using sigmoid
323 T prob_positive = 1.0 / (1.0 + std::exp(-decision_values[i]));
324 probabilities(i, 0) = 1.0 - prob_positive;
325 probabilities(i, 1) = prob_positive;
326 }
327
328 return probabilities;
329 }
330
331 template<typename T>
332 Tensor<T> SVM<T>::support_vectors() const {
333 return support_vectors_;
334 }
335
336 template<typename T>
337 std::vector<size_t> SVM<T>::support() const {
338 return support_indices_;
339 }
340
341 template<typename T>
342 std::vector<T> SVM<T>::dual_coef() const {
343 return dual_coef_;
344 }
345
346 template<typename T>
347 T SVM<T>::intercept() const {
348 return bias_.data()(0, 0);
349 }
350
351 // Explicit template instantiations
352 template class SVM<float>;
353 template class SVM<double>;
354
355} // namespace ml
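
The linear branch of compute_loss() minimizes the primal objective L(w, b) = (1/2) * ||w||^2 + C * sum_i max(0, 1 - y_i * (w^T * x_i + b)) by gradient descent. As a cross-check on the update rule, here is a minimal standalone sketch of one subgradient step on plain std::vector data; hinge_step and its parameters are hypothetical names for illustration, not part of the ml:: API.

#include <cstddef>
#include <vector>

// One subgradient step on L(w, b) = 0.5*||w||^2 + C * sum_i max(0, 1 - y_i*(w.x_i + b)).
void hinge_step(std::vector<double> &w, double &b,
                const std::vector<std::vector<double>> &X,
                const std::vector<int> &y, double C, double lr) {
    std::vector<double> gw(w); // dL/dw starts at w (gradient of the regularizer)
    double gb = 0.0;
    for (std::size_t i = 0; i < X.size(); ++i) {
        double f = b; // decision value w^T * x_i + b
        for (std::size_t j = 0; j < w.size(); ++j) f += w[j] * X[i][j];
        if (y[i] * f < 1.0) { // sample violates the margin: hinge term is active
            for (std::size_t j = 0; j < w.size(); ++j) gw[j] -= C * y[i] * X[i][j];
            gb -= C * y[i];
        }
    }
    for (std::size_t j = 0; j < w.size(); ++j) w[j] -= lr * gw[j];
    b -= lr * gb;
}

Repeating this step until the change in loss falls below tol mirrors the convergence test in fit() above.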
std::vector< size_t > support() const
Get support vector indices.
Definition svm.cpp:337
std::vector< T > decision_function(const Tensor< T > &X) const
Compute the decision function for samples.
Definition svm.cpp:290
Tensor< T > support_vectors() const
Get support vectors.
Definition svm.cpp:332
Tensor< T > predict_proba(const Tensor< T > &X) const
Predict class probabilities for samples.
Definition svm.cpp:317
SVM(KernelType kernel_type=KernelType::RBF, T C=1.0, T gamma=1.0, int degree=3, T coef0=0.0, T tol=1e-3, size_t max_iter=1000, T learning_rate=0.01)
Constructor.
Definition svm.cpp:12
void fit(const Tensor< T > &X, const std::vector< int > &y)
Fit the SVM model to training data using automatic differentiation.
Definition svm.cpp:31
T intercept() const
Get intercept term.
Definition svm.cpp:347
std::vector< T > dual_coef() const
Get dual coefficients.
Definition svm.cpp:342
std::vector< int > predict(const Tensor< T > &X) const
Predict class labels for samples.
Definition svm.cpp:274
static Tensor random(const std::vector< size_t > &shape)
Create a random tensor with values between 0 and 1.
Definition tensor.cpp:302
Namespace containing traditional machine learning algorithms.
Definition kmeans.hpp:15
KernelType
Kernel function types for SVM.
Definition svm.hpp:21
@ RBF
Radial Basis Function kernel: K(x, y) = exp(-gamma * ||x - y||^2)
@ LINEAR
Linear kernel: K(x, y) = x^T * y.
@ SIGMOID
Sigmoid kernel: K(x, y) = tanh(gamma * x^T * y + coef0)
@ POLYNOMIAL
Polynomial kernel: K(x, y) = (gamma * x^T * y + coef0)^degree.
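
For reference, the documented kernel formulas can be sanity-checked against the Variable-based kernel() above with a plain-vector sketch; these free functions are hypothetical helpers, not part of the library.

#include <cmath>
#include <cstddef>
#include <vector>

double dot(const std::vector<double> &a, const std::vector<double> &b) {
    double s = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
    return s;
}
double linear_k(const std::vector<double> &x, const std::vector<double> &z) {
    return dot(x, z); // x^T * z
}
double rbf_k(const std::vector<double> &x, const std::vector<double> &z, double gamma) {
    double d2 = 0.0; // squared Euclidean distance ||x - z||^2
    for (std::size_t i = 0; i < x.size(); ++i) d2 += (x[i] - z[i]) * (x[i] - z[i]);
    return std::exp(-gamma * d2); // exp(-gamma * ||x - z||^2)
}
double poly_k(const std::vector<double> &x, const std::vector<double> &z,
              double gamma, double coef0, int degree) {
    return std::pow(gamma * dot(x, z) + coef0, degree); // (gamma * x^T * z + coef0)^degree
}
double sigmoid_k(const std::vector<double> &x, const std::vector<double> &z,
                 double gamma, double coef0) {
    return std::tanh(gamma * dot(x, z) + coef0); // tanh(gamma * x^T * z + coef0)
}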
Support Vector Machine implementation with automatic differentiation.
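
A minimal usage sketch, assuming the Tensor<T>(rows, cols, fill) constructor and element access used throughout svm.cpp above, and that Tensor, SVM, and KernelType all live in namespace ml:

#include "ml/svm.hpp"
#include <iostream>
#include <vector>

int main() {
    // Four 2-D samples of a toy, linearly separable problem.
    ml::Tensor<double> X(4, 2, 0.0);
    X(0, 0) = 0.0; X(0, 1) = 0.0;
    X(1, 0) = 0.2; X(1, 1) = 0.1;
    X(2, 0) = 1.0; X(2, 1) = 1.0;
    X(3, 0) = 0.9; X(3, 1) = 1.1;
    std::vector<int> y = {0, 0, 1, 1};

    ml::SVM<double> clf(ml::KernelType::LINEAR, /*C=*/1.0);
    clf.fit(X, y); // gradient descent on the primal hinge loss

    for (int p : clf.predict(X)) std::cout << p << ' ';
    std::cout << '\n';
}

With the LINEAR kernel, fit() trains w and b directly; predict() then thresholds decision_function() at zero, mapping negative values to classes_[0] and the rest to classes_[1].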