// Constructor: hyperparameter validation (the guarding conditions are elided in this excerpt).
    throw std::invalid_argument("C must be positive");
// ...
    throw std::invalid_argument("gamma must be positive");
// ...
    throw std::invalid_argument("learning_rate must be positive");

// fit(): input validation.
if (X.rows() != y.size()) {
    throw std::invalid_argument("Number of samples in X and y must match");
}
// ...
if (classes_.size() != 2) {
    throw std::invalid_argument("Currently only binary classification is supported");
}
// ...
for (size_t i = 0; i < y.size(); ++i) {
    // ...
}
// fit(): gradient-descent training loop.
loss_history_.clear();

std::cout << "Training SVM with automatic differentiation..." << std::endl;

// ... (main loop over iterations up to max_iter; each step yields the current loss)
loss_history_.push_back(loss);

if (iter % 100 == 0) {
    std::cout << "Iteration " << iter << ", Loss: " << loss << std::endl;
}

// Convergence check on the change in loss between consecutive iterations.
if (loss_history_.size() > 1) {
    T loss_diff = std::abs(loss_history_[loss_history_.size() - 1] -
                           loss_history_[loss_history_.size() - 2]);
    // ...
    std::cout << "Converged at iteration " << iter << std::endl;
}
// ...
std::cout << "Training completed." << std::endl;
// Parameter update: apply the scaled gradients and return the scalar loss.
// (The declaration of weight_grad_scaled and the loop body that fills it are
//  elided in this excerpt; they mirror the bias update below.)
for (size_t i = 0; i < weights_.grad().rows(); ++i) {
    for (size_t j = 0; j < weights_.grad().cols(); ++j) {
        // ...
    }
}
weights_ = Variable<T>(weights_.data() + weight_grad_scaled, true);

Tensor<T> bias_grad_scaled(bias_.grad().rows(), bias_.grad().cols());
for (size_t i = 0; i < bias_.grad().rows(); ++i) {
    for (size_t j = 0; j < bias_.grad().cols(); ++j) {
        bias_grad_scaled(i, j) = bias_.grad()(i, j) * (-learning_rate_);
    }
}
bias_ = Variable<T>(bias_.data() + bias_grad_scaled, true);

return loss.data()(0, 0);
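In symbols, the step above is plain gradient descent on the primal parameters, with the gradients supplied by the Variable<T> autodiff graph:

w <- w - learning_rate * dL/dw
b <- b - learning_rate * dL/db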
template <typename T>
Variable<T> SVM<T>::compute_loss(const Variable<T> &X, const std::vector<int> &y) const {
    size_t n_samples = X.rows();
    // ...
    // Regularization term: 0.5 * w^T w.
    Variable<T> w_squared = weights_.transpose().dot(weights_);
    Variable<T> reg_term = w_squared * Variable<T>(Tensor<T>(1, 1, 0.5), false);

    // Accumulator for the hinge-loss terms.
    Variable<T> hinge_loss_sum = Variable<T>(Tensor<T>(1, 1, 0.0), false);

    for (size_t i = 0; i < n_samples; ++i) {
        // Extract sample i as a 1 x n_features row.
        Tensor<T> x_i_data(1, X.cols());
        for (size_t j = 0; j < X.cols(); ++j) {
            x_i_data(0, j) = X.data()(i, j);
        }
        Variable<T> x_i = Variable<T>(x_i_data, false);

        // Linear decision value: x_i * w + b.
        Variable<T> decision = x_i.dot(weights_) + bias_;

        // Functional margin: y_i * decision.
        T y_i = static_cast<T>(y[i]);
        Variable<T> margin = decision * Variable<T>(Tensor<T>(1, 1, y_i), false);

        // Add the hinge term (1 - margin) only when the margin is violated.
        T margin_val = margin.data()(0, 0);
        if (margin_val < 1.0) {
            Variable<T> hinge = Variable<T>(Tensor<T>(1, 1, 1.0), false) - margin;
            hinge_loss_sum = hinge_loss_sum + hinge;
        }
    }

    Variable<T> C_var = Variable<T>(Tensor<T>(1, 1, C_), false);
    return reg_term + C_var * hinge_loss_sum;
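With labels assumed to be encoded as -1/+1 (y[i] is cast to T and used directly), the branch above evaluates the standard soft-margin primal objective:

L(w, b) = 0.5 * ||w||^2 + C * sum_i max(0, 1 - y_i * (w^T x_i + b))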
    // Kernelized branch: the decision value is built from kernel evaluations
    // against training points instead of an explicit weight vector.
    Variable<T> loss_sum = Variable<T>(Tensor<T>(1, 1, 0.0), false);

    for (size_t i = 0; i < n_samples; ++i) {
        // Extract sample i as a 1 x n_features row.
        Tensor<T> x_i_data(1, X.cols());
        for (size_t j = 0; j < X.cols(); ++j) {
            x_i_data(0, j) = X(i, j);
        }
        Variable<T> x_i = Variable<T>(x_i_data, false);

        // Start from the bias and accumulate alpha_k * y_k * K(x_i, x_k).
        Variable<T> decision = bias_;

        // Note: as written, the sum runs over at most the first 10 training points.
        for (size_t k = 0; k < std::min(n_samples, static_cast<size_t>(10)); ++k) {
            Tensor<T> x_k_data(1, X.cols());
            for (size_t j = 0; j < X.cols(); ++j) {
                x_k_data(0, j) = X.data()(k, j);
            }
            Variable<T> x_k = Variable<T>(x_k_data, false);

            Variable<T> kernel_val = kernel(x_i, x_k);

            // The coefficients alpha_k are stored in the weight tensor.
            if (k < weights_.rows()) {
                Tensor<T> alpha_k(1, 1, weights_.data()(k, 0));
                Variable<T> alpha_k_var = Variable<T>(alpha_k, false);
                decision = decision + alpha_k_var * kernel_val *
                           Variable<T>(Tensor<T>(1, 1, static_cast<T>(y[k])), false);
            }
        }

        // Hinge loss on the functional margin, as in the linear branch.
        T y_i = static_cast<T>(y[i]);
        Variable<T> margin = decision * Variable<T>(Tensor<T>(1, 1, y_i), false);

        T margin_val = margin.data()(0, 0);
        if (margin_val < 1.0) {
            Variable<T> hinge = Variable<T>(Tensor<T>(1, 1, 1.0), false) - margin;
            loss_sum = loss_sum + hinge;
        }
    }
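The kernelized branch therefore builds the decision value in dual form,

f(x_i) = b + sum_k alpha_k * y_k * K(x_i, x_k),

where the coefficients alpha_k are read out of the weight tensor and, as written, the sum is truncated to at most the first 10 training points (std::min(n_samples, 10)) rather than running over all support vectors.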
template <typename T>
Variable<T> SVM<T>::kernel(const Variable<T> &x1, const Variable<T> &x2) const {
    switch (kernel_type_) {
        case KernelType::LINEAR:
            // Linear kernel: K(x, y) = x^T * y.
            return x1.dot(x2.transpose());

        case KernelType::RBF: {
            // RBF kernel: K(x, y) = exp(-gamma * ||x - y||^2).
            Variable<T> diff = x1 - x2;
            Variable<T> squared_norm = diff.dot(diff.transpose());
            Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, -gamma_), false);
            return (gamma_var * squared_norm).exp();
        }

        case KernelType::POLYNOMIAL: {
            // Polynomial kernel: K(x, y) = (gamma * x^T * y + coef0)^degree.
            Variable<T> dot_product = x1.dot(x2.transpose());
            Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, gamma_), false);
            Variable<T> coef0_var = Variable<T>(Tensor<T>(1, 1, coef0_), false);
            Variable<T> base = gamma_var * dot_product + coef0_var;
            // ... (the elided lines raise base to the configured degree and return the result)
        }

        case KernelType::SIGMOID: {
            // Sigmoid kernel: K(x, y) = tanh(gamma * x^T * y + coef0).
            Variable<T> dot_product = x1.dot(x2.transpose());
            Variable<T> gamma_var = Variable<T>(Tensor<T>(1, 1, gamma_), false);
            Variable<T> coef0_var = Variable<T>(Tensor<T>(1, 1, coef0_), false);
            return (gamma_var * dot_product + coef0_var).tanh();
        }

        default:
            // Fallback: behave as the linear kernel.
            return x1.dot(x2.transpose());
    }
template <typename T>
Variable<T> SVM<T>::to_variable(const Tensor<T> &matrix, bool requires_grad) const {
    return Variable<T>(matrix, requires_grad);
}

// Guard in the prediction routines: the model must be fitted first.
    throw std::runtime_error("Model must be fitted before prediction");
// ...
    throw std::runtime_error("Model must be fitted before prediction");
// ...

// Per-sample / per-feature loops in the prediction path (bodies elided).
for (size_t i = 0; i < X.rows(); ++i) {
    // ...
    for (size_t j = 0; j < X.cols(); ++j) {
        // ...
    }
}

// Accessors return the fitted state.
return support_vectors_;    // support_vectors()
// ...
return support_indices_;    // support()
// ...
return bias_.data()(0, 0);  // intercept()
std::vector< size_t > support() const
Get support vector indices.
std::vector< T > decision_function(const Tensor< T > &X) const
Compute the decision function for samples.
Tensor< T > support_vectors() const
Get support vectors.
Tensor< T > predict_proba(const Tensor< T > &X) const
Predict class probabilities for samples.
SVM(KernelType kernel_type=KernelType::RBF, T C=1.0, T gamma=1.0, int degree=3, T coef0=0.0, T tol=1e-3, size_t max_iter=1000, T learning_rate=0.01)
Constructor.
void fit(const Tensor< T > &X, const std::vector< int > &y)
Fit the SVM model to training data using automatic differentiation.
T intercept() const
Get intercept term.
std::vector< T > dual_coef() const
Get dual coefficients.
std::vector< int > predict(const Tensor< T > &X) const
Predict class labels for samples.
static Tensor random(const std::vector< size_t > &shape)
Create a random tensor with values between 0 and 1.
Namespace containing traditional machine learning algorithms.
KernelType
Kernel function types for SVM.
RBF
Radial Basis Function kernel: K(x, y) = exp(-gamma * ||x - y||^2).
LINEAR
Linear kernel: K(x, y) = x^T * y.
SIGMOID
Sigmoid kernel: K(x, y) = tanh(gamma * x^T * y + coef0).
POLYNOMIAL
Polynomial kernel: K(x, y) = (gamma * x^T * y + coef0)^degree.
Support Vector Machine implementation with automatic differentiation.
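A minimal usage sketch based on the interface documented above. The header name, any namespace qualification, the -1/+1 label encoding, and the three-argument Tensor constructor (rows, cols, fill value, mirroring the Tensor<T>(1, 1, value) calls in the listing) are assumptions for illustration, not confirmed by this page.

#include <iostream>
#include <vector>

#include "svm.h"  // assumed header name; adjust to the actual project layout
// If SVM, Tensor, and KernelType live in a namespace, add the appropriate
// using-declaration here.

int main() {
    // Toy binary problem: 4 samples with 2 features each.
    Tensor<double> X(4, 2, 0.0);
    X(0, 0) =  1.0; X(0, 1) =  1.0;
    X(1, 0) =  2.0; X(1, 1) =  2.0;
    X(2, 0) = -1.0; X(2, 1) = -1.0;
    X(3, 0) = -2.0; X(3, 1) = -2.0;
    std::vector<int> y = {1, 1, -1, -1};  // labels assumed to be -1/+1

    // Linear kernel, C = 1.0; remaining hyperparameters keep their defaults.
    SVM<double> clf(KernelType::LINEAR, 1.0);
    clf.fit(X, y);

    for (int label : clf.predict(X)) {
        std::cout << label << ' ';
    }
    std::cout << "\nintercept: " << clf.intercept() << std::endl;
    return 0;
}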