Deep Learning Algorithm Implementations 1.0.0
C++ implementations of fundamental deep learning algorithms
optimizers.cpp
#include "optimizers.hpp"  // assumed header name; declares SGD, Adam, AdamW, RMSprop, StepLR, and AutogradOptimizer
#include <cmath>
#include <algorithm>
#include <cstddef>

namespace dl::optimization {

    // SGD Implementation
    template<typename T>
    SGD<T>::SGD(std::vector<Variable<T>*> parameters,
                T lr,
                T momentum,
                T weight_decay,
                bool nesterov)
        : AutogradOptimizer<T>(parameters), lr_(lr), momentum_(momentum),
          weight_decay_(weight_decay), nesterov_(nesterov) {
        initialize_momentum_buffers();
    }

    template<typename T>
    void SGD<T>::initialize_momentum_buffers() {
        momentum_buffers_.clear();
        momentum_buffers_.reserve(this->parameters_.size());

        for (const auto* param : this->parameters_) {
            // TODO: Initialize momentum buffer with same shape as parameter
            // momentum_buffers_.emplace_back(param->data().rows(), param->data().cols());
        }
    }

    template<typename T>
    void SGD<T>::step() {
        for (size_t i = 0; i < this->parameters_.size(); ++i) {
            auto* param = this->parameters_[i];

            // TODO: Implement SGD update logic
            // 1. Apply weight decay if specified
            // 2. Update momentum buffer if momentum > 0
            // 3. Apply Nesterov momentum if enabled
            // 4. Update parameter: param = param - lr * effective_grad
            // (see the reference sketch that follows this function)
        }
    }
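
    // --------------------------------------------------------------------
    // Reference sketch (not part of the exercise skeleton above): the SGD
    // update written out over a raw, flat parameter buffer. The flat-array
    // view, the free-function form, and the names below (param, grad,
    // momentum_buf, n) are illustrative assumptions; the real step() works
    // on Variable<T> gradients instead.
    // --------------------------------------------------------------------
    template<typename T>
    void sgd_update_sketch(T* param, const T* grad, T* momentum_buf,
                           std::size_t n, T lr, T momentum,
                           T weight_decay, bool nesterov) {
        for (std::size_t j = 0; j < n; ++j) {
            T g = grad[j];
            if (weight_decay != T(0)) {
                g += weight_decay * param[j];                       // 1. fold L2 weight decay into the gradient
            }
            if (momentum != T(0)) {
                momentum_buf[j] = momentum * momentum_buf[j] + g;   // 2. update the momentum buffer
                g = nesterov ? g + momentum * momentum_buf[j]       // 3. Nesterov look-ahead
                             : momentum_buf[j];
            }
            param[j] -= lr * g;                                     // 4. param = param - lr * effective_grad
        }
    }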

    // Adam Implementation
    template<typename T>
    Adam<T>::Adam(std::vector<Variable<T>*> parameters,
                  T lr,
                  T beta1,
                  T beta2,
                  T eps,
                  T weight_decay)
        : AutogradOptimizer<T>(parameters), lr_(lr), beta1_(beta1), beta2_(beta2),
          eps_(eps), weight_decay_(weight_decay), step_count_(0) {
        initialize_state();
    }

    template<typename T>
    void Adam<T>::initialize_state() {
        exp_avg_.clear();
        exp_avg_sq_.clear();
        exp_avg_.reserve(this->parameters_.size());
        exp_avg_sq_.reserve(this->parameters_.size());

        for (const auto* param : this->parameters_) {
            // TODO: Initialize first and second moment estimates
            // exp_avg_.emplace_back(param->data().rows(), param->data().cols());
            // exp_avg_sq_.emplace_back(param->data().rows(), param->data().cols());
        }
    }

    template<typename T>
    void Adam<T>::step() {
        step_count_++;

        for (size_t i = 0; i < this->parameters_.size(); ++i) {
            auto* param = this->parameters_[i];

            // TODO: Implement Adam update logic
            // 1. Apply weight decay if specified
            // 2. Update biased first moment estimate: m_t = beta1 * m_{t-1} + (1 - beta1) * grad
            // 3. Update biased second moment estimate: v_t = beta2 * v_{t-1} + (1 - beta2) * grad^2
            // 4. Compute bias-corrected first moment: m_hat = m_t / (1 - beta1^t)
            // 5. Compute bias-corrected second moment: v_hat = v_t / (1 - beta2^t)
            // 6. Update parameter: param = param - lr * m_hat / (sqrt(v_hat) + eps)
            // (see the reference sketch that follows this function)
        }
    }
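
    // --------------------------------------------------------------------
    // Reference sketch (not part of the exercise skeleton above): the Adam
    // update over a raw, flat parameter buffer. The flat-array view and the
    // names below (param, grad, exp_avg, exp_avg_sq, n, t) are illustrative
    // assumptions; t is the 1-based step count.
    // --------------------------------------------------------------------
    template<typename T>
    void adam_update_sketch(T* param, const T* grad, T* exp_avg, T* exp_avg_sq,
                            std::size_t n, long t, T lr, T beta1, T beta2,
                            T eps, T weight_decay) {
        const T bias1 = T(1) - std::pow(beta1, static_cast<T>(t));   // 1 - beta1^t
        const T bias2 = T(1) - std::pow(beta2, static_cast<T>(t));   // 1 - beta2^t
        for (std::size_t j = 0; j < n; ++j) {
            T g = grad[j];
            if (weight_decay != T(0)) {
                g += weight_decay * param[j];                        // 1. L2-style weight decay added to the gradient
            }
            exp_avg[j]    = beta1 * exp_avg[j]    + (T(1) - beta1) * g;       // 2. biased first moment m_t
            exp_avg_sq[j] = beta2 * exp_avg_sq[j] + (T(1) - beta2) * g * g;   // 3. biased second moment v_t
            const T m_hat = exp_avg[j] / bias1;                      // 4. bias-corrected first moment
            const T v_hat = exp_avg_sq[j] / bias2;                   // 5. bias-corrected second moment
            param[j] -= lr * m_hat / (std::sqrt(v_hat) + eps);       // 6. parameter update
        }
    }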

    // AdamW Implementation
    template<typename T>
    AdamW<T>::AdamW(std::vector<Variable<T>*> parameters,
                    T lr,
                    T beta1,
                    T beta2,
                    T eps,
                    T weight_decay)
        : AutogradOptimizer<T>(parameters), lr_(lr), beta1_(beta1), beta2_(beta2),
          eps_(eps), weight_decay_(weight_decay), step_count_(0) {
        initialize_state();
    }

    template<typename T>
    void AdamW<T>::initialize_state() {
        exp_avg_.clear();
        exp_avg_sq_.clear();
        exp_avg_.reserve(this->parameters_.size());
        exp_avg_sq_.reserve(this->parameters_.size());

        for (const auto* param : this->parameters_) {
            // TODO: Initialize first and second moment estimates
            // exp_avg_.emplace_back(param->data().rows(), param->data().cols());
            // exp_avg_sq_.emplace_back(param->data().rows(), param->data().cols());
        }
    }

    template<typename T>
    void AdamW<T>::step() {
        step_count_++;

        for (size_t i = 0; i < this->parameters_.size(); ++i) {
            auto* param = this->parameters_[i];

            // TODO: Implement AdamW update logic
            // 1. Update biased first moment estimate: m_t = beta1 * m_{t-1} + (1 - beta1) * grad
            // 2. Update biased second moment estimate: v_t = beta2 * v_{t-1} + (1 - beta2) * grad^2
            // 3. Compute bias-corrected first moment: m_hat = m_t / (1 - beta1^t)
            // 4. Compute bias-corrected second moment: v_hat = v_t / (1 - beta2^t)
            // 5. Apply decoupled weight decay: param = param * (1 - lr * weight_decay)
            // 6. Update parameter: param = param - lr * m_hat / (sqrt(v_hat) + eps)
            // (see the reference sketch that follows this function)
        }
    }
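
    // --------------------------------------------------------------------
    // Reference sketch (not part of the exercise skeleton above): the AdamW
    // update over a raw, flat parameter buffer. Same illustrative assumptions
    // as the Adam sketch; the only difference is that weight decay is applied
    // directly to the parameters (decoupled) instead of to the gradient.
    // --------------------------------------------------------------------
    template<typename T>
    void adamw_update_sketch(T* param, const T* grad, T* exp_avg, T* exp_avg_sq,
                             std::size_t n, long t, T lr, T beta1, T beta2,
                             T eps, T weight_decay) {
        const T bias1 = T(1) - std::pow(beta1, static_cast<T>(t));   // 1 - beta1^t
        const T bias2 = T(1) - std::pow(beta2, static_cast<T>(t));   // 1 - beta2^t
        for (std::size_t j = 0; j < n; ++j) {
            const T g = grad[j];
            exp_avg[j]    = beta1 * exp_avg[j]    + (T(1) - beta1) * g;       // 1. biased first moment m_t
            exp_avg_sq[j] = beta2 * exp_avg_sq[j] + (T(1) - beta2) * g * g;   // 2. biased second moment v_t
            const T m_hat = exp_avg[j] / bias1;                      // 3. bias-corrected first moment
            const T v_hat = exp_avg_sq[j] / bias2;                   // 4. bias-corrected second moment
            param[j] *= (T(1) - lr * weight_decay);                  // 5. decoupled weight decay
            param[j] -= lr * m_hat / (std::sqrt(v_hat) + eps);       // 6. gradient step
        }
    }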

    // RMSprop Implementation
    template<typename T>
    RMSprop<T>::RMSprop(std::vector<Variable<T>*> parameters,
                        T lr,
                        T alpha,
                        T eps,
                        T weight_decay,
                        T momentum)
        : AutogradOptimizer<T>(parameters), lr_(lr), alpha_(alpha), eps_(eps),
          weight_decay_(weight_decay), momentum_(momentum) {
        initialize_state();
    }

    template<typename T>
    void RMSprop<T>::initialize_state() {
        square_avg_.clear();
        momentum_buffer_.clear();
        square_avg_.reserve(this->parameters_.size());
        momentum_buffer_.reserve(this->parameters_.size());

        for (const auto* param : this->parameters_) {
            // TODO: Initialize moving average of squared gradients
            // square_avg_.emplace_back(param->data().rows(), param->data().cols());
            // if (momentum_ > 0) {
            //     momentum_buffer_.emplace_back(param->data().rows(), param->data().cols());
            // }
        }
    }

    template<typename T>
    void RMSprop<T>::step() {
        for (size_t i = 0; i < this->parameters_.size(); ++i) {
            auto* param = this->parameters_[i];

            // TODO: Implement RMSprop update logic
            // 1. Apply weight decay if specified
            // 2. Update moving average of squared gradients: v_t = alpha * v_{t-1} + (1 - alpha) * grad^2
            // 3. Compute update: update = grad / (sqrt(v_t) + eps)
            // 4. Apply momentum if specified: buf = momentum * buf + update
            // 5. Update parameter: param = param - lr * (momentum > 0 ? buf : update)
            // (see the reference sketch that follows this function)
        }
    }
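
    // --------------------------------------------------------------------
    // Reference sketch (not part of the exercise skeleton above): the RMSprop
    // update over a raw, flat parameter buffer. The flat-array view and the
    // names below (param, grad, square_avg, momentum_buf, n) are illustrative
    // assumptions made for this sketch only.
    // --------------------------------------------------------------------
    template<typename T>
    void rmsprop_update_sketch(T* param, const T* grad, T* square_avg,
                               T* momentum_buf, std::size_t n, T lr, T alpha,
                               T eps, T weight_decay, T momentum) {
        for (std::size_t j = 0; j < n; ++j) {
            T g = grad[j];
            if (weight_decay != T(0)) {
                g += weight_decay * param[j];                               // 1. fold L2 weight decay into the gradient
            }
            square_avg[j] = alpha * square_avg[j] + (T(1) - alpha) * g * g; // 2. moving average of squared gradients
            const T update = g / (std::sqrt(square_avg[j]) + eps);          // 3. scaled gradient
            if (momentum != T(0)) {
                momentum_buf[j] = momentum * momentum_buf[j] + update;      // 4. momentum buffer
                param[j] -= lr * momentum_buf[j];                           // 5. momentum step
            } else {
                param[j] -= lr * update;                                    // 5. plain step
            }
        }
    }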

    // StepLR Implementation
    template<typename T>
    void StepLR<T>::step() {
        last_epoch_++;
        if (last_epoch_ % step_size_ == 0) {
            T new_lr = base_lr_ * std::pow(gamma_, last_epoch_ / step_size_);
            this->optimizer_->set_lr(new_lr);
        }
    }
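
    // Worked example of the schedule StepLR<T>::step() above produces,
    // assuming (illustratively) that step() is called once per epoch,
    // last_epoch_ starts at 0, base_lr = 0.1, gamma = 0.1, step_size = 30:
    //   epochs  1..29 : lr stays at 0.1
    //   epochs 30..59 : lr = 0.1 * 0.1^1 = 0.01
    //   epochs 60..89 : lr = 0.1 * 0.1^2 = 0.001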

    // Explicit template instantiations
    template class SGD<float>;
    template class SGD<double>;
    template class Adam<float>;
    template class Adam<double>;
    template class AdamW<float>;
    template class AdamW<double>;
    template class RMSprop<float>;
    template class RMSprop<double>;
    template class StepLR<float>;
    template class StepLR<double>;

} // namespace dl::optimization
Referenced declarations

AutogradOptimizer
    Base class for autograd-compatible optimizers.

SGD
    Stochastic Gradient Descent optimizer with autograd support.
    SGD(std::vector<Variable<T>*> parameters, T lr, T momentum = 0.0, T weight_decay = 0.0, bool nesterov = false)
    void step() override: perform one SGD step.

Adam
    Adam optimizer with autograd support.
    Adam(std::vector<Variable<T>*> parameters, T lr = 1e-3, T beta1 = 0.9, T beta2 = 0.999, T eps = 1e-8, T weight_decay = 0.0)
    void step() override: perform one Adam step.

AdamW
    AdamW optimizer with autograd support.
    AdamW(std::vector<Variable<T>*> parameters, T lr = 1e-3, T beta1 = 0.9, T beta2 = 0.999, T eps = 1e-8, T weight_decay = 1e-2)
    void step() override: perform one AdamW step.

RMSprop
    RMSprop optimizer with autograd support.
    RMSprop(std::vector<Variable<T>*> parameters, T lr = 1e-2, T alpha = 0.99, T eps = 1e-8, T weight_decay = 0.0, T momentum = 0.0)
    void step() override: perform one RMSprop step.

StepLR
    Step learning rate scheduler: decays the learning rate by gamma every step_size epochs.
    void step() override: update the learning rate.

Variable (defined in autograd.hpp, line 58)
    Variable class that supports automatic differentiation.

Together these classes provide PyTorch-like optimizers with automatic differentiation support.