Deep Learning Algorithm Implementations 1.0.0
C++ implementations of fundamental deep learning algorithms
data_loader.cpp
#include "data_loader.hpp"  // corresponding header (name assumed); declares Dataset, DataLoader, CSVLoader, ImageLoader, Preprocessor, and MatrixD

#include <algorithm>
#include <cmath>       // std::sqrt, used by the standardize sketch below
#include <filesystem>  // C++17 directory iteration, used by the ImageLoader sketch below
#include <fstream>
#include <limits>      // std::numeric_limits, used by the normalize sketch below
#include <numeric>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace dl {
    namespace utils {
        // Dataset Implementation
        template<typename T>
        Dataset<T>::Dataset() : features_(), labels_() {}

        template<typename T>
        Dataset<T>::Dataset(const MatrixD &features, const MatrixD &labels) : features_(features), labels_(labels) {
            // Features and labels must describe the same number of samples.
            if (features.rows() != labels.rows()) {
                throw std::invalid_argument("Features and labels must have same number of samples");
            }
        }

        template<typename T>
        void Dataset<T>::add_sample(const std::vector<T> &feature, const std::vector<T> &label) {
            // Appending one sample means growing both matrices by a row while keeping
            // the feature-label correspondence; a sketch follows.
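            // Minimal sketch, assuming Matrix offers only the (rows, cols) constructor,
            // rows()/cols(), and operator()(i, j): rebuild both matrices one row larger.
            const size_t old_rows = features_.rows();
            const size_t feature_cols = (old_rows == 0) ? feature.size() : features_.cols();
            const size_t label_cols = (old_rows == 0) ? label.size() : labels_.cols();
            if (feature.size() != feature_cols || label.size() != label_cols) {
                throw std::invalid_argument("Sample dimensions do not match dataset");
            }

            MatrixD new_features(old_rows + 1, feature_cols);
            MatrixD new_labels(old_rows + 1, label_cols);
            for (size_t i = 0; i < old_rows; ++i) {
                for (size_t j = 0; j < feature_cols; ++j) {
                    new_features(i, j) = features_(i, j);
                }
                for (size_t j = 0; j < label_cols; ++j) {
                    new_labels(i, j) = labels_(i, j);
                }
            }
            for (size_t j = 0; j < feature_cols; ++j) {
                new_features(old_rows, j) = static_cast<double>(feature[j]);
            }
            for (size_t j = 0; j < label_cols; ++j) {
                new_labels(old_rows, j) = static_cast<double>(label[j]);
            }
            features_ = new_features;
            labels_ = new_labels;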
        }

        template<typename T>
        size_t Dataset<T>::size() const {
            // Number of samples equals the number of feature rows.
            return features_.rows();
        }

        template<typename T>
        std::pair<MatrixD, MatrixD> Dataset<T>::get_batch(size_t start_idx, size_t batch_size) const {
            // Extract the contiguous slice [start_idx, end_idx) of features and labels.
            size_t end_idx = std::min(start_idx + batch_size, size());
            size_t actual_batch_size = (end_idx > start_idx) ? end_idx - start_idx : 0;  // guard against start_idx past the end

            MatrixD batch_features(actual_batch_size, features_.cols());
            MatrixD batch_labels(actual_batch_size, labels_.cols());

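            // Row-by-row copy; a minimal sketch, assuming a const overload of Matrix::operator()(i, j).
            for (size_t i = 0; i < actual_batch_size; ++i) {
                for (size_t j = 0; j < features_.cols(); ++j) {
                    batch_features(i, j) = features_(start_idx + i, j);
                }
                for (size_t j = 0; j < labels_.cols(); ++j) {
                    batch_labels(i, j) = labels_(start_idx + i, j);
                }
            }
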
            return {batch_features, batch_labels};
        }

        template<typename T>
        void Dataset<T>::shuffle() {
            // Shuffle the order of samples while maintaining feature-label correspondence.
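            // Sketch: permute row indices with the standard library, then rebuild both
            // matrices in the permuted order so rows stay paired.
            std::vector<size_t> indices(size());
            std::iota(indices.begin(), indices.end(), static_cast<size_t>(0));
            std::mt19937 rng(std::random_device{}());
            std::shuffle(indices.begin(), indices.end(), rng);

            MatrixD shuffled_features(features_.rows(), features_.cols());
            MatrixD shuffled_labels(labels_.rows(), labels_.cols());
            for (size_t i = 0; i < indices.size(); ++i) {
                for (size_t j = 0; j < features_.cols(); ++j) {
                    shuffled_features(i, j) = features_(indices[i], j);
                }
                for (size_t j = 0; j < labels_.cols(); ++j) {
                    shuffled_labels(i, j) = labels_(indices[i], j);
                }
            }
            features_ = shuffled_features;
            labels_ = shuffled_labels;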
        }

        // DataLoader Implementation
        template<typename T>
        DataLoader<T>::DataLoader(const Dataset<T> &dataset, size_t batch_size, bool shuffle) :
            dataset_(dataset), batch_size_(batch_size), shuffle_(shuffle), current_idx_(0) {
            // All members are set in the initializer list; nothing else to do here.
        }

        template<typename T>
        bool DataLoader<T>::has_next() const {
            // More batches remain while the cursor has not reached the end of the dataset.
            return current_idx_ < dataset_.size();
        }

        template<typename T>
        std::pair<MatrixD, MatrixD> DataLoader<T>::next_batch() {
            // Return the next batch and advance the cursor.
            if (!has_next()) {
                throw std::runtime_error("No more batches available");
            }

            auto batch = dataset_.get_batch(current_idx_, batch_size_);
            current_idx_ += batch_size_;

            return batch;
        }

        template<typename T>
        void DataLoader<T>::reset() {
            // Rewind to the beginning of the dataset for a new epoch.
            current_idx_ = 0;
            if (shuffle_) {
                // Reshuffling would require a mutable copy of the dataset; the dataset
                // was passed in as const, so it is left untouched here.
            }
        }

        // CSVLoader Implementation
        MatrixD CSVLoader::load_csv(const std::string &filename, bool has_header, char delimiter) {
            std::ifstream file(filename);
            if (!file.is_open()) {
                throw std::runtime_error("Could not open file: " + filename);
            }

            std::vector<std::vector<double>> data;
            std::string line;

            // Skip the header line if present
            if (has_header && std::getline(file, line)) {
                // Header consumed; nothing to do with it here
            }

            while (std::getline(file, line)) {
                std::vector<double> row;
                std::stringstream ss(line);
                std::string cell;

                while (std::getline(ss, cell, delimiter)) {
                    try {
                        row.push_back(std::stod(cell));
                    } catch (const std::exception &) {
                        // Non-numeric or empty cells fall back to 0.0
                        row.push_back(0.0);
                    }
                }

                if (!row.empty()) {
                    data.push_back(row);
                }
            }

            if (data.empty()) {
                return MatrixD(0, 0);
            }

            // Convert the parsed rows into a matrix; ragged rows are truncated to the
            // width of the first row, and missing cells keep the matrix's default value.
            size_t rows = data.size();
            size_t cols = data[0].size();
            MatrixD result(rows, cols);

            for (size_t i = 0; i < rows; ++i) {
                for (size_t j = 0; j < cols && j < data[i].size(); ++j) {
                    result(i, j) = data[i][j];
                }
            }

            return result;
        }

        std::pair<MatrixD, MatrixD> CSVLoader::load_features_labels(const std::string &filename,
                                                                    const std::vector<size_t> &feature_cols,
                                                                    const std::vector<size_t> &label_cols,
                                                                    bool has_header, char delimiter) {
            MatrixD full_data = load_csv(filename, has_header, delimiter);

            // Extract features and labels based on the requested column indices
            MatrixD features(full_data.rows(), feature_cols.size());
            MatrixD labels(full_data.rows(), label_cols.size());
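            // Column-extraction sketch; assumes every index in feature_cols and
            // label_cols refers to a valid column of full_data.
            for (size_t i = 0; i < full_data.rows(); ++i) {
                for (size_t j = 0; j < feature_cols.size(); ++j) {
                    features(i, j) = full_data(i, feature_cols[j]);
                }
                for (size_t j = 0; j < label_cols.size(); ++j) {
                    labels(i, j) = full_data(i, label_cols[j]);
                }
            }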

            return {features, labels};
        }

        // ImageLoader Implementation
        MatrixD ImageLoader::load_image(const std::string &filename, size_t target_width, size_t target_height) {
            // TODO: Implement image decoding, typically with a library such as OpenCV or stb_image.
            // Until then, return a placeholder matrix of the requested size (0x0 by default).
            return MatrixD(target_height, target_width);
        }

        std::vector<MatrixD> ImageLoader::load_images_from_directory(const std::string &directory_path,
                                                                     size_t target_width, size_t target_height) {
            // Scan the directory and delegate each image file to load_image(); see the sketch below.
            std::vector<MatrixD> images;
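            // Directory-scan sketch using C++17 <filesystem> (an assumption about the
            // project's language standard); only common raster extensions are picked up.
            namespace fs = std::filesystem;
            for (const auto &entry : fs::directory_iterator(directory_path)) {
                if (!entry.is_regular_file()) {
                    continue;
                }
                const std::string ext = entry.path().extension().string();
                if (ext == ".png" || ext == ".jpg" || ext == ".jpeg" || ext == ".bmp") {
                    images.push_back(load_image(entry.path().string(), target_width, target_height));
                }
            }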
            return images;
        }

        // Preprocessor Implementation
        MatrixD Preprocessor::normalize(const MatrixD &data, double min_val, double max_val) {
            // Scale data to the [min_val, max_val] range using
            // (x - data_min) / (data_max - data_min) * (max_val - min_val) + min_val.
            MatrixD result(data.rows(), data.cols());
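            // Min-max sketch of the formula above; assumes a const overload of Matrix::operator()(i, j).
            double data_min = std::numeric_limits<double>::max();
            double data_max = std::numeric_limits<double>::lowest();
            for (size_t i = 0; i < data.rows(); ++i) {
                for (size_t j = 0; j < data.cols(); ++j) {
                    data_min = std::min(data_min, data(i, j));
                    data_max = std::max(data_max, data(i, j));
                }
            }

            const double range = data_max - data_min;
            for (size_t i = 0; i < data.rows(); ++i) {
                for (size_t j = 0; j < data.cols(); ++j) {
                    // Constant data (range == 0) maps everything to min_val.
                    const double scaled = (range == 0.0) ? 0.0 : (data(i, j) - data_min) / range;
                    result(i, j) = scaled * (max_val - min_val) + min_val;
                }
            }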

            return result;
        }

        MatrixD Preprocessor::standardize(const MatrixD &data) {
            // Z-score standardization: (x - mean) / std_dev, computed per feature (column).
            MatrixD result(data.rows(), data.cols());
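            // Per-column sketch; assumes a const overload of Matrix::operator()(i, j).
            const size_t n = data.rows();
            for (size_t j = 0; j < data.cols(); ++j) {
                double mean = 0.0;
                for (size_t i = 0; i < n; ++i) {
                    mean += data(i, j);
                }
                mean /= (n > 0 ? static_cast<double>(n) : 1.0);

                double variance = 0.0;
                for (size_t i = 0; i < n; ++i) {
                    const double diff = data(i, j) - mean;
                    variance += diff * diff;
                }
                const double std_dev = std::sqrt(variance / (n > 0 ? static_cast<double>(n) : 1.0));

                for (size_t i = 0; i < n; ++i) {
                    // Constant columns (std_dev == 0) standardize to 0.
                    result(i, j) = (std_dev == 0.0) ? 0.0 : (data(i, j) - mean) / std_dev;
                }
            }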

            return result;
        }

        MatrixD Preprocessor::one_hot_encode(const std::vector<int> &labels, size_t num_classes) {
            // Each label becomes a row with a single 1.0 in its class column;
            // out-of-range labels are left as all-zero rows.
            MatrixD result(labels.size(), num_classes);

            for (size_t i = 0; i < labels.size(); ++i) {
                if (labels[i] >= 0 && static_cast<size_t>(labels[i]) < num_classes) {
                    result(i, static_cast<size_t>(labels[i])) = 1.0;
                }
            }

            return result;
        }

        // train_test_split was removed in favor of train_val_test_split below.

        std::tuple<Dataset<double>, Dataset<double>, Dataset<double>>
        Preprocessor::train_val_test_split(const Dataset<double> &data, double train_ratio, double val_ratio) {
            // Split the dataset into train, validation, and test partitions; see the sketch below.
            Dataset<double> train_set;
            Dataset<double> val_set;
            Dataset<double> test_set;
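            // Contiguous-split sketch built on Dataset::get_batch(); assumes
            // train_ratio + val_ratio <= 1, with the remainder forming the test set.
            // A production version would shuffle a copy of the data first.
            const size_t n = data.size();
            const size_t n_train = static_cast<size_t>(static_cast<double>(n) * train_ratio);
            const size_t n_val = static_cast<size_t>(static_cast<double>(n) * val_ratio);
            const size_t n_test = n - n_train - n_val;

            auto train_batch = data.get_batch(0, n_train);
            auto val_batch = data.get_batch(n_train, n_val);
            auto test_batch = data.get_batch(n_train + n_val, n_test);

            train_set = Dataset<double>(train_batch.first, train_batch.second);
            val_set = Dataset<double>(val_batch.first, val_batch.second);
            test_set = Dataset<double>(test_batch.first, test_batch.second);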

            return {train_set, val_set, test_set};
        }

        // Explicit template instantiations
        template class Dataset<double>;
        template class Dataset<float>;
        template class DataLoader<double>;
        template class DataLoader<float>;
    } // namespace utils
} // namespace dl
Data loading and preprocessing utilities for deep learning.

Referenced declarations (briefs from the project headers):

Dataset<T>
    Dataset()
        Default constructor for empty dataset.
    void add_sample(const std::vector<T> &feature, const std::vector<T> &label)
        Add a single sample to the dataset.
    size_t size() const
        Get the number of samples in the dataset.
    std::pair<MatrixD, MatrixD> get_batch(size_t start_idx, size_t batch_size) const
        Extract a batch of samples from the dataset.
    void shuffle()
        Randomly shuffle the dataset samples.

DataLoader<T>
    DataLoader(const Dataset<T> &dataset, size_t batch_size, bool shuffle=false)
        Constructor for data loader.
    bool has_next() const
        Check if more batches are available in current epoch.
    std::pair<MatrixD, MatrixD> next_batch()
        Get the next batch of data.
    void reset()
        Reset iterator to start of dataset.

CSVLoader
    static MatrixD load_csv(const std::string &filename, bool has_header=true, char delimiter=',')
        Load CSV file into a matrix.
    static std::pair<MatrixD, MatrixD> load_features_labels(const std::string &filename, const std::vector<size_t> &feature_cols, const std::vector<size_t> &label_cols, bool has_header=true, char delimiter=',')
        Load specific columns as features and labels.

ImageLoader
    static MatrixD load_image(const std::string &filename, size_t target_width=0, size_t target_height=0)
        Load a single image file.
    static std::vector<MatrixD> load_images_from_directory(const std::string &directory_path, size_t target_width=0, size_t target_height=0)
        Load all images from a directory.

Preprocessor
    static MatrixD normalize(const MatrixD &data, double min_val=0.0, double max_val=1.0)
        Normalize data to specified range.
    static MatrixD standardize(const MatrixD &data)
        Standardize data to zero mean and unit variance.
    static MatrixD one_hot_encode(const std::vector<int> &labels, size_t num_classes)
        Convert categorical labels to one-hot encoding.
    static std::tuple<Dataset<double>, Dataset<double>, Dataset<double>> train_val_test_split(const Dataset<double> &data, double train_ratio=0.7, double val_ratio=0.15)
        Split dataset into training, validation, and test sets.

Matrix (matrix.hpp)
    size_t rows() const
        Get the number of rows. (matrix.hpp:194)
    size_t cols() const
        Get the number of columns. (matrix.hpp:200)

MatrixD
    Alias for ::utils::Matrix<double>.
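Usage sketch (not part of the documented sources): the snippet below shows how these pieces are typically wired together. The file name "iris.csv", the column indices, and the batch size are made-up example values, and the include name is assumed to match the header documented above.

    #include "data_loader.hpp"

    #include <iostream>
    #include <utility>

    int main() {
        using namespace dl::utils;

        // Hypothetical layout: columns 0-3 are features, column 4 is the label.
        auto xy = CSVLoader::load_features_labels("iris.csv", {0, 1, 2, 3}, {4});

        auto features = Preprocessor::standardize(xy.first);
        Dataset<double> dataset(features, xy.second);

        DataLoader<double> loader(dataset, /*batch_size=*/16, /*shuffle=*/true);
        while (loader.has_next()) {
            auto batch = loader.next_batch();
            std::cout << "batch of " << batch.first.rows() << " samples\n";
        }
        loader.reset();  // rewind for the next epoch
        return 0;
    }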