20 if (features.
rows() != labels.
rows()) {
21 throw std::invalid_argument(
"Features and labels must have same number of samples");
34 return features_.rows();
41 size_t end_idx = std::min(start_idx + batch_size, size());
42 size_t actual_batch_size = end_idx - start_idx;
45 MatrixD batch_features(actual_batch_size, features_.cols());
46 MatrixD batch_labels(actual_batch_size, labels_.cols());
48 return {batch_features, batch_labels};
60 dataset_(dataset), batch_size_(batch_size), shuffle_(shuffle), current_idx_(0) {
67 return current_idx_ < dataset_.size();
74 throw std::runtime_error(
"No more batches available");
77 auto batch = dataset_.get_batch(current_idx_, batch_size_);
78 current_idx_ += batch_size_;
95 std::ifstream file(filename);
96 if (!file.is_open()) {
97 throw std::runtime_error(
"Could not open file: " + filename);
100 std::vector<std::vector<double>> data;
104 if (has_header && std::getline(file, line)) {
108 while (std::getline(file, line)) {
109 std::vector<double> row;
110 std::stringstream ss(line);
113 while (std::getline(ss, cell, delimiter)) {
115 row.push_back(std::stod(cell));
116 }
catch (
const std::exception &) {
132 size_t rows = data.size();
133 size_t cols = data[0].size();
136 for (
size_t i = 0; i < rows; ++i) {
137 for (
size_t j = 0; j < cols && j < data[i].size(); ++j) {
138 result(i, j) = data[i][j];
146 const std::vector<size_t> &feature_cols,
147 const std::vector<size_t> &label_cols,
148 bool has_header,
char delimiter) {
153 MatrixD features(full_data.
rows(), feature_cols.size());
154 MatrixD labels(full_data.
rows(), label_cols.size());
158 return {features, labels};
166 return MatrixD(target_height, target_width);
170 size_t target_width,
size_t target_height) {
173 std::vector<MatrixD> images;
202 MatrixD result(labels.size(), num_classes);
204 for (
size_t i = 0; i < labels.size(); ++i) {
205 if (labels[i] >= 0 &&
static_cast<size_t>(labels[i]) < num_classes) {
206 result(i, labels[i]) = 1.0;
223 return {train_set, val_set, test_set};
static MatrixD load_csv(const std::string &filename, bool has_header=true, char delimiter=',')
Load CSV file into a matrix.
static std::pair< MatrixD, MatrixD > load_features_labels(const std::string &filename, const std::vector< size_t > &feature_cols, const std::vector< size_t > &label_cols, bool has_header=true, char delimiter=',')
Load specific columns as features and labels.
void reset()
Reset iterator to start of dataset.
std::pair< MatrixD, MatrixD > next_batch()
Get the next batch of data.
DataLoader(const Dataset< T > &dataset, size_t batch_size, bool shuffle=false)
Constructor for data loader.
bool has_next() const
Check if more batches are available in current epoch.
std::pair< MatrixD, MatrixD > get_batch(size_t start_idx, size_t batch_size) const
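Extract a batch of samples from the dataset, starting at start_idx. The batch is clamped at the end of the dataset, so it may contain fewer than batch_size samples.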
Dataset()
Default constructor for empty dataset.
void shuffle()
Randomly shuffle the dataset samples.
void add_sample(const std::vector< T > &feature, const std::vector< T > &label)
Add a single sample to the dataset.
size_t size() const
Get the number of samples in the dataset.
static MatrixD load_image(const std::string &filename, size_t target_width=0, size_t target_height=0)
Load a single image file.
static std::vector< MatrixD > load_images_from_directory(const std::string &directory_path, size_t target_width=0, size_t target_height=0)
Load all images from a directory.
static std::tuple< Dataset< double >, Dataset< double >, Dataset< double > > train_val_test_split(const Dataset< double > &data, double train_ratio=0.7, double val_ratio=0.15)
Split dataset into training, validation, and test sets.
static MatrixD one_hot_encode(const std::vector< int > &labels, size_t num_classes)
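Convert categorical labels to one-hot encoding. Labels outside the range [0, num_classes) are skipped, so no entry is set to 1.0 in the corresponding row.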
static MatrixD standardize(const MatrixD &data)
Standardize data to zero mean and unit variance.
static MatrixD normalize(const MatrixD &data, double min_val=0.0, double max_val=1.0)
Normalize data to specified range.
size_t cols() const
Get the number of columns.
size_t rows() const
Get the number of rows.
Data loading and preprocessing utilities for deep learning.
::utils::Matrix< double > MatrixD