Dataset container for machine learning data.
Dataset container for machine learning data. A flexible container class that holds features and labels for training, validation, or testing. Supports batch processing, data shuffling, and efficient memory management.
// Usage example: build a Dataset from a feature matrix and a label matrix.
// NOTE(review): (100, 784) / (100, 10) presumably mean 100 samples with 784
// features and 10 label values each — confirm against the MatrixD constructor.
MatrixD features(100, 784);
MatrixD labels(100, 10);
Dataset<double> dataset(features, labels);
// Append one extra sample, given as a feature vector plus a label vector.
std::vector<double> sample_features = {1.0, 2.0, 3.0};
std::vector<double> sample_label = {0.0, 1.0, 0.0};
dataset.add_sample(sample_features, sample_label);
// Request a batch with start_idx = 0 and batch_size = 32.
auto batch = dataset.get_batch(0, 32);
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
/// @brief Dataset container for machine learning data.
///
/// Holds features and labels for training, validation, or testing.
///
/// @tparam T Element type of the per-sample feature and label vectors.
template <typename T>
class Dataset {
public:
    /// @brief Add a single sample to the dataset.
    ///
    /// @param feature Feature values of the new sample.
    /// @param label   Label values of the new sample.
    void add_sample(const std::vector<T> &feature, const std::vector<T> &label);

    /// @brief Extract a batch of samples from the dataset.
    ///
    /// @param start_idx  Index at which the batch starts.
    /// @param batch_size Number of samples requested for the batch.
    /// @return A pair of MatrixD values (the return type is MatrixD for every T);
    ///         presumably (features, labels) — confirm against the implementation.
    std::pair<MatrixD, MatrixD> get_batch(size_t start_idx, size_t batch_size) const;

private:
};
/// @brief Pairs a Dataset with a batch size and a shuffle flag.
///
/// The dataset is held by const reference, so the Dataset passed to the
/// constructor must outlive the DataLoader.
///
/// @tparam T Element type of the referenced Dataset.
template<typename T>
class DataLoader {
public:
/// @brief Construct a loader over an existing dataset.
///
/// @param dataset    Dataset to read from; it is referenced, not copied.
/// @param batch_size Batch size to use.
/// @param shuffle    Shuffle flag; defaults to false.
DataLoader(const Dataset<T> &dataset, size_t batch_size, bool shuffle = false);
private:
// Non-owning reference to the source dataset (see the lifetime note above).
const Dataset<T> &dataset_;
// Batch size supplied at construction.
size_t batch_size_;
// Shuffle flag supplied at construction.
bool shuffle_;
// NOTE(review): presumably the position of the next batch within the
// dataset — confirm against the implementation.
size_t current_idx_;
};
class CSVLoader {
public:
static MatrixD load_csv(
const std::string &filename,
bool has_header =
true,
char delimiter =
',');
const std::vector<size_t> &feature_cols,
const std::vector<size_t> &label_cols,
bool has_header = true, char delimiter = ',');
};
class ImageLoader {
public:
static MatrixD load_image(
const std::string &filename,
size_t target_width = 0,
size_t target_height = 0);
size_t target_width = 0, size_t target_height = 0);
};
/// @brief Static data-preparation helpers.
class Preprocessor {
public:
    /// @brief Split dataset into training, validation, and test sets.
    ///
    /// @param data        Dataset to split.
    /// @param train_ratio Training ratio; defaults to 0.7.
    /// @param val_ratio   Validation ratio; defaults to 0.15.
    /// @return A tuple of three datasets; presumably ordered
    ///         (train, validation, test), with the remainder going to the
    ///         test set — confirm against the implementation.
    static std::tuple<Dataset<double>, Dataset<double>, Dataset<double>> train_val_test_split(
        const Dataset<double> &data, double train_ratio = 0.7, double val_ratio = 0.15);
};
}
}
static MatrixD load_csv(const std::string &filename, bool has_header=true, char delimiter=',')
Load CSV file into a matrix.
static std::pair< MatrixD, MatrixD > load_features_labels(const std::string &filename, const std::vector< size_t > &feature_cols, const std::vector< size_t > &label_cols, bool has_header=true, char delimiter=',')
Load specific columns as features and labels.
void reset()
Reset iterator to start of dataset.
std::pair< MatrixD, MatrixD > next_batch()
Get the next batch of data.
bool has_next() const
Check if more batches are available in current epoch.
std::pair< MatrixD, MatrixD > get_batch(size_t start_idx, size_t batch_size) const
Extract a batch of samples from the dataset.
Dataset()
Default constructor for empty dataset.
void shuffle()
Randomly shuffle the dataset samples.
void add_sample(const std::vector< T > &feature, const std::vector< T > &label)
Add a single sample to the dataset.
size_t size() const
Get the number of samples in the dataset.
static MatrixD load_image(const std::string &filename, size_t target_width=0, size_t target_height=0)
Load a single image file.
static std::vector< MatrixD > load_images_from_directory(const std::string &directory_path, size_t target_width=0, size_t target_height=0)
Load all images from a directory.
static std::tuple< Dataset< double >, Dataset< double >, Dataset< double > > train_val_test_split(const Dataset< double > &data, double train_ratio=0.7, double val_ratio=0.15)
Split dataset into training, validation, and test sets.
static MatrixD one_hot_encode(const std::vector< int > &labels, size_t num_classes)
Convert categorical labels to one-hot encoding.
static MatrixD standardize(const MatrixD &data)
Standardize data to zero mean and unit variance.
static MatrixD normalize(const MatrixD &data, double min_val=0.0, double max_val=1.0)
Normalize data to specified range.
Matrix utility class for deep learning operations.
::utils::Matrix< double > MatrixD