diff --git a/CMakeLists.txt b/CMakeLists.txt index 9913b33..abec1fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,3 +183,24 @@ set (SAVE_LOAD_TEST_SOURCE_FILES "test_model_save_load.cpp") add_executable(${SAVE_LOAD_TEST_TARGET_NAME} ${SAVE_LOAD_TEST_SOURCE_FILES}) target_link_libraries(${SAVE_LOAD_TEST_TARGET_NAME} ${EXTRA_LIBS}) add_test(NAME ModelSaveLoadTests COMMAND test_model_save_load) + +# # # # MNIST DATA LOADER TESTS # # # + +set (MNIST_LOADER_TEST_TARGET_NAME "test_mnist_loader") +set (MNIST_LOADER_TEST_SOURCE_FILES "test_mnist_loader.cpp") +add_executable(${MNIST_LOADER_TEST_TARGET_NAME} ${MNIST_LOADER_TEST_SOURCE_FILES}) +target_link_libraries(${MNIST_LOADER_TEST_TARGET_NAME} ${EXTRA_LIBS}) + +# # # # MNIST TRAINING # # # + +set (TRAIN_MNIST_TARGET_NAME "train_mnist") +set (TRAIN_MNIST_SOURCE_FILES "train_mnist.cpp") +add_executable(${TRAIN_MNIST_TARGET_NAME} ${TRAIN_MNIST_SOURCE_FILES}) +target_link_libraries(${TRAIN_MNIST_TARGET_NAME} ${EXTRA_LIBS}) + +# # # # MNIST TRAINING TESTS # # # + +set (TEST_MNIST_TRAINING_TARGET_NAME "test_mnist_training") +set (TEST_MNIST_TRAINING_SOURCE_FILES "test_mnist_training.cpp") +add_executable(${TEST_MNIST_TRAINING_TARGET_NAME} ${TEST_MNIST_TRAINING_SOURCE_FILES}) +target_link_libraries(${TEST_MNIST_TRAINING_TARGET_NAME} ${EXTRA_LIBS}) diff --git a/MNIST_README.md b/MNIST_README.md new file mode 100644 index 0000000..9b16dc7 --- /dev/null +++ b/MNIST_README.md @@ -0,0 +1,274 @@ +# MNIST Data Loader + +This directory contains a complete MNIST data loader implementation for the Neural Network library. 
+ +## Overview + +The MNIST loader (`mnist_loader.h`) provides functionality to: +- Load MNIST dataset files in IDX format +- Parse and normalize image data (28×28 grayscale images) +- Convert labels to one-hot encoded format +- Access individual samples or batches +- Visualize digits as ASCII art + +## Getting the MNIST Dataset + +Download the MNIST dataset files from: http://yann.lecun.com/exdb/mnist/ + +You need these four files: +- `train-images-idx3-ubyte` (9.9 MB download, ~47 MB unpacked) - 60,000 training images +- `train-labels-idx1-ubyte` (29 KB download, ~60 KB unpacked) - 60,000 training labels +- `t10k-images-idx3-ubyte` (1.6 MB download, ~7.8 MB unpacked) - 10,000 test images +- `t10k-labels-idx1-ubyte` (5 KB download, ~10 KB unpacked) - 10,000 test labels + +**Important**: Download the files, decompress them, and place them in your working directory (or adjust the paths in your code). + +### Quick Download Commands + +```bash +# Download training set +curl -O http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz +curl -O http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + +# Download test set +curl -O http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz +curl -O http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz + +# Decompress all files +gunzip *.gz +``` + +## Usage + +### Basic Loading + +```cpp +#include "mnist_loader.h" + +using namespace ml; + +// Load training dataset +MNISTDataset trainDataset; +loadMNISTDataset( + "train-images-idx3-ubyte", + "train-labels-idx1-ubyte", + trainDataset +); + +// Dataset properties +std::cout << "Samples: " << trainDataset.numSamples << std::endl; +std::cout << "Image size: " << trainDataset.imageSize << std::endl; +std::cout << "Classes: " << trainDataset.numClasses << std::endl; +``` + +### Accessing Individual Samples + +```cpp +// Get a single sample (image + label) +auto sample = getSample(trainDataset, 0); +ml::Mat image = sample.first; // (1, 784) matrix +ml::Mat label = sample.second; // (1, 10) one-hot encoded + +// Get the raw label value (0-9) +int rawLabel = trainDataset.rawLabels[0]; +``` + 
+### Getting Batches for Training + +```cpp +// Get a batch of 32 samples starting at index 0 +int batchSize = 32; +auto batch = getBatch(trainDataset, 0, batchSize); + +ml::Mat batchImages = batch.first; // (32, 784) +ml::Mat batchLabels = batch.second; // (32, 10) +``` + +### Visualizing Digits + +```cpp +// Print an ASCII visualization of a digit +auto sample = getSample(trainDataset, 0); +printMNISTDigit(sample.first, trainDataset.rawLabels[0]); +``` + +## Data Format + +### Images +- **Dimensions**: Each image is 28×28 pixels = 784 values +- **Format**: Flattened row-major order (rows concatenated) +- **Normalization**: Pixel values are normalized to [0.0, 1.0] range +- **Storage**: `trainDataset.images` is a matrix of size (numSamples, 784) + +### Labels +- **Raw Labels**: Integer values 0-9 stored in `trainDataset.rawLabels` +- **One-Hot Encoded**: `trainDataset.labels` is a matrix of size (numSamples, 10) + - Example: Label "3" becomes `[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]` + +## Building and Testing + +### Build the MNIST loader test: + +```bash +# Using Make +make test_mnist_loader + +# Or using CMake directly +cd build +cmake .. +make test_mnist_loader +``` + +### Run the test: + +```bash +# Using Make +make run_test_mnist_loader + +# Or directly +cd build +./test_mnist_loader +``` + +**Note**: The test opens the MNIST files by relative path, so they must be in the directory the binary runs from. Both commands above run it from `build/`, so place the files there. 
+ +## Integration with Neural Network + +Here's a simple example of training a network on MNIST: + +```cpp +#include "network.h" +#include "mnist_loader.h" + +using namespace ml; + +int main() { + // Load MNIST data + MNISTDataset trainDataset; + loadMNISTDataset( + "train-images-idx3-ubyte", + "train-labels-idx1-ubyte", + trainDataset + ); + + // Create network: 784 -> 256 -> 128 -> 10 + Network* network = new Network(); + ILayer* input = new Layer(784, "Input", ActivationType::RELU); + ILayer* hidden1 = new Layer(256, "Hidden1", ActivationType::RELU); + ILayer* hidden2 = new Layer(128, "Hidden2", ActivationType::RELU); + ILayer* output = new Layer(10, "Output", ActivationType::SIGMOID); + + network->setInputLayer(input); + network->connect(input, hidden1); + network->connect(hidden1, hidden2); + network->connect(hidden2, output); + network->setOutputLayer(output); + + // Use Adam optimizer + network->setOptimizerType(OptimizerType::ADAM); + network->init(); + + // Training loop + double learningRate = 0.001; + int epochs = 10; + int batchSize = 1; // Single sample per update + + for (int epoch = 0; epoch < epochs; epoch++) { + for (int i = 0; i < trainDataset.numSamples; i++) { + // Get single sample + auto sample = getSample(trainDataset, i); + + // Forward pass + ml::Mat predicted = network->feed(sample.first); + + // Compute error + ml::Mat error = ml::Diff(sample.second, predicted); + network->getOutputLayer()->setErrors(error); + + // Backward pass + network->backprop(); + + // Update weights + network->updateWeights(learningRate); + } + + std::cout << "Epoch " << (epoch + 1) << " complete" << std::endl; + } + + return 0; +} +``` + +## File Structure + +- `mnist_loader.h` - Main MNIST data loader implementation +- `test_mnist_loader.cpp` - Test suite demonstrating usage +- `MNIST_README.md` - This documentation file + +## Technical Details + +### IDX File Format + +MNIST uses the IDX file format: + +**Image File Header (16 bytes)**: +- Bytes 0-3: Magic 
number (2051 for images) +- Bytes 4-7: Number of images +- Bytes 8-11: Number of rows (28) +- Bytes 12-15: Number of columns (28) +- Remaining: Pixel data (unsigned bytes 0-255) + +**Label File Header (8 bytes)**: +- Bytes 0-3: Magic number (2049 for labels) +- Bytes 4-7: Number of labels +- Remaining: Label data (unsigned bytes 0-9) + +All integers are stored in MSB (Most Significant Byte) first format (big-endian). + +## Next Steps + +After successfully loading MNIST data, you can: + +1. **Implement a complete MNIST training script** with: + - Batch training support + - Training/validation split + - Accuracy evaluation + - Model checkpointing + +2. **Experiment with different architectures**: + - Vary hidden layer sizes + - Try different activation functions + - Test different optimizers (SGD, Adam, RMSprop) + +3. **Add advanced features**: + - Learning rate scheduling + - Early stopping + - Data augmentation + - Cross-entropy loss for better classification + +## Expected Performance + +With the current fully-connected architecture (784→256→128→10) and Adam optimizer: +- **Expected accuracy**: 85-92% on test set +- **Training time**: ~5-10 minutes for 10 epochs (CPU-dependent) + +For better performance, consider: +- Convolutional layers (when implemented) +- Dropout regularization (when implemented) +- Batch normalization (when implemented) + +## Troubleshooting + +**Problem**: "Cannot open file" error +- **Solution**: Make sure MNIST files are in the current working directory or provide full paths + +**Problem**: "Invalid MNIST file" error +- **Solution**: Ensure files are decompressed (.gz files must be gunzipped) + +**Problem**: Segmentation fault or memory errors +- **Solution**: Check that your Matrix library is properly initialized and sized correctly + +## References + +- MNIST Database: http://yann.lecun.com/exdb/mnist/ +- Original Paper: LeCun et al., "Gradient-Based Learning Applied to Document Recognition" (1998) diff --git a/Makefile b/Makefile index 
64be284..085b0eb 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: run_test_training build_dir test_training +.PHONY: run_test_training build_dir test_training test_mnist_loader run_test_mnist_loader train_mnist run_mnist test_mnist_training run_test_mnist_training # Default target all: build_dir @@ -21,3 +21,27 @@ test_training: build_dir run_test_training: test_training cd build && ./test_training +# Build the test_mnist_loader target +test_mnist_loader: build_dir + cd build && make test_mnist_loader + +# Build and run test_mnist_loader +run_test_mnist_loader: test_mnist_loader + cd build && ./test_mnist_loader + +# Build the train_mnist target +train_mnist: build_dir + cd build && make train_mnist + +# Build and run train_mnist +run_mnist: train_mnist + cd build && ./train_mnist + +# Build the test_mnist_training target +test_mnist_training: build_dir + cd build && make test_mnist_training + +# Build and run test_mnist_training +run_test_mnist_training: test_mnist_training + cd build && ./test_mnist_training + diff --git a/loss.h b/loss.h new file mode 100644 index 0000000..4f4ebc4 --- /dev/null +++ b/loss.h @@ -0,0 +1,205 @@ +#pragma once + +#include +#include +#include "Matrix/matrix.h" + +namespace ml { + +/** + * Loss function types + * + * Choosing the right loss function: + * + * MSE (Mean Squared Error): + * - Best for: Regression tasks, simple binary gates (XOR, AND, OR) + * - Output range: Any real number + * - Use when: Predicting continuous values or simple binary problems + * - Example: House price prediction, temperature forecasting, XOR + * + * CROSS_ENTROPY: + * - Best for: Multi-class classification + * - Output range: Probabilities [0, 1] + * - Use when: Classifying into multiple categories (MNIST, ImageNet) + * - Example: Digit recognition (0-9), object classification + * - Note: Pair with Sigmoid or Softmax output activation + * + * BINARY_CROSS_ENTROPY: + * - Best for: Binary classification + * - Output range: Probability [0, 1] + 
* - Use when: Two-class problems with probability interpretation + * - Example: Spam detection, sentiment analysis (positive/negative) + * - Note: Pair with Sigmoid output activation + */ +enum class LossType { + MSE, // Mean Squared Error (for regression) + CROSS_ENTROPY, // Cross-Entropy (for classification) + BINARY_CROSS_ENTROPY // Binary Cross-Entropy (for binary classification) +}; + +/** + * Mean Squared Error (MSE) Loss + * L = (1/2) * sum((target - predicted)^2) + * Gradient: dL/dy = -(target - predicted) = (predicted - target) + */ +template +ml::Mat MSELoss(const ml::Mat& predicted, const ml::Mat& target) { + return ml::Diff(predicted, target); +} + +/** + * Cross-Entropy Loss for multi-class classification + * L = -sum(target * log(predicted)) + * + * For softmax output with one-hot encoded targets: + * Gradient: dL/dy = predicted - target + * + * For sigmoid output (treating each class independently): + * Gradient: dL/dy = -(target / predicted) + (1 - target) / (1 - predicted) + * But when combined with sigmoid, simplifies to: predicted - target + * + * @param predicted Network output (should be probabilities) + * @param target One-hot encoded target labels + * @return Error gradient for backpropagation + */ +template +ml::Mat CrossEntropyLoss(const ml::Mat& predicted, const ml::Mat& target) { + // The gradient of cross-entropy loss with respect to pre-activation (logits) + // when using softmax/sigmoid activation is simply: predicted - target + // This is a beautiful property that makes training stable! 
+ return ml::Diff(predicted, target); +} + +/** + * Binary Cross-Entropy Loss (for binary classification) + * L = -[target * log(predicted) + (1 - target) * log(1 - predicted)] + * Gradient: dL/dy = predicted - target (when combined with sigmoid) + */ +template +ml::Mat BinaryCrossEntropyLoss(const ml::Mat& predicted, const ml::Mat& target) { + return ml::Diff(predicted, target); +} + +/** + * Compute loss value (for monitoring training progress) + * Returns the scalar loss value + */ +template +T ComputeLoss(const ml::Mat& predicted, const ml::Mat& target, LossType lossType) { + T totalLoss = 0.0; + int numSamples = predicted.size().cy; + int numOutputs = predicted.size().cx; + + const T epsilon = 1e-7; // Small constant to avoid log(0) + + switch (lossType) { + case LossType::MSE: { + // Mean Squared Error: (1/2n) * sum((predicted - target)^2) + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numOutputs; j++) { + T diff = predicted.getAt(i, j) - target.getAt(i, j); + totalLoss += diff * diff; + } + } + return totalLoss / (2.0 * numSamples); + } + + case LossType::CROSS_ENTROPY: { + // Cross-Entropy: -(1/n) * sum(target * log(predicted)) + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numOutputs; j++) { + T pred = std::max(epsilon, std::min(T(1.0) - epsilon, predicted.getAt(i, j))); + T targ = target.getAt(i, j); + if (targ > 0) { // Only compute for non-zero targets (one-hot encoding) + totalLoss += -targ * std::log(pred); + } + } + } + return totalLoss / numSamples; + } + + case LossType::BINARY_CROSS_ENTROPY: { + // Binary Cross-Entropy: -(1/n) * sum[target*log(pred) + (1-target)*log(1-pred)] + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numOutputs; j++) { + T pred = std::max(epsilon, std::min(T(1.0) - epsilon, predicted.getAt(i, j))); + T targ = target.getAt(i, j); + totalLoss += -(targ * std::log(pred) + (T(1.0) - targ) * std::log(T(1.0) - pred)); + } + } + return totalLoss / numSamples; + } + + default: + return 
totalLoss; + } +} + +/** + * Compute loss gradient for backpropagation + * Returns the error gradient: dL/dy + */ +template +ml::Mat ComputeLossGradient(const ml::Mat& predicted, const ml::Mat& target, + LossType lossType) { + switch (lossType) { + case LossType::MSE: + return MSELoss(predicted, target); + + case LossType::CROSS_ENTROPY: + return CrossEntropyLoss(predicted, target); + + case LossType::BINARY_CROSS_ENTROPY: + return BinaryCrossEntropyLoss(predicted, target); + + default: + return MSELoss(predicted, target); + } +} + +/** + * Compute classification accuracy + * For multi-class classification with one-hot encoded labels + * + * @param predicted Network output probabilities (batch_size, num_classes) + * @param target One-hot encoded targets (batch_size, num_classes) + * @return Accuracy as a percentage (0-100) + */ +template +T ComputeAccuracy(const ml::Mat& predicted, const ml::Mat& target) { + int numSamples = predicted.size().cy; + int numClasses = predicted.size().cx; + int correct = 0; + + for (int i = 0; i < numSamples; i++) { + // Find predicted class (argmax of predictions) + int predictedClass = 0; + T maxPred = predicted.getAt(i, 0); + for (int j = 1; j < numClasses; j++) { + T pred = predicted.getAt(i, j); + if (pred > maxPred) { + maxPred = pred; + predictedClass = j; + } + } + + // Find true class (argmax of one-hot target) + int trueClass = 0; + T maxTarget = target.getAt(i, 0); + for (int j = 1; j < numClasses; j++) { + T targ = target.getAt(i, j); + if (targ > maxTarget) { + maxTarget = targ; + trueClass = j; + } + } + + if (predictedClass == trueClass) { + correct++; + } + } + + return (T(100.0) * correct) / numSamples; +} + +} // namespace ml diff --git a/main.cpp b/main.cpp index 4830114..7d16f1f 100644 --- a/main.cpp +++ b/main.cpp @@ -52,7 +52,7 @@ void test2() { void test3() { using namespace std; using namespace ml; - typedef int T; + typedef double T; Timer timer; timer.start(); @@ -97,7 +97,7 @@ void test3() { void 
test_crazy_network_1() { using namespace std; using namespace ml; - typedef int T; + typedef double T; Timer timer; timer.start(); diff --git a/mnist_loader.h b/mnist_loader.h new file mode 100644 index 0000000..e99b687 --- /dev/null +++ b/mnist_loader.h @@ -0,0 +1,290 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "Matrix/matrix.h" + +namespace ml { + +// Helper function to reverse bytes for big-endian to little-endian conversion +inline uint32_t reverseInt(uint32_t i) { + unsigned char c1, c2, c3, c4; + c1 = i & 255; + c2 = (i >> 8) & 255; + c3 = (i >> 16) & 255; + c4 = (i >> 24) & 255; + return ((uint32_t)c1 << 24) + ((uint32_t)c2 << 16) + ((uint32_t)c3 << 8) + c4; +} + +// MNIST Dataset container +template +struct MNISTDataset { + ml::Mat images; // Each row is a flattened 28x28 image (784 values) + ml::Mat labels; // Each row is a one-hot encoded label (10 values) + std::vector rawLabels; // Original label values (0-9) + int numSamples; + int imageSize; // 784 for MNIST (28x28) + int numClasses; // 10 for MNIST (digits 0-9) + + MNISTDataset() : numSamples(0), imageSize(784), numClasses(10) {} +}; + +/** + * Read MNIST image file (IDX3-UBYTE format) + * + * File format: + * [offset] [type] [value] [description] + * 0000 32 bit integer 0x00000803(2051) magic number + * 0004 32 bit integer 60000 number of images + * 0008 32 bit integer 28 number of rows + * 0012 32 bit integer 28 number of columns + * 0016 unsigned byte ?? pixel + * 0017 unsigned byte ?? pixel + * ........ + * xxxx unsigned byte ?? 
pixel + */ +template +bool readMNISTImages(const std::string& filename, ml::Mat& images, int& numImages, int& imageSize) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot open file " << filename << std::endl; + return false; + } + + // Read magic number + uint32_t magic = 0; + file.read((char*)&magic, sizeof(magic)); + magic = reverseInt(magic); + if (magic != 2051) { + std::cerr << "Error: Invalid MNIST image file (magic number: " << magic << ")" << std::endl; + return false; + } + + // Read dimensions + uint32_t numImagesU32 = 0, rows = 0, cols = 0; + file.read((char*)&numImagesU32, sizeof(numImagesU32)); + file.read((char*)&rows, sizeof(rows)); + file.read((char*)&cols, sizeof(cols)); + + numImagesU32 = reverseInt(numImagesU32); + rows = reverseInt(rows); + cols = reverseInt(cols); + + numImages = static_cast(numImagesU32); + imageSize = rows * cols; + + std::cout << "Loading " << numImages << " images of size " + << rows << "x" << cols << " = " << imageSize << " pixels" << std::endl; + + // Create matrix: each row is a flattened image + images = ml::Mat(numImages, imageSize, 0); + + // Read pixel data + for (int i = 0; i < numImages; i++) { + for (int j = 0; j < imageSize; j++) { + unsigned char pixel = 0; + file.read((char*)&pixel, sizeof(pixel)); + // Normalize to [0, 1] range + images.setAt(i, j, static_cast(pixel) / 255.0); + } + } + + file.close(); + std::cout << "Successfully loaded " << numImages << " images" << std::endl; + return true; +} + +/** + * Read MNIST label file (IDX1-UBYTE format) + * + * File format: + * [offset] [type] [value] [description] + * 0000 32 bit integer 0x00000801(2049) magic number (MSB first) + * 0004 32 bit integer 60000 number of items + * 0008 unsigned byte ?? label + * 0009 unsigned byte ?? label + * ........ + * xxxx unsigned byte ?? 
label + */ +template +bool readMNISTLabels(const std::string& filename, std::vector& rawLabels, + ml::Mat& oneHotLabels, int& numLabels) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot open file " << filename << std::endl; + return false; + } + + // Read magic number + uint32_t magic = 0; + file.read((char*)&magic, sizeof(magic)); + magic = reverseInt(magic); + if (magic != 2049) { + std::cerr << "Error: Invalid MNIST label file (magic number: " << magic << ")" << std::endl; + return false; + } + + // Read number of labels + uint32_t numLabelsU32 = 0; + file.read((char*)&numLabelsU32, sizeof(numLabelsU32)); + numLabelsU32 = reverseInt(numLabelsU32); + numLabels = static_cast(numLabelsU32); + + std::cout << "Loading " << numLabels << " labels" << std::endl; + + // Read labels + rawLabels.resize(numLabels); + for (int i = 0; i < numLabels; i++) { + unsigned char label = 0; + file.read((char*)&label, sizeof(label)); + rawLabels[i] = static_cast(label); + } + + // Create one-hot encoded labels (10 classes for digits 0-9) + const int numClasses = 10; + oneHotLabels = ml::Mat(numLabels, numClasses, 0); + + for (int i = 0; i < numLabels; i++) { + int label = rawLabels[i]; + if (label >= 0 && label < numClasses) { + oneHotLabels.setAt(i, label, 1.0); + } + } + + file.close(); + std::cout << "Successfully loaded " << numLabels << " labels" << std::endl; + return true; +} + +/** + * Load MNIST dataset from files + * + * @param imageFile Path to MNIST image file (e.g., "train-images-idx3-ubyte") + * @param labelFile Path to MNIST label file (e.g., "train-labels-idx1-ubyte") + * @param dataset Output dataset structure + * @return true if successful, false otherwise + */ +template +bool loadMNISTDataset(const std::string& imageFile, const std::string& labelFile, + MNISTDataset& dataset) { + int numImages = 0, imageSize = 0; + int numLabels = 0; + + // Read images + if (!readMNISTImages(imageFile, dataset.images, 
numImages, imageSize)) { + return false; + } + + // Read labels + if (!readMNISTLabels(labelFile, dataset.rawLabels, dataset.labels, numLabels)) { + return false; + } + + // Verify consistency + if (numImages != numLabels) { + std::cerr << "Error: Number of images (" << numImages + << ") doesn't match number of labels (" << numLabels << ")" << std::endl; + return false; + } + + dataset.numSamples = numImages; + dataset.imageSize = imageSize; + dataset.numClasses = 10; + + std::cout << "MNIST dataset loaded successfully:" << std::endl; + std::cout << " - Samples: " << dataset.numSamples << std::endl; + std::cout << " - Image size: " << dataset.imageSize << " pixels" << std::endl; + std::cout << " - Classes: " << dataset.numClasses << std::endl; + + return true; +} + +/** + * Helper function to get a single training sample + * Returns a pair of (image, label) matrices, each as a single row + */ +template +std::pair, ml::Mat> getSample(const MNISTDataset& dataset, int index) { + if (index < 0 || index >= dataset.numSamples) { + throw std::out_of_range("Sample index out of range"); + } + + // Extract single row for image and label + ml::Mat image(1, dataset.imageSize, 0); + ml::Mat label(1, dataset.numClasses, 0); + + for (int i = 0; i < dataset.imageSize; i++) { + image.setAt(0, i, dataset.images.getAt(index, i)); + } + + for (int i = 0; i < dataset.numClasses; i++) { + label.setAt(0, i, dataset.labels.getAt(index, i)); + } + + return std::make_pair(image, label); +} + +/** + * Helper function to get a batch of samples + * Returns a pair of (images, labels) matrices + */ +template +std::pair, ml::Mat> getBatch(const MNISTDataset& dataset, + int startIdx, int batchSize) { + if (startIdx < 0 || startIdx >= dataset.numSamples) { + throw std::out_of_range("Start index out of range"); + } + + // Clamp batch size to available samples + int actualBatchSize = std::min(batchSize, dataset.numSamples - startIdx); + + ml::Mat images(actualBatchSize, dataset.imageSize, 0); + 
ml::Mat labels(actualBatchSize, dataset.numClasses, 0); + + for (int i = 0; i < actualBatchSize; i++) { + int srcIdx = startIdx + i; + for (int j = 0; j < dataset.imageSize; j++) { + images.setAt(i, j, dataset.images.getAt(srcIdx, j)); + } + for (int j = 0; j < dataset.numClasses; j++) { + labels.setAt(i, j, dataset.labels.getAt(srcIdx, j)); + } + } + + return std::make_pair(images, labels); +} + +/** + * Print ASCII visualization of an MNIST digit + */ +template +void printMNISTDigit(const ml::Mat& image, int label) { + std::cout << "Label: " << label << std::endl; + + // Assume image is either a single row vector (1, 784) or already the pixel values + int numPixels = image.size().cx; + if (numPixels != 784) { + std::cerr << "Error: Image must have 784 pixels" << std::endl; + return; + } + + const char* grayscale = " .:-=+*#%@"; + const int levels = 10; + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + int idx = row * 28 + col; + T pixelValue = image.getAt(0, idx); + int level = static_cast(pixelValue * (levels - 1)); + level = std::min(std::max(level, 0), levels - 1); + std::cout << grayscale[level] << grayscale[level]; + } + std::cout << std::endl; + } +} + +} // namespace ml diff --git a/network.h b/network.h index e177aec..3767efe 100644 --- a/network.h +++ b/network.h @@ -11,6 +11,7 @@ #include "utility.h" #include "activation.h" #include "optimizer.h" +#include "loss.h" #include "thirdparty/jsonxx/jsonxx.h" @@ -62,24 +63,6 @@ namespace ml { mat.pushCol(col); delete[] col; } - - - template - ml::Mat Sigmoid(ml::Mat mat) { - ml::Mat result(mat.size(), 0); - for (int i = 0; i < mat.size().cy; ++i) { - for (int j = 0; j < mat.size().cx; ++j) { - T val = mat.getAt(i, j); - result.setAt(i, j, 1.0 / (1.0 + std::exp(-val))); - } - } - return result; - } - - template - ml::Mat SigGrad(ml::Mat mat) { - return ElementMult(mat, Diff(T(1), mat)); - } } @@ -508,6 +491,16 @@ namespace ml { mOwnsOptimizer = true; } + // Loss function 
configuration + void setLossType(LossType type) { mLossType = type; } + LossType getLossType() const { return mLossType; } + + // Batch training methods + virtual void trainSingle(const ml::Mat& input, const ml::Mat& target, T learningRate); + virtual void trainBatch(const ml::Mat& inputs, const ml::Mat& targets, T learningRate); + virtual T evaluateLoss(const ml::Mat& inputs, const ml::Mat& targets); + virtual T evaluateAccuracy(const ml::Mat& inputs, const ml::Mat& targets); + // ILayer overrides public: virtual void init() override; @@ -552,6 +545,7 @@ namespace ml { ILayer* pOutputLayer; IOptimizer* mOptimizer; bool mOwnsOptimizer; + LossType mLossType; }; @@ -574,6 +568,8 @@ namespace ml { // Default to SGD optimizer mOptimizer = new SGDOptimizer(); mOwnsOptimizer = true; + // Default to Cross-Entropy loss for classification + mLossType = LossType::CROSS_ENTROPY; } template @@ -1197,4 +1193,226 @@ namespace ml { return true; } + /** + * Train on a single sample + * Performs forward pass, backpropagation, and weight update for one input-target pair + */ + template + void Network::trainSingle(const ml::Mat& input, const ml::Mat& target, T learningRate) { + // Forward pass + ml::Mat predicted = feed(input); + + // Compute loss gradient + ml::Mat error = ComputeLossGradient(predicted, target, mLossType); + getOutputLayer()->setErrors(error); + + // Backward pass + backprop(); + + // Update weights + updateWeights(learningRate); + } + + /** + * Train on a batch of samples + * Accumulates gradients over the batch and updates weights once + */ + template + void Network::trainBatch(const ml::Mat& inputs, const ml::Mat& targets, T learningRate) { + int batchSize = inputs.size().cy; + if (batchSize == 0 || inputs.size().cy != targets.size().cy) { + std::cerr << "Error: Invalid batch - inputs and targets must have same number of rows" << std::endl; + return; + } + + // Store weight gradients for accumulation + std::map*, std::map*, ml::Mat>> accumulatedGradients; + + // 
Process each sample in the batch + for (int i = 0; i < batchSize; i++) { + // Extract single sample + ml::Mat sampleInput(1, inputs.size().cx, 0); + ml::Mat sampleTarget(1, targets.size().cx, 0); + + for (int j = 0; j < inputs.size().cx; j++) { + sampleInput.setAt(0, j, inputs.getAt(i, j)); + } + for (int j = 0; j < targets.size().cx; j++) { + sampleTarget.setAt(0, j, targets.getAt(i, j)); + } + + // Forward pass + ml::Mat predicted = feed(sampleInput); + + // Compute loss gradient + ml::Mat error = ComputeLossGradient(predicted, sampleTarget, mLossType); + getOutputLayer()->setErrors(error); + + // Backward pass + backprop(); + + // Accumulate gradients (but don't update weights yet) + std::vector*> toProcess; + std::set*> visited; + + toProcess.push_back(pInputLayer); + visited.insert(pInputLayer); + + for (size_t idx = 0; idx < toProcess.size(); ++idx) { + ILayer* pCurLayer = toProcess[idx]; + if (!pCurLayer) continue; + + for (ILayer* pNextLayer : pCurLayer->getSiblings()) { + if (visited.find(pNextLayer) == visited.end()) { + toProcess.push_back(pNextLayer); + visited.insert(pNextLayer); + } + + ml::Mat errors = pNextLayer->getErrors(); + if (!errors.IsGood()) continue; + + ml::Mat activated = pCurLayer->getActivatedInput(); + if (!activated.IsGood()) continue; + + // Add bias to activated output + ml::Mat activatedWithBias = activated.Copy(); + for (int b = 0; b < ILayer::GetNumBiasNodes(); ++b) + pushBiasCol(activatedWithBias); + + ml::Mat weights = pCurLayer->getWeights(pNextLayer); + if (!weights.IsGood()) continue; + + // Compute gradients + ml::Mat gradients(weights.size(), 0); + int numOutputs = errors.size().cx; + int numInputs = activatedWithBias.size().cx; + + for (int row = 0; row < numOutputs; ++row) { + T err = errors.getAt(0, row); + for (int col = 0; col < numInputs; ++col) { + T act = activatedWithBias.getAt(0, col); + gradients.setAt(row, col, err * act); + } + } + + // Accumulate gradients + if (accumulatedGradients.find(pCurLayer) == 
accumulatedGradients.end() || + accumulatedGradients[pCurLayer].find(pNextLayer) == accumulatedGradients[pCurLayer].end()) { + accumulatedGradients[pCurLayer][pNextLayer] = gradients; + } else { + // Add to existing gradients + for (int r = 0; r < gradients.size().cy; ++r) { + for (int c = 0; c < gradients.size().cx; ++c) { + T oldVal = accumulatedGradients[pCurLayer][pNextLayer].getAt(r, c); + accumulatedGradients[pCurLayer][pNextLayer].setAt(r, c, oldVal + gradients.getAt(r, c)); + } + } + } + } + } + } + + // Average gradients and update weights + int layerIdx = 0; + for (auto& layerPair : accumulatedGradients) { + ILayer* pCurLayer = layerPair.first; + int siblingIdx = 0; + + for (auto& siblingPair : layerPair.second) { + ILayer* pNextLayer = siblingPair.first; + ml::Mat& avgGradients = siblingPair.second; + + // Average the accumulated gradients + for (int r = 0; r < avgGradients.size().cy; ++r) { + for (int c = 0; c < avgGradients.size().cx; ++c) { + T avgVal = avgGradients.getAt(r, c) / batchSize; + avgGradients.setAt(r, c, avgVal); + } + } + + // Update weights using optimizer + ml::Mat weights = pCurLayer->getWeights(pNextLayer); + ml::Mat updatedWeights = weights.Copy(); + + std::string layerKey = pCurLayer->getName() + "_to_" + pNextLayer->getName() + + "_" + std::to_string(layerIdx) + "_" + std::to_string(siblingIdx); + + mOptimizer->updateWeights(updatedWeights, avgGradients, learningRate, layerKey); + pCurLayer->setWeights(pNextLayer, updatedWeights); + + siblingIdx++; + } + layerIdx++; + } + } + + /** + * Evaluate loss on a dataset + * Returns the average loss over all samples + */ + template + T Network::evaluateLoss(const ml::Mat& inputs, const ml::Mat& targets) { + int numSamples = inputs.size().cy; + if (numSamples == 0 || inputs.size().cy != targets.size().cy) { + return T(0.0); + } + + T totalLoss = T(0.0); + + for (int i = 0; i < numSamples; i++) { + // Extract single sample + ml::Mat sampleInput(1, inputs.size().cx, 0); + ml::Mat 
sampleTarget(1, targets.size().cx, 0); + + for (int j = 0; j < inputs.size().cx; j++) { + sampleInput.setAt(0, j, inputs.getAt(i, j)); + } + for (int j = 0; j < targets.size().cx; j++) { + sampleTarget.setAt(0, j, targets.getAt(i, j)); + } + + // Forward pass + ml::Mat predicted = feed(sampleInput); + + // Compute loss for this sample + T sampleLoss = ComputeLoss(predicted, sampleTarget, mLossType); + totalLoss += sampleLoss; + } + + return totalLoss / numSamples; + } + + /** + * Evaluate accuracy on a dataset + * Returns accuracy as a percentage (0-100) + */ + template + T Network::evaluateAccuracy(const ml::Mat& inputs, const ml::Mat& targets) { + int numSamples = inputs.size().cy; + if (numSamples == 0 || inputs.size().cy != targets.size().cy) { + return T(0.0); + } + + // Create predictions matrix + ml::Mat allPredictions(numSamples, targets.size().cx, 0); + + for (int i = 0; i < numSamples; i++) { + // Extract single sample + ml::Mat sampleInput(1, inputs.size().cx, 0); + for (int j = 0; j < inputs.size().cx; j++) { + sampleInput.setAt(0, j, inputs.getAt(i, j)); + } + + // Forward pass + ml::Mat predicted = feed(sampleInput); + + // Store predictions + for (int j = 0; j < targets.size().cx; j++) { + allPredictions.setAt(i, j, predicted.getAt(0, j)); + } + } + + return ComputeAccuracy(allPredictions, targets); + } + } // namespace ml diff --git a/test_comprehensive.cpp b/test_comprehensive.cpp index cff9107..5cbcb93 100644 --- a/test_comprehensive.cpp +++ b/test_comprehensive.cpp @@ -217,7 +217,7 @@ void test_sigmoid_gradient() { Mat halfMat(1, 1, 0); halfMat.setAt(0, 0, 0.5); // Set value explicitly since constructor takes int - Mat grad = SigGrad(halfMat); + Mat grad = SigmoidGrad(halfMat); T gradValue = grad.getAt(0, 0); bool test1 = approxEqual(gradValue, 0.25, 1e-5); printTestResult("σ'(σ(0)) = 0.25", test1); @@ -227,14 +227,14 @@ void test_sigmoid_gradient() { nearZero.setAt(0, 0, 0.001); Mat nearOne(1, 1, 0); nearOne.setAt(0, 0, 0.999); - T gradNearZero 
= SigGrad(nearZero).getAt(0, 0); - T gradNearOne = SigGrad(nearOne).getAt(0, 0); + T gradNearZero = SigmoidGrad(nearZero).getAt(0, 0); + T gradNearOne = SigmoidGrad(nearOne).getAt(0, 0); bool test2 = (gradNearZero < 0.001 && gradNearOne < 0.001); printTestResult("Gradient near 0 and 1 is small", test2); // Test 3: Maximum gradient is at 0.5 Mat values{{0.1, 0.3, 0.5, 0.7, 0.9}}; - Mat grads = SigGrad(values); + Mat grads = SigmoidGrad(values); T maxGrad = grads.getAt(0, 2); // Should be at 0.5 bool test3 = true; for (int i = 0; i < 5; ++i) { @@ -571,6 +571,9 @@ void test_xor_convergence() { network->connect(inputLayer, hiddenLayer); network->connect(hiddenLayer, outputLayer); network->setOutputLayer(outputLayer); + + // Use MSE loss for XOR (better for simple regression-like problems) + network->setLossType(LossType::MSE); network->init(); // XOR training data @@ -596,12 +599,13 @@ void test_xor_convergence() { T finalError = 0; // Training loop + // Note: Using manual training loop instead of trainSingle API to match existing test expectations for (int epoch = 0; epoch < epochs; ++epoch) { T totalError = 0; for (size_t i = 0; i < inputs.size(); ++i) { Mat output = network->feed(inputs[i]); - Mat error = Diff(expected[i], output); + Mat error = Diff(expected[i], output); // target - predicted T sampleError = 0; for (int j = 0; j < error.size().cx; ++j) { diff --git a/test_mnist_loader.cpp b/test_mnist_loader.cpp new file mode 100644 index 0000000..52e557b --- /dev/null +++ b/test_mnist_loader.cpp @@ -0,0 +1,141 @@ +#include +#include +#include "allheader.h" +#include "mnist_loader.h" + +using namespace std; +using namespace ml; +using namespace Utility; + +/** + * Test MNIST data loading functionality + * + * This test demonstrates how to: + * 1. Load MNIST dataset from IDX files + * 2. Access individual samples + * 3. Visualize digits + * 4. 
Get batches of data for training + * + * To run this test, you need MNIST dataset files: + * - train-images-idx3-ubyte + * - train-labels-idx1-ubyte + * - t10k-images-idx3-ubyte (optional, for test set) + * - t10k-labels-idx1-ubyte (optional, for test set) + * + * Download from: http://yann.lecun.com/exdb/mnist/ + */ +void test_mnist_loading() { + BEGIN_TESTS("Testing MNIST Data Loading"); + typedef double T; + + // Path to MNIST data files (adjust these paths as needed) + const string trainImagesFile = "train-images-idx3-ubyte"; + const string trainLabelsFile = "train-labels-idx1-ubyte"; + + // Load training dataset + cout << "\n=== Loading MNIST Training Dataset ===" << endl; + MNISTDataset trainDataset; + + if (!loadMNISTDataset(trainImagesFile, trainLabelsFile, trainDataset)) { + cout << "Failed to load MNIST dataset. Make sure the MNIST files are in the current directory." << endl; + cout << "Download MNIST from: http://yann.lecun.com/exdb/mnist/" << endl; + return; + } + + // Verify dataset properties + cout << "\n=== Dataset Properties ===" << endl; + cout << "Number of training samples: " << trainDataset.numSamples << endl; + cout << "Image dimensions: 28x28 = " << trainDataset.imageSize << " pixels" << endl; + cout << "Number of classes: " << trainDataset.numClasses << endl; + cout << "Images matrix size: (" << trainDataset.images.size().cy + << ", " << trainDataset.images.size().cx << ")" << endl; + cout << "Labels matrix size: (" << trainDataset.labels.size().cy + << ", " << trainDataset.labels.size().cx << ")" << endl; + + // Display first few samples + cout << "\n=== Displaying Sample Digits ===" << endl; + for (int i = 0; i < 3; i++) { + auto sample = getSample(trainDataset, i); + cout << "\nSample " << i << ":" << endl; + printMNISTDigit(sample.first, trainDataset.rawLabels[i]); + + // Show the one-hot encoded label + cout << "One-hot label: ["; + for (int j = 0; j < trainDataset.numClasses; j++) { + cout << sample.second.getAt(0, j); + if (j < 
trainDataset.numClasses - 1) cout << ", "; + } + cout << "]" << endl; + } + + // Test batch loading + cout << "\n=== Testing Batch Loading ===" << endl; + int batchSize = 32; + auto batch = getBatch(trainDataset, 0, batchSize); + + cout << "Batch images matrix: (" << batch.first.size().cy + << ", " << batch.first.size().cx << ")" << endl; + cout << "Batch labels matrix: (" << batch.second.size().cy + << ", " << batch.second.size().cx << ")" << endl; + + // Verify pixel value ranges (should be normalized to [0, 1]) + cout << "\n=== Verifying Data Normalization ===" << endl; + T minVal = 1.0, maxVal = 0.0; + for (int i = 0; i < trainDataset.imageSize; i++) { + T val = trainDataset.images.getAt(0, i); + if (val < minVal) minVal = val; + if (val > maxVal) maxVal = val; + } + cout << "Pixel value range: [" << minVal << ", " << maxVal << "]" << endl; + + // Show label distribution + cout << "\n=== Label Distribution (first 1000 samples) ===" << endl; + int labelCounts[10] = {0}; + int samplesToCheck = std::min(1000, trainDataset.numSamples); + for (int i = 0; i < samplesToCheck; i++) { + int label = trainDataset.rawLabels[i]; + if (label >= 0 && label < 10) { + labelCounts[label]++; + } + } + for (int i = 0; i < 10; i++) { + cout << "Digit " << i << ": " << labelCounts[i] << " samples" << endl; + } + + cout << "\n=== MNIST Loading Test Complete ===" << endl; +} + +/** + * Test loading MNIST test set (optional) + */ +void test_mnist_test_set() { + BEGIN_TESTS("Testing MNIST Test Set Loading"); + typedef double T; + + const string testImagesFile = "t10k-images-idx3-ubyte"; + const string testLabelsFile = "t10k-labels-idx1-ubyte"; + + cout << "\n=== Loading MNIST Test Dataset ===" << endl; + MNISTDataset testDataset; + + if (!loadMNISTDataset(testImagesFile, testLabelsFile, testDataset)) { + cout << "Failed to load MNIST test dataset." 
<< endl; + return; + } + + cout << "Test set size: " << testDataset.numSamples << " samples" << endl; +} + +int main() { + cout << "========================================" << endl; + cout << " MNIST Data Loader Test Suite " << endl; + cout << "========================================" << endl; + + test_mnist_loading(); + + // Optionally test the test set + cout << "\n\n"; + test_mnist_test_set(); + + return 0; +} diff --git a/test_mnist_training.cpp b/test_mnist_training.cpp new file mode 100644 index 0000000..4cc7a7b --- /dev/null +++ b/test_mnist_training.cpp @@ -0,0 +1,303 @@ +#include +#include +#include +#include "allheader.h" +#include "network.h" +#include "mnist_loader.h" + +using namespace std; +using namespace ml; +using namespace Utility; + +/** + * MNIST Training Test Suite + * + * Tests the MNIST training functionality including: + * - Network creation and initialization + * - Batch training + * - Loss computation + * - Accuracy evaluation + * - Cross-entropy loss + */ + +// Helper to check approximate equality +template +bool approxEqual(T a, T b, T epsilon = 0.01) { + return std::abs(a - b) < epsilon; +} + +void test_network_creation() { + BEGIN_TESTS("MNIST Network Creation"); + typedef double T; + + Network* network = new Network(); + ILayer* input = new Layer(784, "Input", ActivationType::RELU); + ILayer* hidden1 = new Layer(128, "Hidden1", ActivationType::RELU); + ILayer* output = new Layer(10, "Output", ActivationType::SIGMOID); + + network->setInputLayer(input); + network->connect(input, hidden1); + network->connect(hidden1, output); + network->setOutputLayer(output); + network->setOptimizerType(OptimizerType::ADAM); + network->setLossType(LossType::CROSS_ENTROPY); + + network->init(); + + cout << "✓ Network created successfully" << endl; + cout << "✓ Optimizer set to Adam" << endl; + cout << "✓ Loss set to Cross-Entropy" << endl; + + // Test forward pass with random input + ml::Mat testInput(1, 784, 0.5); + ml::Mat output_result = 
network->feed(testInput); + + cout << "✓ Forward pass works" << endl; + cout << " Output size: (" << output_result.size().cy << ", " << output_result.size().cx << ")" << endl; + + if (output_result.size().cy == 1 && output_result.size().cx == 10) { + cout << "✓ Output dimensions correct" << endl; + } else { + cout << "✗ Output dimensions incorrect" << endl; + } + + delete network; +} + +void test_batch_training() { + BEGIN_TESTS("Batch Training Functionality"); + typedef double T; + + // Create small network for testing + Network* network = new Network(); + ILayer* input = new Layer(784, "Input", ActivationType::RELU); + ILayer* hidden = new Layer(64, "Hidden", ActivationType::RELU); + ILayer* output = new Layer(10, "Output", ActivationType::SIGMOID); + + network->setInputLayer(input); + network->connect(input, hidden); + network->connect(hidden, output); + network->setOutputLayer(output); + network->setOptimizerType(OptimizerType::ADAM); + network->setLossType(LossType::CROSS_ENTROPY); + network->init(); + + // Create synthetic batch (batch_size=4, input_size=784) + int batchSize = 4; + ml::Mat batchInputs(batchSize, 784, 0); + ml::Mat batchTargets(batchSize, 10, 0); + + // Fill with simple patterns + for (int i = 0; i < batchSize; i++) { + // Simple pattern for each sample + for (int j = 0; j < 784; j++) { + batchInputs.setAt(i, j, (T)(i * 0.1 + j * 0.001)); + } + // One-hot target + int targetClass = i % 10; + batchTargets.setAt(i, targetClass, 1.0); + } + + // Get initial loss + T initialLoss = network->evaluateLoss(batchInputs, batchTargets); + cout << "Initial loss: " << std::fixed << std::setprecision(4) << initialLoss << endl; + + // Train for a few iterations + T learningRate = 0.01; + for (int iter = 0; iter < 10; iter++) { + network->trainBatch(batchInputs, batchTargets, learningRate); + } + + // Get final loss + T finalLoss = network->evaluateLoss(batchInputs, batchTargets); + cout << "Final loss after 10 iterations: " << std::setprecision(4) << 
finalLoss << endl; + + if (finalLoss < initialLoss) { + cout << "✓ Loss decreased during training" << endl; + } else { + cout << "✗ Loss did not decrease" << endl; + } + + delete network; +} + +void test_accuracy_computation() { + BEGIN_TESTS("Accuracy Computation"); + typedef double T; + + // Create test predictions and targets + ml::Mat predictions(5, 10, 0); + ml::Mat targets(5, 10, 0); + + // Sample 0: pred=0, target=0 ✓ + predictions.setAt(0, 0, 0.9); + targets.setAt(0, 0, 1.0); + + // Sample 1: pred=1, target=1 ✓ + predictions.setAt(1, 1, 0.8); + targets.setAt(1, 1, 1.0); + + // Sample 2: pred=2, target=3 ✗ + predictions.setAt(2, 2, 0.7); + targets.setAt(2, 3, 1.0); + + // Sample 3: pred=4, target=4 ✓ + predictions.setAt(3, 4, 0.85); + targets.setAt(3, 4, 1.0); + + // Sample 4: pred=5, target=5 ✓ + predictions.setAt(4, 5, 0.95); + targets.setAt(4, 5, 1.0); + + T accuracy = ComputeAccuracy(predictions, targets); + cout << "Accuracy: " << std::setprecision(1) << accuracy << "%" << endl; + + // Expected: 4 correct out of 5 = 80% + if (approxEqual(accuracy, T(80.0), T(0.1))) { + cout << "✓ Accuracy computation correct (4/5 = 80%)" << endl; + } else { + cout << "✗ Accuracy computation incorrect (expected 80%, got " << accuracy << "%)" << endl; + } +} + +void test_mnist_mini_training() { + BEGIN_TESTS("MNIST Mini Training (with actual data)"); + typedef double T; + + cout << "Loading small MNIST subset..." 
<< endl; + + MNISTDataset trainDataset; + + if (!loadMNISTDataset("train-images-idx3-ubyte", "train-labels-idx1-ubyte", trainDataset)) { + cout << "MNIST data not available, skipping this test" << endl; + cout << "Download MNIST from: http://yann.lecun.com/exdb/mnist/" << endl; + return; + } + + cout << "✓ MNIST data loaded" << endl; + + // Create small network + Network* network = new Network(); + ILayer* input = new Layer(784, "Input", ActivationType::RELU); + ILayer* hidden = new Layer(128, "Hidden", ActivationType::RELU); + ILayer* output = new Layer(10, "Output", ActivationType::SIGMOID); + + network->setInputLayer(input); + network->connect(input, hidden); + network->connect(hidden, output); + network->setOutputLayer(output); + network->setOptimizerType(OptimizerType::ADAM); + network->setLossType(LossType::CROSS_ENTROPY); + network->init(); + + cout << "✓ Network initialized (784-128-10)" << endl; + + // Use first 100 samples for quick test + int numSamples = std::min(100, trainDataset.numSamples); + ml::Mat trainImages(numSamples, 784, 0); + ml::Mat trainLabels(numSamples, 10, 0); + + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < 784; j++) { + trainImages.setAt(i, j, trainDataset.images.getAt(i, j)); + } + for (int j = 0; j < 10; j++) { + trainLabels.setAt(i, j, trainDataset.labels.getAt(i, j)); + } + } + + // Initial evaluation + T initialAccuracy = network->evaluateAccuracy(trainImages, trainLabels); + T initialLoss = network->evaluateLoss(trainImages, trainLabels); + + cout << "Initial accuracy: " << std::setprecision(2) << initialAccuracy << "%" << endl; + cout << "Initial loss: " << std::setprecision(4) << initialLoss << endl; + + // Train for 2 epochs with small batches + cout << "\nTraining for 2 epochs..." 
<< endl; + int batchSize = 16; + T learningRate = 0.001; + + for (int epoch = 0; epoch < 2; epoch++) { + int numBatches = (numSamples + batchSize - 1) / batchSize; + + for (int batch = 0; batch < numBatches; batch++) { + int startIdx = batch * batchSize; + int endIdx = std::min(startIdx + batchSize, numSamples); + int actualBatchSize = endIdx - startIdx; + + ml::Mat batchImages(actualBatchSize, 784, 0); + ml::Mat batchLabels(actualBatchSize, 10, 0); + + for (int i = 0; i < actualBatchSize; i++) { + for (int j = 0; j < 784; j++) { + batchImages.setAt(i, j, trainImages.getAt(startIdx + i, j)); + } + for (int j = 0; j < 10; j++) { + batchLabels.setAt(i, j, trainLabels.getAt(startIdx + i, j)); + } + } + + network->trainBatch(batchImages, batchLabels, learningRate); + } + + T epochAccuracy = network->evaluateAccuracy(trainImages, trainLabels); + T epochLoss = network->evaluateLoss(trainImages, trainLabels); + + cout << " Epoch " << (epoch + 1) << ": Accuracy=" << std::setprecision(2) << epochAccuracy + << "%, Loss=" << std::setprecision(4) << epochLoss << endl; + } + + T finalAccuracy = network->evaluateAccuracy(trainImages, trainLabels); + T finalLoss = network->evaluateLoss(trainImages, trainLabels); + + cout << "\nFinal accuracy: " << std::setprecision(2) << finalAccuracy << "%" << endl; + cout << "Final loss: " << std::setprecision(4) << finalLoss << endl; + + // Check if training improved performance + if (finalAccuracy > initialAccuracy) { + cout << "✓ Accuracy improved: " << std::setprecision(2) + << (finalAccuracy - initialAccuracy) << "% gain" << endl; + } else { + cout << "⚠ Accuracy did not improve (might need more training)" << endl; + } + + if (finalLoss < initialLoss) { + cout << "✓ Loss decreased" << endl; + } else { + cout << "⚠ Loss did not decrease (might need more training)" << endl; + } + + // Minimum expectation: final accuracy should be > 20% (better than random 10%) + if (finalAccuracy > 20.0) { + cout << "✓ Model is learning (accuracy > 20%)" << 
endl; + } else { + cout << "✗ Model might not be learning properly" << endl; + } + + delete network; +} + +int main() { + cout << "========================================" << endl; + cout << " MNIST Training Test Suite" << endl; + cout << "========================================\n" << endl; + + test_network_creation(); + cout << endl; + + test_batch_training(); + cout << endl; + + test_accuracy_computation(); + cout << endl; + + test_mnist_mini_training(); + cout << endl; + + cout << "========================================" << endl; + cout << "All tests complete!" << endl; + cout << "========================================" << endl; + + return 0; +} diff --git a/test_network.cpp b/test_network.cpp index d3b2a75..e816bc3 100644 --- a/test_network.cpp +++ b/test_network.cpp @@ -38,7 +38,7 @@ void test_sigmoid() { cout << ">> Sigmoid function test PASSED" << endl; } -// Test 2: Test SigGrad (Sigmoid Gradient) function +// Test 2: Test SigmoidGrad (Sigmoid Gradient) function void test_sigmoid_gradient() { BEGIN_TESTS("Testing Sigmoid Gradient Function"); typedef double T; @@ -50,7 +50,7 @@ void test_sigmoid_gradient() { sigmoidOutput.setAt(1, 0, 0.269); sigmoidOutput.setAt(1, 1, 0.881); - Mat gradient = SigGrad(sigmoidOutput); + Mat gradient = SigmoidGrad(sigmoidOutput); // Sigmoid gradient is: sig(x) * (1 - sig(x)) // For sig(x) = 0.5: grad = 0.5 * 0.5 = 0.25 diff --git a/test_training.cpp b/test_training.cpp index b4c0070..34547cf 100644 --- a/test_training.cpp +++ b/test_training.cpp @@ -38,9 +38,12 @@ void test_xor_training() { network->connect(inputLayer, hiddenLayer); network->connect(hiddenLayer, outputLayer); network->setOutputLayer(outputLayer); + + // Use MSE loss for XOR (better for simple regression-like problems) + network->setLossType(LossType::MSE); network->init(); - cout << ">> Network initialized with 2-4-1 architecture" << endl; + cout << ">> Network initialized with 2-4-1 architecture (MSE loss)" << endl; // XOR training data: [input1, input2] 
-> [expected_output] // XOR truth table: @@ -168,9 +171,12 @@ void test_and_gate_training() { network->connect(inputLayer, hiddenLayer); network->connect(hiddenLayer, outputLayer); network->setOutputLayer(outputLayer); + + // Use MSE loss for AND gate (better for simple regression-like problems) + network->setLossType(LossType::MSE); network->init(); - cout << ">> Network initialized with 2-2-1 architecture" << endl; + cout << ">> Network initialized with 2-2-1 architecture (MSE loss)" << endl; // AND truth table: both inputs must be 1 for output to be 1 vector> inputs; @@ -407,9 +413,12 @@ void test_or_gate_training() { network->connect(inputLayer, hiddenLayer); network->connect(hiddenLayer, outputLayer); network->setOutputLayer(outputLayer); + + // Use MSE loss for OR gate (better for simple regression-like problems) + network->setLossType(LossType::MSE); network->init(); - cout << ">> Network initialized with 2-2-1 architecture" << endl; + cout << ">> Network initialized with 2-2-1 architecture (MSE loss)" << endl; // OR truth table: output is 1 if any input is 1 vector> inputs; diff --git a/train_mnist.cpp b/train_mnist.cpp new file mode 100644 index 0000000..13d879d --- /dev/null +++ b/train_mnist.cpp @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include "allheader.h" +#include "network.h" +#include "mnist_loader.h" + +using namespace std; +using namespace ml; +using namespace Utility; + +/** + * MNIST Training Script + * + * Trains a fully-connected neural network on the MNIST digit classification dataset. 
+ * Architecture: 784 (input) → 256 (ReLU) → 128 (ReLU) → 10 (Sigmoid + Cross-Entropy) + * + * Features: + * - Batch training for faster convergence + * - Adam optimizer for adaptive learning + * - Cross-entropy loss for classification + * - Training/validation monitoring + * - Model checkpointing + * - Accuracy evaluation + * + * Expected performance: 85-92% test accuracy after 10-20 epochs + */ + +// Training configuration +struct TrainConfig { + int epochs = 10; + int batchSize = 32; + double learningRate = 0.001; + int validationInterval = 1; // Evaluate every N epochs + int saveInterval = 5; // Save model every N epochs + bool shuffle = true; // Shuffle training data each epoch + string modelSavePath = "mnist_model.json"; +}; + +// Helper function to shuffle indices +void shuffleIndices(std::vector& indices) { + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(indices.begin(), indices.end(), g); +} + +int main(int argc, char* argv[]) { + cout << "========================================" << endl; + cout << " MNIST Digit Classification Training " << endl; + cout << "========================================\n" << endl; + + typedef double T; + TrainConfig config; + + // Parse command line arguments (optional) + for (int i = 1; i < argc; i++) { + string arg = argv[i]; + if (arg == "--epochs" && i + 1 < argc) { + config.epochs = std::atoi(argv[++i]); + } else if (arg == "--batch-size" && i + 1 < argc) { + config.batchSize = std::atoi(argv[++i]); + } else if (arg == "--lr" && i + 1 < argc) { + config.learningRate = std::atof(argv[++i]); + } else if (arg == "--help") { + cout << "Usage: " << argv[0] << " [options]" << endl; + cout << "Options:" << endl; + cout << " --epochs N Number of training epochs (default: 10)" << endl; + cout << " --batch-size N Batch size (default: 32)" << endl; + cout << " --lr RATE Learning rate (default: 0.001)" << endl; + cout << " --help Show this message" << endl; + return 0; + } + } + + cout << "Configuration:" << endl; + 
cout << " Epochs: " << config.epochs << endl; + cout << " Batch size: " << config.batchSize << endl; + cout << " Learning rate: " << config.learningRate << endl; + cout << endl; + + // ======================================== + // Load MNIST Dataset + // ======================================== + cout << "Loading MNIST dataset..." << endl; + + MNISTDataset trainDataset, testDataset; + + if (!loadMNISTDataset("train-images-idx3-ubyte", "train-labels-idx1-ubyte", trainDataset)) { + cerr << "Failed to load training data. Make sure MNIST files are in the current directory." << endl; + cerr << "Download from: http://yann.lecun.com/exdb/mnist/" << endl; + return 1; + } + + if (!loadMNISTDataset("t10k-images-idx3-ubyte", "t10k-labels-idx1-ubyte", testDataset)) { + cerr << "Failed to load test data." << endl; + return 1; + } + + cout << "\nDataset loaded successfully!" << endl; + cout << " Training samples: " << trainDataset.numSamples << endl; + cout << " Test samples: " << testDataset.numSamples << endl; + cout << endl; + + // ======================================== + // Create Neural Network + // ======================================== + cout << "Creating neural network..." 
<< endl; + + Network* network = new Network(); + + // Architecture: 784 → 256 → 128 → 10 + ILayer* inputLayer = new Layer(784, "Input", ActivationType::RELU); + ILayer* hidden1 = new Layer(256, "Hidden1", ActivationType::RELU); + ILayer* hidden2 = new Layer(128, "Hidden2", ActivationType::RELU); + ILayer* outputLayer = new Layer(10, "Output", ActivationType::SIGMOID); + + network->setInputLayer(inputLayer); + network->connect(inputLayer, hidden1); + network->connect(hidden1, hidden2); + network->connect(hidden2, outputLayer); + network->setOutputLayer(outputLayer); + + // Configure optimizer and loss + network->setOptimizerType(OptimizerType::ADAM); + network->setLossType(LossType::CROSS_ENTROPY); + + network->init(); + + cout << "Network architecture:" << endl; + cout << " Input: 784 neurons (28x28 pixels)" << endl; + cout << " Hidden1: 256 neurons (ReLU)" << endl; + cout << " Hidden2: 128 neurons (ReLU)" << endl; + cout << " Output: 10 neurons (Sigmoid)" << endl; + cout << " Optimizer: Adam" << endl; + cout << " Loss: Cross-Entropy" << endl; + cout << endl; + + // ======================================== + // Training Loop + // ======================================== + cout << "Starting training...\n" << endl; + + int numBatches = (trainDataset.numSamples + config.batchSize - 1) / config.batchSize; + + // Track best accuracy for model saving + T bestTestAccuracy = 0.0; + + auto trainingStartTime = std::chrono::high_resolution_clock::now(); + + for (int epoch = 0; epoch < config.epochs; epoch++) { + auto epochStartTime = std::chrono::high_resolution_clock::now(); + + // Shuffle training data + std::vector indices(trainDataset.numSamples); + for (int i = 0; i < trainDataset.numSamples; i++) { + indices[i] = i; + } + if (config.shuffle) { + shuffleIndices(indices); + } + + cout << "Epoch " << (epoch + 1) << "/" << config.epochs << endl; + + // Training batches + T epochLoss = 0.0; + for (int batch = 0; batch < numBatches; batch++) { + int startIdx = batch * 
config.batchSize; + int endIdx = std::min(startIdx + config.batchSize, trainDataset.numSamples); + int actualBatchSize = endIdx - startIdx; + + // Extract batch data + ml::Mat batchImages(actualBatchSize, trainDataset.imageSize, 0); + ml::Mat batchLabels(actualBatchSize, trainDataset.numClasses, 0); + + for (int i = 0; i < actualBatchSize; i++) { + int sampleIdx = indices[startIdx + i]; + for (int j = 0; j < trainDataset.imageSize; j++) { + batchImages.setAt(i, j, trainDataset.images.getAt(sampleIdx, j)); + } + for (int j = 0; j < trainDataset.numClasses; j++) { + batchLabels.setAt(i, j, trainDataset.labels.getAt(sampleIdx, j)); + } + } + + // Train on batch + network->trainBatch(batchImages, batchLabels, config.learningRate); + + // Print progress every 100 batches + if ((batch + 1) % 100 == 0 || batch == numBatches - 1) { + // Compute loss on current batch + T batchLoss = network->evaluateLoss(batchImages, batchLabels); + epochLoss += batchLoss * actualBatchSize; + + cout << " Batch " << (batch + 1) << "/" << numBatches + << " - Loss: " << std::fixed << std::setprecision(4) << batchLoss; + + // Show progress bar + int barWidth = 30; + float progress = (float)(batch + 1) / numBatches; + cout << " ["; + int pos = barWidth * progress; + for (int i = 0; i < barWidth; ++i) { + if (i < pos) cout << "="; + else if (i == pos) cout << ">"; + else cout << " "; + } + cout << "] " << int(progress * 100.0) << "%\r"; + cout.flush(); + } + } + cout << endl; + + epochLoss /= trainDataset.numSamples; + + auto epochEndTime = std::chrono::high_resolution_clock::now(); + auto epochDuration = std::chrono::duration_cast( + epochEndTime - epochStartTime).count(); + + // Evaluate on training and test sets + if ((epoch + 1) % config.validationInterval == 0) { + cout << " Evaluating..." 
<< endl; + + // Sample a subset for faster evaluation (use first 1000 samples) + int trainEvalSize = std::min(1000, trainDataset.numSamples); + ml::Mat trainEvalImages(trainEvalSize, trainDataset.imageSize, 0); + ml::Mat trainEvalLabels(trainEvalSize, trainDataset.numClasses, 0); + + for (int i = 0; i < trainEvalSize; i++) { + for (int j = 0; j < trainDataset.imageSize; j++) { + trainEvalImages.setAt(i, j, trainDataset.images.getAt(i, j)); + } + for (int j = 0; j < trainDataset.numClasses; j++) { + trainEvalLabels.setAt(i, j, trainDataset.labels.getAt(i, j)); + } + } + + T trainAccuracy = network->evaluateAccuracy(trainEvalImages, trainEvalLabels); + T testAccuracy = network->evaluateAccuracy(testDataset.images, testDataset.labels); + + cout << " Train Loss: " << std::fixed << std::setprecision(4) << epochLoss << endl; + cout << " Train Accuracy: " << std::setprecision(2) << trainAccuracy << "%" << endl; + cout << " Test Accuracy: " << std::setprecision(2) << testAccuracy << "%" << endl; + cout << " Time: " << epochDuration << "s" << endl; + + // Save best model + if (testAccuracy > bestTestAccuracy) { + bestTestAccuracy = testAccuracy; + string bestModelPath = "mnist_model_best.json"; + cout << " New best accuracy! Saving to " << bestModelPath << endl; + network->saveToFile(bestModelPath); + } + } + + // Save checkpoint + if ((epoch + 1) % config.saveInterval == 0) { + string checkpointPath = "mnist_model_epoch" + std::to_string(epoch + 1) + ".json"; + cout << " Saving checkpoint to " << checkpointPath << endl; + network->saveToFile(checkpointPath); + } + + cout << endl; + } + + auto trainingEndTime = std::chrono::high_resolution_clock::now(); + auto totalDuration = std::chrono::duration_cast( + trainingEndTime - trainingStartTime).count(); + + // ======================================== + // Final Evaluation + // ======================================== + cout << "========================================" << endl; + cout << "Training Complete!" 
<< endl; + cout << "========================================" << endl; + cout << "Total training time: " << totalDuration << "s" << endl; + cout << "Best test accuracy: " << std::setprecision(2) << bestTestAccuracy << "%" << endl; + + // Final save + cout << "\nSaving final model to " << config.modelSavePath << endl; + network->saveToFile(config.modelSavePath); + + // Show some predictions + cout << "\nSample predictions:" << endl; + for (int i = 0; i < 5; i++) { + auto sample = getSample(testDataset, i); + ml::Mat predicted = network->feed(sample.first); + + // Find predicted class + int predictedClass = 0; + T maxProb = predicted.getAt(0, 0); + for (int j = 1; j < 10; j++) { + if (predicted.getAt(0, j) > maxProb) { + maxProb = predicted.getAt(0, j); + predictedClass = j; + } + } + + int trueClass = testDataset.rawLabels[i]; + cout << " Sample " << i << ": True=" << trueClass + << ", Predicted=" << predictedClass + << ", Confidence=" << std::setprecision(1) << (maxProb * 100) << "%" + << (predictedClass == trueClass ? " ✓" : " ✗") << endl; + } + + // Cleanup + delete network; + + cout << "\nDone!" << endl; + return 0; +}