diff --git a/README.md b/README.md index 00b7d9b..f428d11 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,68 @@ # Neural Network Library -A C++ implementation of a basic neural network with forward and backward propagation, featuring flexible layer architecture and comprehensive unit tests. +A C++ implementation of a flexible neural network library with forward and backward propagation, featuring multiple activation functions, modern optimizers, and comprehensive unit tests. ## Features -- ✅ Forward propagation with sigmoid activation -- ✅ Backward propagation for error computation -- ✅ Flexible layer architecture (supports arbitrary network topologies) -- ✅ Matrix operations optimized with OpenMP -- ✅ Model serialization (save/load weights to/from JSON files) +### ✅ What's Available + +**Network Architecture:** +- ✅ Fully connected (dense) feedforward networks +- ✅ Arbitrary layer sizes and network depths +- ✅ Flexible layer connections (supports arbitrary network topologies) +- ✅ Composable networks (networks as layers) + +**Activation Functions:** +- ✅ **Sigmoid** - Classic activation, range (0, 1) +- ✅ **ReLU** - Rectified Linear Unit, best for hidden layers +- ✅ **Leaky ReLU** - Prevents dying ReLU problem +- ✅ **Tanh** - Hyperbolic tangent, range (-1, 1) +- ✅ **Softmax** - Multi-class classification output +- ✅ **ELU** - Exponential Linear Unit +- ✅ **SELU** - Self-normalizing ELU variant +- ✅ **Linear** - Identity function for regression + +**Optimizers:** +- ✅ **SGD** - Stochastic Gradient Descent with gradient clipping +- ✅ **Momentum** - SGD with momentum (β=0.9 default) +- ✅ **Adam** - Adaptive moment estimation (β1=0.9, β2=0.999) + +**Training & Optimization:** +- ✅ Backpropagation for error computation +- ✅ Xavier/Glorot weight initialization +- ✅ Gradient clipping for stability +- ✅ Configurable learning rates + +**Infrastructure:** +- ✅ Matrix operations optimized with OpenMP and SSE +- ✅ Model serialization (save/load weights to/from JSON) - ✅ Comprehensive unit test suite - ✅ CI/CD with GitHub Actions +- ✅ Cross-platform (x86/x64 and ARM/Apple Silicon) + +### ❌ What's Missing + +**Advanced Network Types:** +- ❌ Convolutional layers (CNN) +- ❌ Pooling layers (MaxPool, AvgPool) +- ❌ Recurrent layers (RNN, LSTM, GRU) +- ❌ Attention mechanisms + +**Training Features:** +- ❌ Batch training (currently sample-by-sample only) +- ❌ Mini-batch gradient descent +- ❌ Batch normalization +- ❌ Dropout regularization +- ❌ L1/L2 weight regularization + +**Loss Functions:** +- ❌ Cross-entropy loss (currently using MSE only) +- ❌ Multiple loss function options + +**Advanced Optimizers:** +- ❌ AdaGrad, RMSprop +- ❌ Learning rate scheduling +- ❌ Adaptive learning rate decay ## Building @@ -32,6 +84,12 @@ make test_network make test_model_save_load ./test_model_save_load +# Run activation functions and optimizer tests +g++ -std=c++11 -O3 -fopenmp -msse2 -I. \ + test_activations_optimizers.cpp Matrix/matrix.cpp thirdparty/jsonxx/jsonxx.cpp \ + -o test_activations_optimizers +./test_activations_optimizers + # Or use CTest to run all tests ctest --output-on-failure ``` @@ -123,17 +181,21 @@ This generates: ## Usage Example +### Basic Network with Custom Activations + ```cpp #include "network.h" using namespace ml; -// Create a simple 3-layer network: 2 inputs -> 4 hidden -> 1 output +// Create a 3-layer network with ReLU hidden layers and sigmoid output +// Architecture: 2 inputs -> 4 hidden (ReLU) -> 1 output (Sigmoid) Network* network = new Network(); -ILayer* inputLayer = new Layer(2, "Input"); -ILayer* hiddenLayer = new Layer(4, "Hidden"); -ILayer* outputLayer = new Layer(1, "Output"); +// Create layers with specific activation functions +Layer* inputLayer = new Layer(2, "Input", ActivationType::LINEAR); +Layer* hiddenLayer = new Layer(4, "Hidden", ActivationType::RELU); +Layer* outputLayer = new Layer(1, "Output", ActivationType::SIGMOID); // Connect layers network->setInputLayer(inputLayer); @@ -141,6 +203,9 @@ network->connect(inputLayer, hiddenLayer); network->connect(hiddenLayer, outputLayer); network->setOutputLayer(outputLayer); +// Set optimizer (default is SGD) +network->setOptimizerType(OptimizerType::ADAM); // or MOMENTUM, SGD + // Initialize weights network->init(); @@ -151,15 +216,47 @@ input.setAt(0, 1, 0.5); Mat output = network->feed(input); -// Backward pass +// Backward pass and weight update Mat targetOutput(1, 1, 0.8); Mat error = Diff(targetOutput, output); outputLayer->setErrors(error); network->backprop(); +network->updateWeights(0.01); // learning rate = 0.01 -// Access propagated errors -Mat hiddenErrors = hiddenLayer->getErrors(); -Mat inputErrors = inputLayer->getErrors(); +// Training loop example +for (int epoch = 0; epoch < 1000; epoch++) { + Mat output = network->feed(input); + outputLayer->setErrors(Diff(targetOutput, output)); + network->backprop(); + network->updateWeights(0.01); +} +``` + +### Available Activation Functions + +```cpp +// When creating layers, specify activation type: +Layer* layer1 = new Layer(10, "Layer1", ActivationType::RELU); +Layer* layer2 = new Layer(10, "Layer2", ActivationType::TANH); +Layer* layer3 = new Layer(10, "Layer3", ActivationType::SIGMOID); +Layer* layer4 = new Layer(10, "Layer4", ActivationType::SOFTMAX); + +// For Leaky ReLU or ELU, you can specify the alpha parameter: +Layer* leaky = new Layer(10, "Leaky", ActivationType::LEAKY_RELU, 0.01); +Layer* elu = new Layer(10, "ELU", ActivationType::ELU, 1.0); +``` + +### Available Optimizers + +```cpp +// Set optimizer type +network->setOptimizerType(OptimizerType::SGD); // Basic SGD with gradient clipping +network->setOptimizerType(OptimizerType::MOMENTUM); // SGD with momentum (β=0.9) +network->setOptimizerType(OptimizerType::ADAM); // Adam optimizer (β1=0.9, β2=0.999) + +// Or create and set a custom optimizer +AdamOptimizer* adam = new AdamOptimizer(0.9, 0.999, 1e-8); +network->setOptimizer(adam); ``` ## Matrix Operations @@ -173,11 +270,14 @@ Mat result = ElementMult(m1, m2); // Matrix multiplication Mat result = Mult(m1, m2); -// Sigmoid activation -Mat activated = Sigmoid(input); +// Activation functions (using unified interface) +Mat activated = Activate(input, ActivationType::RELU); +Mat grad = ActivateGrad(activated, ActivationType::RELU); -// Sigmoid gradient -Mat grad = SigGrad(activated); +// Or use specific activation functions directly +Mat sigmoid_out = Sigmoid(input); +Mat relu_out = ReLU(input); +Mat tanh_out = Tanh(input); ``` ## Model Serialization diff --git a/activation.h b/activation.h new file mode 100644 index 0000000..f5cb649 --- /dev/null +++ b/activation.h @@ -0,0 +1,343 @@ +#pragma once + +#include "Matrix/matrix.h" +#include +#include + +namespace ml { + + // Activation function types + enum class ActivationType { + SIGMOID, + RELU, + LEAKY_RELU, + TANH, + SOFTMAX, + ELU, + SELU, + LINEAR + }; + + // Convert activation type to string for debugging/logging + inline const char* ActivationTypeToString(ActivationType type) { + switch (type) { + case ActivationType::SIGMOID: return "Sigmoid"; + case ActivationType::RELU: return "ReLU"; + case ActivationType::LEAKY_RELU: return "LeakyReLU"; + case ActivationType::TANH: return "Tanh"; + case ActivationType::SOFTMAX: return "Softmax"; + case ActivationType::ELU: return "ELU"; + case ActivationType::SELU: return "SELU"; + case ActivationType::LINEAR: return "Linear"; + default: return "Unknown"; + } + } + + // ========== ACTIVATION FUNCTIONS ========== + + // Sigmoid: σ(x) = 1 / (1 + e^(-x)) + // Range: (0, 1) + // Use: Binary classification output, historically used in hidden layers + template + ml::Mat Sigmoid(const ml::Mat& mat) { + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, 1.0 / (1.0 + std::exp(-val))); + } + } + return result; + } + + // ReLU: f(x) = max(0, x) + // Range: [0, ∞) + // Use: Default choice for hidden layers in deep networks + template + ml::Mat ReLU(const ml::Mat& mat) { + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, std::max(T(0), val)); + } + } + return result; + } + + // Leaky ReLU: f(x) = max(αx, x) where α is typically 0.01 + // Range: (-∞, ∞) + // Use: Prevents dying ReLU problem by allowing small negative values + template + ml::Mat LeakyReLU(const ml::Mat& mat, T alpha = 0.01) { + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, val > 0 ? val : alpha * val); + } + } + return result; + } + + // Tanh: f(x) = (e^x - e^(-x)) / (e^x + e^(-x)) + // Range: (-1, 1) + // Use: Hidden layers, zero-centered (unlike sigmoid) + template + ml::Mat Tanh(const ml::Mat& mat) { + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, std::tanh(val)); + } + } + return result; + } + + // Softmax: f(x_i) = e^(x_i) / Σ(e^(x_j)) + // Range: (0, 1), outputs sum to 1 + // Use: Multi-class classification output layer + template + ml::Mat Softmax(const ml::Mat& mat) { + ml::Mat result(mat.size(), 0); + + // Process each row independently (each row is a sample) + for (int i = 0; i < mat.size().cy; ++i) { + // Find max for numerical stability + T maxVal = mat.getAt(i, 0); + for (int j = 1; j < mat.size().cx; ++j) { + maxVal = std::max(maxVal, mat.getAt(i, j)); + } + + // Compute exp(x - max) and sum + T sum = 0; + for (int j = 0; j < mat.size().cx; ++j) { + T expVal = std::exp(mat.getAt(i, j) - maxVal); + result.setAt(i, j, expVal); + sum += expVal; + } + + // Normalize + for (int j = 0; j < mat.size().cx; ++j) { + result.setAt(i, j, result.getAt(i, j) / sum); + } + } + return result; + } + + // ELU: f(x) = x if x > 0, α(e^x - 1) if x ≤ 0 + // Range: (-α, ∞) where α is typically 1.0 + // Use: Can produce negative outputs, smoother than ReLU + template + ml::Mat ELU(const ml::Mat& mat, T alpha = 1.0) { + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, val > 0 ? val : alpha * (std::exp(val) - 1)); + } + } + return result; + } + + // SELU: Self-normalizing variant of ELU with specific constants + // λ ≈ 1.0507, α ≈ 1.67326 + // Range: (-λα, ∞) + // Use: Self-normalizing properties for deep networks + template + ml::Mat SELU(const ml::Mat& mat) { + const T lambda = 1.0507009873554804934193349852946; + const T alpha = 1.6732632423543772848170429916717; + + ml::Mat result(mat.size(), 0); + for (int i = 0; i < mat.size().cy; ++i) { + for (int j = 0; j < mat.size().cx; ++j) { + T val = mat.getAt(i, j); + result.setAt(i, j, lambda * (val > 0 ? val : alpha * (std::exp(val) - 1))); + } + } + return result; + } + + // Linear: f(x) = x + // Range: (-∞, ∞) + // Use: Regression output layer + template + ml::Mat Linear(const ml::Mat& mat) { + return mat.Copy(); + } + + // ========== ACTIVATION GRADIENTS ========== + + // Sigmoid gradient: σ'(x) = σ(x) * (1 - σ(x)) + // Note: Input should be the ACTIVATED values, not raw inputs + template + ml::Mat SigmoidGrad(const ml::Mat& activated) { + return ml::ElementMult(activated, ml::Diff(1, activated)); + } + + // ReLU gradient: f'(x) = 1 if x > 0, 0 otherwise + // Note: Input should be the ACTIVATED values + template + ml::Mat ReLUGrad(const ml::Mat& activated) { + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + T val = activated.getAt(i, j); + result.setAt(i, j, val > 0 ? T(1) : T(0)); + } + } + return result; + } + + // Leaky ReLU gradient: f'(x) = 1 if x > 0, α otherwise + template + ml::Mat LeakyReLUGrad(const ml::Mat& activated, T alpha = 0.01) { + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + T val = activated.getAt(i, j); + result.setAt(i, j, val > 0 ? T(1) : alpha); + } + } + return result; + } + + // Tanh gradient: f'(x) = 1 - tanh²(x) + // Note: Input should be the ACTIVATED values + template + ml::Mat TanhGrad(const ml::Mat& activated) { + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + T val = activated.getAt(i, j); + result.setAt(i, j, 1 - val * val); + } + } + return result; + } + + // Softmax gradient: For cross-entropy loss, gradient simplifies to (predicted - target) + // This is handled in the loss function, so we return 1s + // For general case: ∂y_i/∂x_j = y_i(δ_ij - y_j) where δ is Kronecker delta + template + ml::Mat SoftmaxGrad(const ml::Mat& activated) { + // When used with cross-entropy loss, the gradient simplifies + // and is typically computed differently in the loss function + // For now, return identity (will be properly handled in loss computation) + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + result.setAt(i, j, T(1)); + } + } + return result; + } + + // ELU gradient: f'(x) = 1 if x > 0, α*e^x if x ≤ 0 + // Note: For activated values, if val > 0 then gradient is 1 + // If val ≤ 0, then val = α(e^x - 1), so e^x = val/α + 1 + template + ml::Mat ELUGrad(const ml::Mat& activated, T alpha = 1.0) { + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + T val = activated.getAt(i, j); + if (val > 0) { + result.setAt(i, j, T(1)); + } else { + // val = α(e^x - 1), so gradient = α*e^x = val + α + result.setAt(i, j, val + alpha); + } + } + } + return result; + } + + // SELU gradient: Similar to ELU but with SELU constants + template + ml::Mat SELUGrad(const ml::Mat& activated) { + const T lambda = 1.0507009873554804934193349852946; + const T alpha = 1.6732632423543772848170429916717; + + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + T val = activated.getAt(i, j); + if (val > 0) { + result.setAt(i, j, lambda); + } else { + // val = λ*α(e^x - 1), gradient = λ*α*e^x = val + λ*α + result.setAt(i, j, val + lambda * alpha); + } + } + } + return result; + } + + // Linear gradient: f'(x) = 1 + template + ml::Mat LinearGrad(const ml::Mat& activated) { + ml::Mat result(activated.size(), 0); + for (int i = 0; i < activated.size().cy; ++i) { + for (int j = 0; j < activated.size().cx; ++j) { + result.setAt(i, j, T(1)); + } + } + return result; + } + + // ========== UNIFIED INTERFACE ========== + + // Apply activation function based on type + template + ml::Mat Activate(const ml::Mat& input, ActivationType type, T alpha = 0.01) { + switch (type) { + case ActivationType::SIGMOID: + return Sigmoid(input); + case ActivationType::RELU: + return ReLU(input); + case ActivationType::LEAKY_RELU: + return LeakyReLU(input, alpha); + case ActivationType::TANH: + return Tanh(input); + case ActivationType::SOFTMAX: + return Softmax(input); + case ActivationType::ELU: + return ELU(input, alpha); + case ActivationType::SELU: + return SELU(input); + case ActivationType::LINEAR: + return Linear(input); + default: + return Sigmoid(input); // Default fallback + } + } + + // Apply activation gradient based on type + template + ml::Mat ActivateGrad(const ml::Mat& activated, ActivationType type, T alpha = 0.01) { + switch (type) { + case ActivationType::SIGMOID: + return SigmoidGrad(activated); + case ActivationType::RELU: + return ReLUGrad(activated); + case ActivationType::LEAKY_RELU: + return LeakyReLUGrad(activated, alpha); + case ActivationType::TANH: + return TanhGrad(activated); + case ActivationType::SOFTMAX: + return SoftmaxGrad(activated); + case ActivationType::ELU: + return ELUGrad(activated, alpha); + case ActivationType::SELU: + return SELUGrad(activated); + case ActivationType::LINEAR: + return LinearGrad(activated); + default: + return SigmoidGrad(activated); // Default fallback + } + } + +} // namespace ml diff --git a/network.h b/network.h index 06bb63f..e177aec 100644 --- a/network.h +++ b/network.h @@ -5,8 +5,12 @@ #include #include #include +#include #include "math/rect.h" #include "Matrix/matrix.h" +#include "utility.h" +#include "activation.h" +#include "optimizer.h" #include "thirdparty/jsonxx/jsonxx.h" @@ -347,7 +351,7 @@ namespace ml { public: typedef ILayer baseclass; public: - Layer(int numNodes, std::string name = ""); + Layer(int numNodes, std::string name = "", ActivationType activation = ActivationType::SIGMOID, T alpha = 0.01); virtual ~Layer(); protected: @@ -362,12 +366,24 @@ namespace ml { virtual ml::Mat getInput() override; virtual ml::Mat getActivatedInput() override; virtual void setActivatedInput(ml::Mat activatedInput) override; + + // Activation configuration + ActivationType getActivationType() const { return mActivationType; } + void setActivationType(ActivationType type) { mActivationType = type; } + T getActivationAlpha() const { return mActivationAlpha; } + void setActivationAlpha(T alpha) { mActivationAlpha = alpha; } + + private: + ActivationType mActivationType; + T mActivationAlpha; // For Leaky ReLU, ELU, etc. }; template - Layer::Layer(int numNodes, std::string name) : baseclass() { + Layer::Layer(int numNodes, std::string name, ActivationType activation, T alpha) : baseclass() { this->setNumInputNodes(numNodes); + mActivationType = activation; + mActivationAlpha = alpha; common_construct(); baseclass::setName(name); } @@ -398,9 +414,9 @@ namespace ml { if (!inputMat.IsGood()) return; - // Store the raw input and compute activation + // Store the raw input and compute activation using configured activation function this->mInput = inputMat.Copy(); - this->mActivated = Sigmoid(this->mInput); + this->mActivated = Activate(this->mInput, mActivationType, mActivationAlpha); // CRITICAL FIX: Add bias to the ACTIVATED values, not the pre-activation values // Create a copy of activated values with bias for forward propagation @@ -483,6 +499,15 @@ namespace ml { virtual bool saveToFile(const std::string& filename); virtual bool loadFromFile(const std::string& filename); + // Optimizer configuration + void setOptimizer(IOptimizer* optimizer) { mOptimizer = optimizer; mOwnsOptimizer = false; } + IOptimizer* getOptimizer() const { return mOptimizer; } + void setOptimizerType(OptimizerType type) { + if (mOptimizer && mOwnsOptimizer) delete mOptimizer; + mOptimizer = CreateOptimizer(type); + mOwnsOptimizer = true; + } + // ILayer overrides public: virtual void init() override; @@ -525,6 +550,8 @@ namespace ml { private: ILayer* pInputLayer; ILayer* pOutputLayer; + IOptimizer* mOptimizer; + bool mOwnsOptimizer; }; @@ -544,11 +571,18 @@ namespace ml { //ILayer::AddToLayersMap(this); pInputLayer = NULL; pOutputLayer = NULL; + // Default to SGD optimizer + mOptimizer = new SGDOptimizer(); + mOwnsOptimizer = true; } template Network::~Network() { mOutputLayerSiblingCache.clear(); + if (mOptimizer && mOwnsOptimizer) { + delete mOptimizer; + mOptimizer = NULL; + } } @@ -695,7 +729,17 @@ namespace ml { // Compute weighted error: W^T * error = (n+bias, m) * (m, 1) = (n+bias, 1) ml::Mat weightedErr = ml::Mult(weightsT, errorCol, true); - ml::Mat deltaSig = SigGrad(activatedInput); + // Get activation gradient based on layer's activation type + Layer* pPrevLayerConcrete = dynamic_cast*>(pPrevLayer); + ml::Mat deltaSig; + if (pPrevLayerConcrete) { + deltaSig = ActivateGrad(activatedInput, + pPrevLayerConcrete->getActivationType(), + pPrevLayerConcrete->getActivationAlpha()); + } else { + // Fallback to sigmoid for non-Layer types + deltaSig = SigmoidGrad(activatedInput); + } // Strip bias rows from weighted errors to match dimensions of deltaSig // weightedErr is a column vector (cx=1, cy=outputSize) which includes bias @@ -726,12 +770,11 @@ namespace ml { template void Network::updateWeights(T learningRate) { /* - Update weights using gradient descent: + Update weights using the configured optimizer: For each layer, compute weight gradients from activations and errors, - then update weights: W = W + learningRate * input^T * error - The error already includes the gradient of the loss + then delegate to optimizer to update weights */ - if (!pInputLayer) return; + if (!pInputLayer || !mOptimizer) return; std::vector*> toProcess; std::set*> visited; @@ -745,6 +788,7 @@ namespace ml { if (!pCurLayer) continue; // Add siblings to processing queue + int siblingIdx = 0; for (ILayer* pNextLayer : pCurLayer->getSiblings()) { if (visited.find(pNextLayer) == visited.end()) { toProcess.push_back(pNextLayer); @@ -767,40 +811,37 @@ namespace ml { for (int b = 0; b < ILayer::GetNumBiasNodes(); ++b) pushBiasCol(activatedWithBias); - // Compute weight update: delta_W = error^T * activation (outer product) + // Compute weight gradients: delta_W = error^T * activation (outer product) // errors is (1, m) row vector, activatedWithBias is (1, n+bias) row vector // weights are (m, n+bias) // Result: delta_W (m, n+bias) where delta_W[i,j] = error[i] * activation[j] - // Manual outer product since Mult() doesn't handle this case properly - ml::Mat weightDelta(weights.size(), 0); + ml::Mat gradients(weights.size(), 0); int numOutputs = errors.size().cx; // m int numInputs = activatedWithBias.size().cx; // n+bias - for (int i = 0; i < numOutputs; ++i) { - T err = errors.getAt(0, i); - for (int j = 0; j < numInputs; ++j) { - T act = activatedWithBias.getAt(0, j); - weightDelta.setAt(i, j, err * act); + for (int row = 0; row < numOutputs; ++row) { + T err = errors.getAt(0, row); + for (int col = 0; col < numInputs; ++col) { + T act = activatedWithBias.getAt(0, col); + gradients.setAt(row, col, err * act); } } - // Create updated weights matrix + // Create unique key for this layer-sibling pair for optimizer state tracking + std::string layerKey = pCurLayer->getName() + "_to_" + pNextLayer->getName() + + "_" + std::to_string(i) + "_" + std::to_string(siblingIdx); + + // Get modifiable reference to weights ml::Mat updatedWeights = weights.Copy(); - for (int row = 0; row < updatedWeights.size().cy; ++row) { - for (int col = 0; col < updatedWeights.size().cx; ++col) { - T delta = weightDelta.getAt(row, col); - // Clip delta to prevent exploding gradients - if (std::abs(delta) > 10.0) { - delta = (delta > 0) ? 10.0 : -10.0; - } - T newWeight = updatedWeights.getAt(row, col) + learningRate * delta; - updatedWeights.setAt(row, col, newWeight); - } - } + + // Use optimizer to update weights + mOptimizer->updateWeights(updatedWeights, gradients, learningRate, layerKey); // Store the updated weights back pCurLayer->setWeights(pNextLayer, updatedWeights); + + siblingIdx++; } } } diff --git a/optimizer.h b/optimizer.h new file mode 100644 index 0000000..269421d --- /dev/null +++ b/optimizer.h @@ -0,0 +1,294 @@ +#pragma once + +#include "Matrix/matrix.h" +#include +#include +#include + +namespace ml { + + // Optimizer types + enum class OptimizerType { + SGD, + MOMENTUM, + ADAM + }; + + // Convert optimizer type to string for debugging/logging + inline const char* OptimizerTypeToString(OptimizerType type) { + switch (type) { + case OptimizerType::SGD: return "SGD"; + case OptimizerType::MOMENTUM: return "Momentum"; + case OptimizerType::ADAM: return "Adam"; + default: return "Unknown"; + } + } + + // Base optimizer interface + template + class IOptimizer { + public: + virtual ~IOptimizer() {} + + // Update weights given gradients + // layerKey is used to track state for each layer-sibling pair + virtual void updateWeights( + ml::Mat& weights, + const ml::Mat& gradients, + T learningRate, + const std::string& layerKey + ) = 0; + + // Reset optimizer state (useful when starting new training) + virtual void reset() = 0; + + // Get optimizer type + virtual OptimizerType getType() const = 0; + }; + + // ========== SGD OPTIMIZER ========== + // Stochastic Gradient Descent with optional gradient clipping + // Update: W = W + learningRate * gradient + template + class SGDOptimizer : public IOptimizer { + public: + SGDOptimizer(T gradientClipThreshold = 10.0) + : mGradientClipThreshold(gradientClipThreshold) {} + + virtual ~SGDOptimizer() {} + + virtual void updateWeights( + ml::Mat& weights, + const ml::Mat& gradients, + T learningRate, + const std::string& layerKey + ) override { + if (!weights.IsGood() || !gradients.IsGood()) return; + if (weights.size() != gradients.size()) return; + + for (int row = 0; row < weights.size().cy; ++row) { + for (int col = 0; col < weights.size().cx; ++col) { + T gradient = gradients.getAt(row, col); + + // Clip gradient to prevent exploding gradients + if (std::abs(gradient) > mGradientClipThreshold) { + gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold; + } + + T currentWeight = weights.getAt(row, col); + T newWeight = currentWeight + learningRate * gradient; + weights.setAt(row, col, newWeight); + } + } + } + + virtual void reset() override { + // SGD has no state to reset + } + + virtual OptimizerType getType() const override { + return OptimizerType::SGD; + } + + private: + T mGradientClipThreshold; + }; + + // ========== MOMENTUM OPTIMIZER ========== + // SGD with Momentum: accumulates velocity vector + // velocity = β * velocity + gradient + // W = W + learningRate * velocity + // + // Hyperparameters: + // - β (beta): momentum coefficient, typically 0.9 + // - Helps accelerate in relevant direction and dampen oscillations + template + class MomentumOptimizer : public IOptimizer { + public: + MomentumOptimizer(T beta = 0.9, T gradientClipThreshold = 10.0) + : mBeta(beta), mGradientClipThreshold(gradientClipThreshold) {} + + virtual ~MomentumOptimizer() {} + + virtual void updateWeights( + ml::Mat& weights, + const ml::Mat& gradients, + T learningRate, + const std::string& layerKey + ) override { + if (!weights.IsGood() || !gradients.IsGood()) return; + if (weights.size() != gradients.size()) return; + + // Initialize velocity for this layer if not exists + if (mVelocity.find(layerKey) == mVelocity.end()) { + mVelocity[layerKey] = ml::Mat(weights.size(), 0); + } + + ml::Mat& velocity = mVelocity[layerKey]; + + // Update velocity and weights + for (int row = 0; row < weights.size().cy; ++row) { + for (int col = 0; col < weights.size().cx; ++col) { + T gradient = gradients.getAt(row, col); + + // Clip gradient + if (std::abs(gradient) > mGradientClipThreshold) { + gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold; + } + + // Update velocity: v = β*v + gradient + T v = mBeta * velocity.getAt(row, col) + gradient; + velocity.setAt(row, col, v); + + // Update weights: W = W + learningRate * v + T currentWeight = weights.getAt(row, col); + T newWeight = currentWeight + learningRate * v; + weights.setAt(row, col, newWeight); + } + } + } + + virtual void reset() override { + mVelocity.clear(); + } + + virtual OptimizerType getType() const override { + return OptimizerType::MOMENTUM; + } + + T getBeta() const { return mBeta; } + void setBeta(T beta) { mBeta = beta; } + + private: + T mBeta; + T mGradientClipThreshold; + std::map> mVelocity; // velocity for each layer + }; + + // ========== ADAM OPTIMIZER ========== + // Adaptive Moment Estimation + // Combines momentum (first moment) with RMSprop (second moment) + // + // m = β1 * m + (1 - β1) * gradient (first moment - mean) + // v = β2 * v + (1 - β2) * gradient² (second moment - variance) + // m_hat = m / (1 - β1^t) (bias correction) + // v_hat = v / (1 - β2^t) (bias correction) + // W = W + learningRate * m_hat / (√v_hat + ε) + // + // Hyperparameters: + // - β1: typically 0.9 (first moment decay) + // - β2: typically 0.999 (second moment decay) + // - ε: typically 1e-8 (small constant for numerical stability) + template + class AdamOptimizer : public IOptimizer { + public: + AdamOptimizer(T beta1 = 0.9, T beta2 = 0.999, T epsilon = 1e-8, T gradientClipThreshold = 10.0) + : mBeta1(beta1), mBeta2(beta2), mEpsilon(epsilon), + mGradientClipThreshold(gradientClipThreshold) {} + + virtual ~AdamOptimizer() {} + + virtual void updateWeights( + ml::Mat& weights, + const ml::Mat& gradients, + T learningRate, + const std::string& layerKey + ) override { + if (!weights.IsGood() || !gradients.IsGood()) return; + if (weights.size() != gradients.size()) return; + + // Initialize moments for this layer if not exists + if (mFirstMoment.find(layerKey) == mFirstMoment.end()) { + mFirstMoment[layerKey] = ml::Mat(weights.size(), 0); + mSecondMoment[layerKey] = ml::Mat(weights.size(), 0); + mTimeStep[layerKey] = 0; + } + + ml::Mat& m = mFirstMoment[layerKey]; + ml::Mat& v = mSecondMoment[layerKey]; + int& t = mTimeStep[layerKey]; + t++; + + // Compute bias correction terms + T beta1_t = std::pow(mBeta1, t); + T beta2_t = std::pow(mBeta2, t); + T bias1Correction = 1.0 - beta1_t; + T bias2Correction = 1.0 - beta2_t; + + // Update moments and weights + for (int row = 0; row < weights.size().cy; ++row) { + for (int col = 0; col < weights.size().cx; ++col) { + T gradient = gradients.getAt(row, col); + + // Clip gradient + if (std::abs(gradient) > mGradientClipThreshold) { + gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold; + } + + // Update biased first moment estimate: m = β1*m + (1-β1)*g + T m_val = mBeta1 * m.getAt(row, col) + (1.0 - mBeta1) * gradient; + m.setAt(row, col, m_val); + + // Update biased second moment estimate: v = β2*v + (1-β2)*g² + T v_val = mBeta2 * v.getAt(row, col) + (1.0 - mBeta2) * gradient * gradient; + v.setAt(row, col, v_val); + + // Compute bias-corrected moment estimates + T m_hat = m_val / bias1Correction; + T v_hat = v_val / bias2Correction; + + // Update weights: W = W + α * m_hat / (√v_hat + ε) + T currentWeight = weights.getAt(row, col); + T update = learningRate * m_hat / (std::sqrt(v_hat) + mEpsilon); + T newWeight = currentWeight + update; + weights.setAt(row, col, newWeight); + } + } + } + + virtual void reset() override { + mFirstMoment.clear(); + mSecondMoment.clear(); + mTimeStep.clear(); + } + + virtual OptimizerType getType() const override { + return OptimizerType::ADAM; + } + + T getBeta1() const { return mBeta1; } + T getBeta2() const { return mBeta2; } + T getEpsilon() const { return mEpsilon; } + + void setBeta1(T beta1) { mBeta1 = beta1; } + void setBeta2(T beta2) { mBeta2 = beta2; } + void setEpsilon(T epsilon) { mEpsilon = epsilon; } + + private: + T mBeta1; + T mBeta2; + T mEpsilon; + T mGradientClipThreshold; + + std::map> mFirstMoment; // m - first moment (mean) + std::map> mSecondMoment; // v - second moment (variance) + std::map mTimeStep; // t - timestep for bias correction + }; + + // ========== OPTIMIZER FACTORY ========== + // Helper function to create optimizers + template + IOptimizer* CreateOptimizer(OptimizerType type) { + switch (type) { + case OptimizerType::SGD: + return new SGDOptimizer(); + case OptimizerType::MOMENTUM: + return new MomentumOptimizer(); + case OptimizerType::ADAM: + return new AdamOptimizer(); + default: + return new SGDOptimizer(); // Default fallback + } + } + +} // namespace ml diff --git a/test_activations_optimizers b/test_activations_optimizers new file mode 100755 index 0000000..9271556 Binary files /dev/null and b/test_activations_optimizers differ diff --git a/test_activations_optimizers.cpp b/test_activations_optimizers.cpp new file mode 100644 index 0000000..5c4a5a6 --- /dev/null +++ b/test_activations_optimizers.cpp @@ -0,0 +1,349 @@ +#include +#include +#include +#include "network.h" + +using namespace ml; +using namespace std; + +// Helper function to check if two values are approximately equal +template +bool approxEqual(T a, T b, T epsilon = 1e-5) { + return std::abs(a - b) < epsilon; +} + +// Test activation functions +void testActivationFunctions() { + cout << "\n=== Testing Activation Functions ===" << endl; + + // Test Sigmoid + { + Mat input(1, 3, 0); + input.setAt(0, 0, -1.0); + input.setAt(0, 1, 0.0); + input.setAt(0, 2, 1.0); + + Mat output = Sigmoid(input); + assert(approxEqual(output.getAt(0, 0), 0.2689414, 1e-5)); + assert(approxEqual(output.getAt(0, 1), 0.5, 1e-5)); + assert(approxEqual(output.getAt(0, 2), 0.7310586, 1e-5)); + + Mat grad = SigmoidGrad(output); + assert(approxEqual(grad.getAt(0, 0), 0.1966119, 1e-5)); + assert(approxEqual(grad.getAt(0, 1), 0.25, 1e-5)); + assert(approxEqual(grad.getAt(0, 2), 0.1966119, 1e-5)); + + cout << "✓ Sigmoid activation and gradient test passed" << endl; + } + + // Test ReLU + { + Mat input(1, 4, 0); + input.setAt(0, 0, -2.0); + input.setAt(0, 1, -0.5); + input.setAt(0, 2, 0.0); + input.setAt(0, 3, 1.5); + + Mat output = ReLU(input); + assert(approxEqual(output.getAt(0, 0), 0.0)); + assert(approxEqual(output.getAt(0, 1), 0.0)); + assert(approxEqual(output.getAt(0, 2), 0.0)); + assert(approxEqual(output.getAt(0, 3), 1.5)); + + Mat grad = ReLUGrad(output); + assert(approxEqual(grad.getAt(0, 0), 0.0)); + assert(approxEqual(grad.getAt(0, 1), 0.0)); + assert(approxEqual(grad.getAt(0, 2), 0.0)); + assert(approxEqual(grad.getAt(0, 3), 1.0)); + + cout << "✓ ReLU activation and gradient test passed" << endl; + } + + // Test Leaky ReLU + { + Mat input(1, 3, 0); + input.setAt(0, 0, -1.0); + input.setAt(0, 1, 0.0); + input.setAt(0, 2, 1.0); + + Mat output = LeakyReLU(input, 0.01); + assert(approxEqual(output.getAt(0, 0), -0.01)); + assert(approxEqual(output.getAt(0, 1), 0.0)); + assert(approxEqual(output.getAt(0, 2), 1.0)); + + Mat grad = LeakyReLUGrad(output, 0.01); + assert(approxEqual(grad.getAt(0, 0), 0.01)); + assert(approxEqual(grad.getAt(0, 1), 0.01)); // gradient is alpha at x=0 + assert(approxEqual(grad.getAt(0, 2), 1.0)); + + cout << "✓ Leaky ReLU activation and gradient test passed" << endl; + } + + // Test Tanh + { + Mat input(1, 3, 0); + input.setAt(0, 0, -1.0); + input.setAt(0, 1, 0.0); + input.setAt(0, 2, 1.0); + + Mat output = Tanh(input); + assert(approxEqual(output.getAt(0, 0), -0.7615942, 1e-5)); + assert(approxEqual(output.getAt(0, 1), 0.0)); + assert(approxEqual(output.getAt(0, 2), 0.7615942, 1e-5)); + + Mat grad = TanhGrad(output); + assert(approxEqual(grad.getAt(0, 0), 0.4199743, 1e-5)); + assert(approxEqual(grad.getAt(0, 1), 1.0)); + assert(approxEqual(grad.getAt(0, 2), 0.4199743, 1e-5)); + + cout << "✓ Tanh activation and gradient test passed" << endl; + } + + // Test Softmax + { + Mat input(1, 3, 0); + input.setAt(0, 0, 1.0); + input.setAt(0, 1, 2.0); + input.setAt(0, 2, 3.0); + + Mat output = Softmax(input); + double sum = output.getAt(0, 0) + output.getAt(0, 1) + output.getAt(0, 2); + assert(approxEqual(sum, 1.0, 1e-5)); + assert(approxEqual(output.getAt(0, 0), 0.0900306, 1e-5)); + assert(approxEqual(output.getAt(0, 1), 0.2447285, 1e-5)); + assert(approxEqual(output.getAt(0, 2), 0.6652409, 1e-5)); + + cout << "✓ Softmax activation test passed" << endl; + } + + // Test Linear + { + Mat input(1, 3, 0); + input.setAt(0, 0, -1.5); + input.setAt(0, 1, 0.0); + input.setAt(0, 2, 2.5); + + Mat output = Linear(input); + assert(approxEqual(output.getAt(0, 0), -1.5)); + assert(approxEqual(output.getAt(0, 1), 0.0)); + assert(approxEqual(output.getAt(0, 2), 2.5)); + + Mat grad = LinearGrad(output); + assert(approxEqual(grad.getAt(0, 0), 1.0)); + assert(approxEqual(grad.getAt(0, 1), 1.0)); + assert(approxEqual(grad.getAt(0, 2), 1.0)); + + cout << "✓ Linear activation and gradient test passed" << endl; + } + + // Test unified interface + { + Mat input(1, 2, 0); + input.setAt(0, 0, -1.0); + input.setAt(0, 1, 1.0); + + Mat sigmoidOut = Activate(input, ActivationType::SIGMOID); + Mat reluOut = Activate(input, ActivationType::RELU); + Mat tanhOut = Activate(input, ActivationType::TANH); + + assert(sigmoidOut.getAt(0, 0) > 0.0 && sigmoidOut.getAt(0, 0) < 1.0); + assert(reluOut.getAt(0, 0) == 0.0 && reluOut.getAt(0, 1) == 1.0); + assert(tanhOut.getAt(0, 0) < 0.0 && tanhOut.getAt(0, 1) > 0.0); + + cout << "✓ Unified activation interface test passed" << endl; + } + + cout << "\n✅ All activation function tests passed!\n" << endl; +} + +// Test optimizers +void testOptimizers() { + cout << "\n=== Testing Optimizers ===" << endl; + + // Test SGD + { + SGDOptimizer sgd; + Mat weights(2, 2, 0); + weights.setAt(0, 0, 1.0); + weights.setAt(0, 1, 2.0); + weights.setAt(1, 0, 3.0); + weights.setAt(1, 1, 4.0); + + Mat gradients(2, 2, 0); + gradients.setAt(0, 0, 0.1); + gradients.setAt(0, 1, 0.2); + gradients.setAt(1, 0, 0.3); + gradients.setAt(1, 1, 0.4); + + sgd.updateWeights(weights, gradients, 0.1, "test_layer"); + + // SGD: w = w + lr * grad + assert(approxEqual(weights.getAt(0, 0), 1.01)); + assert(approxEqual(weights.getAt(0, 1), 2.02)); + assert(approxEqual(weights.getAt(1, 0), 3.03)); + assert(approxEqual(weights.getAt(1, 1), 4.04)); + + cout << "✓ SGD optimizer test passed" << endl; + } + + // Test Momentum + { + MomentumOptimizer momentum(0.9); + Mat weights(2, 2, 0); + weights.setAt(0, 0, 1.0); + weights.setAt(0, 1, 2.0); + weights.setAt(1, 0, 3.0); + weights.setAt(1, 1, 4.0); + + Mat gradients(2, 2, 0); + gradients.setAt(0, 0, 0.1); + gradients.setAt(0, 1, 0.2); + gradients.setAt(1, 0, 0.3); + gradients.setAt(1, 1, 0.4); + + // First update: v = 0*0.9 + grad = grad, w = w + lr * v + momentum.updateWeights(weights, gradients, 0.1, "test_layer"); + assert(approxEqual(weights.getAt(0, 0), 1.01)); + assert(approxEqual(weights.getAt(0, 1), 2.02)); + + // Second update: v = prev_v*0.9 + grad, w = w + lr * v + momentum.updateWeights(weights, gradients, 0.1, "test_layer"); + // v[0,0] = 0.1*0.9 + 0.1 = 0.19, w[0,0] = 1.01 + 0.1*0.19 = 1.029 + assert(approxEqual(weights.getAt(0, 0), 1.029, 1e-5)); + + cout << "✓ Momentum optimizer test passed" << endl; + } + + // Test Adam + { + AdamOptimizer adam(0.9, 0.999, 1e-8); + Mat weights(2, 2, 0); + weights.setAt(0, 0, 1.0); + weights.setAt(0, 1, 2.0); + weights.setAt(1, 0, 3.0); + weights.setAt(1, 1, 4.0); + + Mat gradients(2, 2, 0); + gradients.setAt(0, 0, 0.1); + gradients.setAt(0, 1, 0.2); + gradients.setAt(1, 0, 0.3); + gradients.setAt(1, 1, 0.4); + + double w00_before = weights.getAt(0, 0); + adam.updateWeights(weights, gradients, 0.01, "test_layer"); + double w00_after = weights.getAt(0, 0); + + // Adam should update weights (exact value depends on bias correction) + assert(w00_after > w00_before); + + // Second update should also work + adam.updateWeights(weights, gradients, 0.01, "test_layer"); + assert(weights.getAt(0, 0) > w00_after); + + cout << "✓ Adam optimizer test passed" << endl; + } + + // Test optimizer reset + { + MomentumOptimizer momentum(0.9); + Mat weights(2, 2, 1.0); + Mat gradients(2, 2, 0.1); + + momentum.updateWeights(weights, gradients, 0.1, "test_layer"); + double w1 = weights.getAt(0, 0); + + momentum.updateWeights(weights, gradients, 0.1, "test_layer"); + double w2 = weights.getAt(0, 0); + + // Reset and update again + momentum.reset(); + weights.setAt(0, 0, 1.0); // Reset weight + momentum.updateWeights(weights, gradients, 0.1, "test_layer"); + double w3 = weights.getAt(0, 0); + + // After reset, first update should be same as initial first update + assert(approxEqual(w1, w3)); + + cout << "✓ Optimizer reset test passed" << endl; + } + + cout << "\n✅ All optimizer tests passed!\n" << endl; +} + +// Test network with different activations +void testNetworkWithActivations() { + cout << "\n=== Testing Network with Different Activations ===" << endl; + + // Create a simple network with ReLU activation + Layer* input = new Layer(2, "input", ActivationType::RELU); + Layer* hidden = new Layer(3, "hidden", ActivationType::RELU); + Layer* output = new Layer(1, "output", ActivationType::SIGMOID); + + Network net; + net.connect(input, hidden); + net.connect(hidden, output); + net.setInputLayer(input); + net.setOutputLayer(output); + net.init(); + + // Test feed forward + Mat testInput(1, 2, 0); + testInput.setAt(0, 0, 0.5); + testInput.setAt(0, 1, 0.3); + + Mat result = net.feed(testInput); + assert(result.IsGood()); + assert(result.size().cx == 1); // Output should be 1D + + cout << "✓ Network with ReLU activation test passed" << endl; + + // Test with Adam optimizer + net.setOptimizerType(OptimizerType::ADAM); + assert(net.getOptimizer()->getType() == OptimizerType::ADAM); + + Mat target(1, 1, 0); + target.setAt(0, 0, 1.0); + + result = net.feed(testInput); + output->setErrors(ml::Diff(target, result)); + net.backprop(); + net.updateWeights(0.01); + + cout << "✓ Network with Adam optimizer test passed" << endl; + + // Test with Momentum optimizer + net.setOptimizerType(OptimizerType::MOMENTUM); + assert(net.getOptimizer()->getType() == OptimizerType::MOMENTUM); + + result = net.feed(testInput); + output->setErrors(ml::Diff(target, result)); + net.backprop(); + net.updateWeights(0.01); + + cout << "✓ Network with Momentum optimizer test passed" << endl; + + cout << "\n✅ All network integration tests passed!\n" << endl; +} + +int main() { + cout << "\n========================================" << endl; + cout << "Testing Activation Functions & Optimizers" << endl; + cout << "========================================" << endl; + + try { + testActivationFunctions(); + testOptimizers(); + testNetworkWithActivations(); + + cout << "\n========================================" << endl; + cout << "🎉 ALL TESTS PASSED! 🎉" << endl; + cout << "========================================\n" << endl; + return 0; + } catch (const std::exception& e) { + cout << "\n❌ TEST FAILED: " << e.what() << endl; + return 1; + } catch (...) { + cout << "\n❌ TEST FAILED: Unknown error" << endl; + return 1; + } +} diff --git a/utility.h b/utility.h index ece2826..c2219a3 100644 --- a/utility.h +++ b/utility.h @@ -7,6 +7,9 @@ #include #include #include +#if defined(_OPENMP) +#include +#endif #ifdef _DEBUG #define DEBUG(t, message) (Utility::Debug(t, message))