diff --git a/README.md b/README.md
index 00b7d9b..f428d11 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,68 @@
 # Neural Network Library
 
-A C++ implementation of a basic neural network with forward and backward propagation, featuring flexible layer architecture and comprehensive unit tests.
+A C++ implementation of a flexible neural network library with forward and backward propagation, featuring multiple activation functions, modern optimizers, and comprehensive unit tests.
 
 ## Features
 
-- ✅ Forward propagation with sigmoid activation
-- ✅ Backward propagation for error computation
-- ✅ Flexible layer architecture (supports arbitrary network topologies)
-- ✅ Matrix operations optimized with OpenMP
-- ✅ Model serialization (save/load weights to/from JSON files)
+### ✅ What's Available
+
+**Network Architecture:**
+- ✅ Fully connected (dense) feedforward networks
+- ✅ Arbitrary layer sizes and network depths
+- ✅ Flexible layer connections (supports arbitrary network topologies)
+- ✅ Composable networks (networks as layers)
+
+**Activation Functions:**
+- ✅ **Sigmoid** - Classic activation, range (0, 1)
+- ✅ **ReLU** - Rectified Linear Unit, best for hidden layers
+- ✅ **Leaky ReLU** - Prevents dying ReLU problem
+- ✅ **Tanh** - Hyperbolic tangent, range (-1, 1)
+- ✅ **Softmax** - Multi-class classification output
+- ✅ **ELU** - Exponential Linear Unit
+- ✅ **SELU** - Self-normalizing ELU variant
+- ✅ **Linear** - Identity function for regression
+
+**Optimizers:**
+- ✅ **SGD** - Stochastic Gradient Descent with gradient clipping
+- ✅ **Momentum** - SGD with momentum (β=0.9 default)
+- ✅ **Adam** - Adaptive moment estimation (β1=0.9, β2=0.999)
+
+**Training & Optimization:**
+- ✅ Backpropagation for error computation
+- ✅ Xavier/Glorot weight initialization
+- ✅ Gradient clipping for stability
+- ✅ Configurable learning rates
+
+**Infrastructure:**
+- ✅ Matrix operations optimized with OpenMP and SSE
+- ✅ Model serialization (save/load weights to/from JSON)
 - ✅ Comprehensive unit test suite
 - ✅ CI/CD with GitHub Actions
+- ✅ Cross-platform (x86/x64 and ARM/Apple Silicon)
+
+### ❌ What's Missing
+
+**Advanced Network Types:**
+- ❌ Convolutional layers (CNN)
+- ❌ Pooling layers (MaxPool, AvgPool)
+- ❌ Recurrent layers (RNN, LSTM, GRU)
+- ❌ Attention mechanisms
+
+**Training Features:**
+- ❌ Batch training (currently sample-by-sample only)
+- ❌ Mini-batch gradient descent
+- ❌ Batch normalization
+- ❌ Dropout regularization
+- ❌ L1/L2 weight regularization
+
+**Loss Functions:**
+- ❌ Cross-entropy loss (currently using MSE only)
+- ❌ Multiple loss function options
+
+**Advanced Optimizers:**
+- ❌ AdaGrad, RMSprop
+- ❌ Learning rate scheduling
+- ❌ Adaptive learning rate decay
 
 ## Building
 
@@ -32,6 +84,12 @@ make test_network
 make test_model_save_load
 ./test_model_save_load
 
+# Run activation functions and optimizer tests
+g++ -std=c++11 -O3 -fopenmp -msse2 -I. \
+    test_activations_optimizers.cpp Matrix/matrix.cpp thirdparty/jsonxx/jsonxx.cpp \
+    -o test_activations_optimizers
+./test_activations_optimizers
+
 # Or use CTest to run all tests
 ctest --output-on-failure
 ```
@@ -123,17 +181,21 @@ This generates:
 
 ## Usage Example
 
+### Basic Network with Custom Activations
+
 ```cpp
 #include "network.h"
 
 using namespace ml;
 
-// Create a simple 3-layer network: 2 inputs -> 4 hidden -> 1 output
+// Create a 3-layer network with ReLU hidden layers and sigmoid output
+// Architecture: 2 inputs -> 4 hidden (ReLU) -> 1 output (Sigmoid)
 Network<double>* network = new Network<double>();
 
-ILayer<double>* inputLayer  = new Layer<double>(2, "Input");
-ILayer<double>* hiddenLayer = new Layer<double>(4, "Hidden");
-ILayer<double>* outputLayer = new Layer<double>(1, "Output");
+// Create layers with specific activation functions
+Layer<double>* inputLayer  = new Layer<double>(2, "Input", ActivationType::LINEAR);
+Layer<double>* hiddenLayer = new Layer<double>(4, "Hidden", ActivationType::RELU);
+Layer<double>* outputLayer = new Layer<double>(1, "Output", ActivationType::SIGMOID);
 
 // Connect layers
 network->setInputLayer(inputLayer);
@@ -141,6 +203,9 @@ network->connect(inputLayer, hiddenLayer);
 network->connect(hiddenLayer, outputLayer);
 network->setOutputLayer(outputLayer);
 
+// Set optimizer (default is SGD)
+network->setOptimizerType(OptimizerType::ADAM);  // or MOMENTUM, SGD
+
 // Initialize weights
 network->init();
 
@@ -151,15 +216,47 @@ input.setAt(0, 1, 0.5);
 
 Mat<double> output = network->feed(input);
 
-// Backward pass
+// Backward pass and weight update
 Mat<double> targetOutput(1, 1, 0.8);
 Mat<double> error = Diff(targetOutput, output);
 outputLayer->setErrors(error);
 network->backprop();
+network->updateWeights(0.01);  // learning rate = 0.01
 
-// Access propagated errors
-Mat<double> hiddenErrors = hiddenLayer->getErrors();
-Mat<double> inputErrors = inputLayer->getErrors();
+// Training loop example
+for (int epoch = 0; epoch < 1000; epoch++) {
+    Mat<double> output = network->feed(input);
+    outputLayer->setErrors(Diff(targetOutput, output));
+    network->backprop();
+    network->updateWeights(0.01);
+}
+```
+
+### Available Activation Functions
+
+```cpp
+// When creating layers, specify activation type:
+Layer<double>* layer1 = new Layer<double>(10, "Layer1", ActivationType::RELU);
+Layer<double>* layer2 = new Layer<double>(10, "Layer2", ActivationType::TANH);
+Layer<double>* layer3 = new Layer<double>(10, "Layer3", ActivationType::SIGMOID);
+Layer<double>* layer4 = new Layer<double>(10, "Layer4", ActivationType::SOFTMAX);
+
+// For Leaky ReLU or ELU, you can specify the alpha parameter:
+Layer<double>* leaky = new Layer<double>(10, "Leaky", ActivationType::LEAKY_RELU, 0.01);
+Layer<double>* elu = new Layer<double>(10, "ELU", ActivationType::ELU, 1.0);
+```
+
+### Available Optimizers
+
+```cpp
+// Set optimizer type
+network->setOptimizerType(OptimizerType::SGD);       // Basic SGD with gradient clipping
+network->setOptimizerType(OptimizerType::MOMENTUM);  // SGD with momentum (β=0.9)
+network->setOptimizerType(OptimizerType::ADAM);      // Adam optimizer (β1=0.9, β2=0.999)
+
+// Or create and set a custom optimizer
+AdamOptimizer<double>* adam = new AdamOptimizer<double>(0.9, 0.999, 1e-8);
+network->setOptimizer(adam);
 ```
 
 ## Matrix Operations
@@ -173,11 +270,14 @@ Mat<double> result = ElementMult(m1, m2);
 // Matrix multiplication
 Mat<double> result = Mult(m1, m2);
 
-// Sigmoid activation
-Mat<double> activated = Sigmoid(input);
+// Activation functions (using unified interface)
+Mat<double> activated = Activate(input, ActivationType::RELU);
+Mat<double> grad = ActivateGrad(activated, ActivationType::RELU);
 
-// Sigmoid gradient
-Mat<double> grad = SigGrad(activated);
+// Or use specific activation functions directly
+Mat<double> sigmoid_out = Sigmoid(input);
+Mat<double> relu_out = ReLU(input);
+Mat<double> tanh_out = Tanh(input);
 ```
 
 ## Model Serialization
diff --git a/activation.h b/activation.h
new file mode 100644
index 0000000..f5cb649
--- /dev/null
+++ b/activation.h
@@ -0,0 +1,343 @@
+#pragma once
+
+#include "Matrix/matrix.h"
+#include <cmath>
+#include <algorithm>
+
+namespace ml {
+
+    // Activation function types
+    enum class ActivationType {
+        SIGMOID,
+        RELU,
+        LEAKY_RELU,
+        TANH,
+        SOFTMAX,
+        ELU,
+        SELU,
+        LINEAR
+    };
+
+    // Convert activation type to string for debugging/logging
+    inline const char* ActivationTypeToString(ActivationType type) {
+        switch (type) {
+            case ActivationType::SIGMOID: return "Sigmoid";
+            case ActivationType::RELU: return "ReLU";
+            case ActivationType::LEAKY_RELU: return "LeakyReLU";
+            case ActivationType::TANH: return "Tanh";
+            case ActivationType::SOFTMAX: return "Softmax";
+            case ActivationType::ELU: return "ELU";
+            case ActivationType::SELU: return "SELU";
+            case ActivationType::LINEAR: return "Linear";
+            default: return "Unknown";
+        }
+    }
+
+    // ========== ACTIVATION FUNCTIONS ==========
+
+    // Sigmoid: σ(x) = 1 / (1 + e^(-x))
+    // Range: (0, 1)
+    // Use: Binary classification output, historically used in hidden layers
+    template <typename T>
+    ml::Mat<T> Sigmoid(const ml::Mat<T>& mat) {
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, 1.0 / (1.0 + std::exp(-val)));
+            }
+        }
+        return result;
+    }
+
+    // ReLU: f(x) = max(0, x)
+    // Range: [0, ∞)
+    // Use: Default choice for hidden layers in deep networks
+    template <typename T>
+    ml::Mat<T> ReLU(const ml::Mat<T>& mat) {
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, std::max(T(0), val));
+            }
+        }
+        return result;
+    }
+
+    // Leaky ReLU: f(x) = max(αx, x) where α is typically 0.01
+    // Range: (-∞, ∞)
+    // Use: Prevents dying ReLU problem by allowing small negative values
+    template <typename T>
+    ml::Mat<T> LeakyReLU(const ml::Mat<T>& mat, T alpha = 0.01) {
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, val > 0 ? val : alpha * val);
+            }
+        }
+        return result;
+    }
+
+    // Tanh: f(x) = (e^x - e^(-x)) / (e^x + e^(-x))
+    // Range: (-1, 1)
+    // Use: Hidden layers, zero-centered (unlike sigmoid)
+    template <typename T>
+    ml::Mat<T> Tanh(const ml::Mat<T>& mat) {
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, std::tanh(val));
+            }
+        }
+        return result;
+    }
+
+    // Softmax: f(x_i) = e^(x_i) / Σ(e^(x_j))
+    // Range: (0, 1), outputs sum to 1
+    // Use: Multi-class classification output layer
+    template <typename T>
+    ml::Mat<T> Softmax(const ml::Mat<T>& mat) {
+        ml::Mat<T> result(mat.size(), 0);
+
+        // Process each row independently (each row is a sample)
+        for (int i = 0; i < mat.size().cy; ++i) {
+            // Find max for numerical stability
+            T maxVal = mat.getAt(i, 0);
+            for (int j = 1; j < mat.size().cx; ++j) {
+                maxVal = std::max(maxVal, mat.getAt(i, j));
+            }
+
+            // Compute exp(x - max) and sum
+            T sum = 0;
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T expVal = std::exp(mat.getAt(i, j) - maxVal);
+                result.setAt(i, j, expVal);
+                sum += expVal;
+            }
+
+            // Normalize
+            for (int j = 0; j < mat.size().cx; ++j) {
+                result.setAt(i, j, result.getAt(i, j) / sum);
+            }
+        }
+        return result;
+    }
+
+    // ELU: f(x) = x if x > 0, α(e^x - 1) if x ≤ 0
+    // Range: (-α, ∞) where α is typically 1.0
+    // Use: Can produce negative outputs, smoother than ReLU
+    template <typename T>
+    ml::Mat<T> ELU(const ml::Mat<T>& mat, T alpha = 1.0) {
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, val > 0 ? val : alpha * (std::exp(val) - 1));
+            }
+        }
+        return result;
+    }
+
+    // SELU: Self-normalizing variant of ELU with specific constants
+    // λ ≈ 1.0507, α ≈ 1.67326
+    // Range: (-λα, ∞)
+    // Use: Self-normalizing properties for deep networks
+    template <typename T>
+    ml::Mat<T> SELU(const ml::Mat<T>& mat) {
+        const T lambda = 1.0507009873554804934193349852946;
+        const T alpha = 1.6732632423543772848170429916717;
+
+        ml::Mat<T> result(mat.size(), 0);
+        for (int i = 0; i < mat.size().cy; ++i) {
+            for (int j = 0; j < mat.size().cx; ++j) {
+                T val = mat.getAt(i, j);
+                result.setAt(i, j, lambda * (val > 0 ? val : alpha * (std::exp(val) - 1)));
+            }
+        }
+        return result;
+    }
+
+    // Linear: f(x) = x
+    // Range: (-∞, ∞)
+    // Use: Regression output layer
+    template <typename T>
+    ml::Mat<T> Linear(const ml::Mat<T>& mat) {
+        return mat.Copy();
+    }
+
+    // ========== ACTIVATION GRADIENTS ==========
+
+    // Sigmoid gradient: σ'(x) = σ(x) * (1 - σ(x))
+    // Note: Input should be the ACTIVATED values, not raw inputs
+    template <typename T>
+    ml::Mat<T> SigmoidGrad(const ml::Mat<T>& activated) {
+        return ml::ElementMult(activated, ml::Diff<T>(1, activated));
+    }
+
+    // ReLU gradient: f'(x) = 1 if x > 0, 0 otherwise
+    // Note: Input should be the ACTIVATED values
+    template <typename T>
+    ml::Mat<T> ReLUGrad(const ml::Mat<T>& activated) {
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                T val = activated.getAt(i, j);
+                result.setAt(i, j, val > 0 ? T(1) : T(0));
+            }
+        }
+        return result;
+    }
+
+    // Leaky ReLU gradient: f'(x) = 1 if x > 0, α otherwise
+    template <typename T>
+    ml::Mat<T> LeakyReLUGrad(const ml::Mat<T>& activated, T alpha = 0.01) {
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                T val = activated.getAt(i, j);
+                result.setAt(i, j, val > 0 ? T(1) : alpha);
+            }
+        }
+        return result;
+    }
+
+    // Tanh gradient: f'(x) = 1 - tanh²(x)
+    // Note: Input should be the ACTIVATED values
+    template <typename T>
+    ml::Mat<T> TanhGrad(const ml::Mat<T>& activated) {
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                T val = activated.getAt(i, j);
+                result.setAt(i, j, 1 - val * val);
+            }
+        }
+        return result;
+    }
+
+    // Softmax gradient: For cross-entropy loss, gradient simplifies to (predicted - target)
+    // This is handled in the loss function, so we return 1s
+    // For general case: ∂y_i/∂x_j = y_i(δ_ij - y_j) where δ is Kronecker delta
+    template <typename T>
+    ml::Mat<T> SoftmaxGrad(const ml::Mat<T>& activated) {
+        // When used with cross-entropy loss, the gradient simplifies
+        // and is typically computed differently in the loss function
+        // For now, return identity (will be properly handled in loss computation)
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                result.setAt(i, j, T(1));
+            }
+        }
+        return result;
+    }
+
+    // ELU gradient: f'(x) = 1 if x > 0, α*e^x if x ≤ 0
+    // Note: For activated values, if val > 0 then gradient is 1
+    // If val ≤ 0, then val = α(e^x - 1), so e^x = val/α + 1
+    template <typename T>
+    ml::Mat<T> ELUGrad(const ml::Mat<T>& activated, T alpha = 1.0) {
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                T val = activated.getAt(i, j);
+                if (val > 0) {
+                    result.setAt(i, j, T(1));
+                } else {
+                    // val = α(e^x - 1), so gradient = α*e^x = val + α
+                    result.setAt(i, j, val + alpha);
+                }
+            }
+        }
+        return result;
+    }
+
+    // SELU gradient: Similar to ELU but with SELU constants
+    template <typename T>
+    ml::Mat<T> SELUGrad(const ml::Mat<T>& activated) {
+        const T lambda = 1.0507009873554804934193349852946;
+        const T alpha = 1.6732632423543772848170429916717;
+
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                T val = activated.getAt(i, j);
+                if (val > 0) {
+                    result.setAt(i, j, lambda);
+                } else {
+                    // val = λ*α(e^x - 1), gradient = λ*α*e^x = val + λ*α
+                    result.setAt(i, j, val + lambda * alpha);
+                }
+            }
+        }
+        return result;
+    }
+
+    // Linear gradient: f'(x) = 1
+    template <typename T>
+    ml::Mat<T> LinearGrad(const ml::Mat<T>& activated) {
+        ml::Mat<T> result(activated.size(), 0);
+        for (int i = 0; i < activated.size().cy; ++i) {
+            for (int j = 0; j < activated.size().cx; ++j) {
+                result.setAt(i, j, T(1));
+            }
+        }
+        return result;
+    }
+
+    // ========== UNIFIED INTERFACE ==========
+
+    // Apply activation function based on type
+    template <typename T>
+    ml::Mat<T> Activate(const ml::Mat<T>& input, ActivationType type, T alpha = 0.01) {
+        switch (type) {
+            case ActivationType::SIGMOID:
+                return Sigmoid(input);
+            case ActivationType::RELU:
+                return ReLU(input);
+            case ActivationType::LEAKY_RELU:
+                return LeakyReLU(input, alpha);
+            case ActivationType::TANH:
+                return Tanh(input);
+            case ActivationType::SOFTMAX:
+                return Softmax(input);
+            case ActivationType::ELU:
+                return ELU(input, alpha);
+            case ActivationType::SELU:
+                return SELU(input);
+            case ActivationType::LINEAR:
+                return Linear(input);
+            default:
+                return Sigmoid(input); // Default fallback
+        }
+    }
+
+    // Apply activation gradient based on type
+    template <typename T>
+    ml::Mat<T> ActivateGrad(const ml::Mat<T>& activated, ActivationType type, T alpha = 0.01) {
+        switch (type) {
+            case ActivationType::SIGMOID:
+                return SigmoidGrad(activated);
+            case ActivationType::RELU:
+                return ReLUGrad(activated);
+            case ActivationType::LEAKY_RELU:
+                return LeakyReLUGrad(activated, alpha);
+            case ActivationType::TANH:
+                return TanhGrad(activated);
+            case ActivationType::SOFTMAX:
+                return SoftmaxGrad(activated);
+            case ActivationType::ELU:
+                return ELUGrad(activated, alpha);
+            case ActivationType::SELU:
+                return SELUGrad(activated);
+            case ActivationType::LINEAR:
+                return LinearGrad(activated);
+            default:
+                return SigmoidGrad(activated); // Default fallback
+        }
+    }
+
+} // namespace ml
diff --git a/network.h b/network.h
index 06bb63f..e177aec 100644
--- a/network.h
+++ b/network.h
@@ -5,8 +5,12 @@
 #include <set>
 #include <algorithm>
 #include <fstream>
+#include <memory>
 #include "math/rect.h"
 #include "Matrix/matrix.h"
+#include "utility.h"
+#include "activation.h"
+#include "optimizer.h"
 #include "thirdparty/jsonxx/jsonxx.h"
 
 
@@ -347,7 +351,7 @@ namespace ml {
     public:
         typedef ILayer<T> baseclass;
     public:
-        Layer(int numNodes, std::string name = "");
+        Layer(int numNodes, std::string name = "", ActivationType activation = ActivationType::SIGMOID, T alpha = 0.01);
         virtual ~Layer();
 
     protected:
@@ -362,12 +366,24 @@ namespace ml {
         virtual ml::Mat<T> getInput() override;
         virtual ml::Mat<T> getActivatedInput() override;
         virtual void setActivatedInput(ml::Mat<T> activatedInput) override;
+
+        // Activation configuration
+        ActivationType getActivationType() const { return mActivationType; }
+        void setActivationType(ActivationType type) { mActivationType = type; }
+        T getActivationAlpha() const { return mActivationAlpha; }
+        void setActivationAlpha(T alpha) { mActivationAlpha = alpha; }
+
+    private:
+        ActivationType mActivationType;
+        T mActivationAlpha; // For Leaky ReLU, ELU, etc.
     };
 
 
     template <typename T>
-    Layer<T>::Layer(int numNodes, std::string name) : baseclass() {
+    Layer<T>::Layer(int numNodes, std::string name, ActivationType activation, T alpha) : baseclass() {
         this->setNumInputNodes(numNodes);
+        mActivationType = activation;
+        mActivationAlpha = alpha;
         common_construct();
         baseclass::setName(name);
     }
@@ -398,9 +414,9 @@ namespace ml {
         if (!inputMat.IsGood())
             return;
 
-        // Store the raw input and compute activation
+        // Store the raw input and compute activation using configured activation function
         this->mInput     = inputMat.Copy();
-        this->mActivated = Sigmoid<T>(this->mInput);
+        this->mActivated = Activate<T>(this->mInput, mActivationType, mActivationAlpha);
 
         // CRITICAL FIX: Add bias to the ACTIVATED values, not the pre-activation values
         // Create a copy of activated values with bias for forward propagation
@@ -483,6 +499,15 @@ namespace ml {
         virtual bool        saveToFile(const std::string& filename);
         virtual bool        loadFromFile(const std::string& filename);
 
+        // Optimizer configuration
+        void setOptimizer(IOptimizer<T>* optimizer) { mOptimizer = optimizer; mOwnsOptimizer = false; }
+        IOptimizer<T>* getOptimizer() const { return mOptimizer; }
+        void setOptimizerType(OptimizerType type) {
+            if (mOptimizer && mOwnsOptimizer) delete mOptimizer;
+            mOptimizer = CreateOptimizer<T>(type);
+            mOwnsOptimizer = true;
+        }
+
         // ILayer<T> overrides
     public:
         virtual void        init() override;
@@ -525,6 +550,8 @@ namespace ml {
     private:
         ILayer<T>* pInputLayer;
         ILayer<T>* pOutputLayer;
+        IOptimizer<T>* mOptimizer;
+        bool mOwnsOptimizer;
     };
 
 
@@ -544,11 +571,18 @@ namespace ml {
         //ILayer<T>::AddToLayersMap(this);
         pInputLayer = NULL;
         pOutputLayer = NULL;
+        // Default to SGD optimizer
+        mOptimizer = new SGDOptimizer<T>();
+        mOwnsOptimizer = true;
     }
 
     template <typename T>
     Network<T>::~Network() {
         mOutputLayerSiblingCache.clear();
+        if (mOptimizer && mOwnsOptimizer) {
+            delete mOptimizer;
+            mOptimizer = NULL;
+        }
     }
 
 
@@ -695,7 +729,17 @@ namespace ml {
                 // Compute weighted error: W^T * error = (n+bias, m) * (m, 1) = (n+bias, 1)
                 ml::Mat<T> weightedErr = ml::Mult<T>(weightsT, errorCol, true);
 
-                ml::Mat<T> deltaSig = SigGrad<T>(activatedInput);
+                // Get activation gradient based on layer's activation type
+                Layer<T>* pPrevLayerConcrete = dynamic_cast<Layer<T>*>(pPrevLayer);
+                ml::Mat<T> deltaSig;
+                if (pPrevLayerConcrete) {
+                    deltaSig = ActivateGrad<T>(activatedInput,
+                                               pPrevLayerConcrete->getActivationType(),
+                                               pPrevLayerConcrete->getActivationAlpha());
+                } else {
+                    // Fallback to sigmoid for non-Layer types
+                    deltaSig = SigmoidGrad<T>(activatedInput);
+                }
 
                 // Strip bias rows from weighted errors to match dimensions of deltaSig
                 // weightedErr is a column vector (cx=1, cy=outputSize) which includes bias
@@ -726,12 +770,11 @@ namespace ml {
     template <typename T>
     void Network<T>::updateWeights(T learningRate) {
         /*
-        Update weights using gradient descent:
+        Update weights using the configured optimizer:
         For each layer, compute weight gradients from activations and errors,
-        then update weights: W = W + learningRate * input^T * error
-        The error already includes the gradient of the loss
+        then delegate to optimizer to update weights
         */
-        if (!pInputLayer) return;
+        if (!pInputLayer || !mOptimizer) return;
 
         std::vector<ILayer<T>*> toProcess;
         std::set<ILayer<T>*> visited;
@@ -745,6 +788,7 @@ namespace ml {
             if (!pCurLayer) continue;
 
             // Add siblings to processing queue
+            int siblingIdx = 0;
             for (ILayer<T>* pNextLayer : pCurLayer->getSiblings()) {
                 if (visited.find(pNextLayer) == visited.end()) {
                     toProcess.push_back(pNextLayer);
@@ -767,40 +811,37 @@ namespace ml {
                 for (int b = 0; b < ILayer<T>::GetNumBiasNodes(); ++b)
                     pushBiasCol<T>(activatedWithBias);
 
-                // Compute weight update: delta_W = error^T * activation (outer product)
+                // Compute weight gradients: delta_W = error^T * activation (outer product)
                 // errors is (1, m) row vector, activatedWithBias is (1, n+bias) row vector
                 // weights are (m, n+bias)
                 // Result: delta_W (m, n+bias) where delta_W[i,j] = error[i] * activation[j]
 
-                // Manual outer product since Mult() doesn't handle this case properly
-                ml::Mat<T> weightDelta(weights.size(), 0);
+                ml::Mat<T> gradients(weights.size(), 0);
                 int numOutputs = errors.size().cx;  // m
                 int numInputs = activatedWithBias.size().cx;  // n+bias
 
-                for (int i = 0; i < numOutputs; ++i) {
-                    T err = errors.getAt(0, i);
-                    for (int j = 0; j < numInputs; ++j) {
-                        T act = activatedWithBias.getAt(0, j);
-                        weightDelta.setAt(i, j, err * act);
+                for (int row = 0; row < numOutputs; ++row) {
+                    T err = errors.getAt(0, row);
+                    for (int col = 0; col < numInputs; ++col) {
+                        T act = activatedWithBias.getAt(0, col);
+                        gradients.setAt(row, col, err * act);
                     }
                 }
 
-                // Create updated weights matrix
+                // Create unique key for this layer-sibling pair for optimizer state tracking
+                std::string layerKey = pCurLayer->getName() + "_to_" + pNextLayer->getName() +
+                                       "_" + std::to_string(i) + "_" + std::to_string(siblingIdx);
+
+                // Get modifiable reference to weights
                 ml::Mat<T> updatedWeights = weights.Copy();
-                for (int row = 0; row < updatedWeights.size().cy; ++row) {
-                    for (int col = 0; col < updatedWeights.size().cx; ++col) {
-                        T delta = weightDelta.getAt(row, col);
-                        // Clip delta to prevent exploding gradients
-                        if (std::abs(delta) > 10.0) {
-                            delta = (delta > 0) ? 10.0 : -10.0;
-                        }
-                        T newWeight = updatedWeights.getAt(row, col) + learningRate * delta;
-                        updatedWeights.setAt(row, col, newWeight);
-                    }
-                }
+
+                // Use optimizer to update weights
+                mOptimizer->updateWeights(updatedWeights, gradients, learningRate, layerKey);
 
                 // Store the updated weights back
                 pCurLayer->setWeights(pNextLayer, updatedWeights);
+
+                siblingIdx++;
             }
         }
     }
diff --git a/optimizer.h b/optimizer.h
new file mode 100644
index 0000000..269421d
--- /dev/null
+++ b/optimizer.h
@@ -0,0 +1,294 @@
+#pragma once
+
+#include "Matrix/matrix.h"
+#include <map>
+#include <string>
+#include <cmath>
+
+namespace ml {
+
+    // Optimizer types
+    enum class OptimizerType {
+        SGD,
+        MOMENTUM,
+        ADAM
+    };
+
+    // Convert optimizer type to string for debugging/logging
+    inline const char* OptimizerTypeToString(OptimizerType type) {
+        switch (type) {
+            case OptimizerType::SGD: return "SGD";
+            case OptimizerType::MOMENTUM: return "Momentum";
+            case OptimizerType::ADAM: return "Adam";
+            default: return "Unknown";
+        }
+    }
+
+    // Base optimizer interface
+    template <typename T>
+    class IOptimizer {
+    public:
+        virtual ~IOptimizer() {}
+
+        // Update weights given gradients
+        // layerKey is used to track state for each layer-sibling pair
+        virtual void updateWeights(
+            ml::Mat<T>& weights,
+            const ml::Mat<T>& gradients,
+            T learningRate,
+            const std::string& layerKey
+        ) = 0;
+
+        // Reset optimizer state (useful when starting new training)
+        virtual void reset() = 0;
+
+        // Get optimizer type
+        virtual OptimizerType getType() const = 0;
+    };
+
+    // ========== SGD OPTIMIZER ==========
+    // Stochastic Gradient Descent with optional gradient clipping
+    // Update: W = W + learningRate * gradient
+    template <typename T>
+    class SGDOptimizer : public IOptimizer<T> {
+    public:
+        SGDOptimizer(T gradientClipThreshold = 10.0)
+            : mGradientClipThreshold(gradientClipThreshold) {}
+
+        virtual ~SGDOptimizer() {}
+
+        virtual void updateWeights(
+            ml::Mat<T>& weights,
+            const ml::Mat<T>& gradients,
+            T learningRate,
+            const std::string& layerKey
+        ) override {
+            if (!weights.IsGood() || !gradients.IsGood()) return;
+            if (weights.size() != gradients.size()) return;
+
+            for (int row = 0; row < weights.size().cy; ++row) {
+                for (int col = 0; col < weights.size().cx; ++col) {
+                    T gradient = gradients.getAt(row, col);
+
+                    // Clip gradient to prevent exploding gradients
+                    if (std::abs(gradient) > mGradientClipThreshold) {
+                        gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold;
+                    }
+
+                    T currentWeight = weights.getAt(row, col);
+                    T newWeight = currentWeight + learningRate * gradient;
+                    weights.setAt(row, col, newWeight);
+                }
+            }
+        }
+
+        virtual void reset() override {
+            // SGD has no state to reset
+        }
+
+        virtual OptimizerType getType() const override {
+            return OptimizerType::SGD;
+        }
+
+    private:
+        T mGradientClipThreshold;
+    };
+
+    // ========== MOMENTUM OPTIMIZER ==========
+    // SGD with Momentum: accumulates velocity vector
+    // velocity = β * velocity + gradient
+    // W = W + learningRate * velocity
+    //
+    // Hyperparameters:
+    // - β (beta): momentum coefficient, typically 0.9
+    // - Helps accelerate in relevant direction and dampen oscillations
+    template <typename T>
+    class MomentumOptimizer : public IOptimizer<T> {
+    public:
+        MomentumOptimizer(T beta = 0.9, T gradientClipThreshold = 10.0)
+            : mBeta(beta), mGradientClipThreshold(gradientClipThreshold) {}
+
+        virtual ~MomentumOptimizer() {}
+
+        virtual void updateWeights(
+            ml::Mat<T>& weights,
+            const ml::Mat<T>& gradients,
+            T learningRate,
+            const std::string& layerKey
+        ) override {
+            if (!weights.IsGood() || !gradients.IsGood()) return;
+            if (weights.size() != gradients.size()) return;
+
+            // Initialize velocity for this layer if not exists
+            if (mVelocity.find(layerKey) == mVelocity.end()) {
+                mVelocity[layerKey] = ml::Mat<T>(weights.size(), 0);
+            }
+
+            ml::Mat<T>& velocity = mVelocity[layerKey];
+
+            // Update velocity and weights
+            for (int row = 0; row < weights.size().cy; ++row) {
+                for (int col = 0; col < weights.size().cx; ++col) {
+                    T gradient = gradients.getAt(row, col);
+
+                    // Clip gradient
+                    if (std::abs(gradient) > mGradientClipThreshold) {
+                        gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold;
+                    }
+
+                    // Update velocity: v = β*v + gradient
+                    T v = mBeta * velocity.getAt(row, col) + gradient;
+                    velocity.setAt(row, col, v);
+
+                    // Update weights: W = W + learningRate * v
+                    T currentWeight = weights.getAt(row, col);
+                    T newWeight = currentWeight + learningRate * v;
+                    weights.setAt(row, col, newWeight);
+                }
+            }
+        }
+
+        virtual void reset() override {
+            mVelocity.clear();
+        }
+
+        virtual OptimizerType getType() const override {
+            return OptimizerType::MOMENTUM;
+        }
+
+        T getBeta() const { return mBeta; }
+        void setBeta(T beta) { mBeta = beta; }
+
+    private:
+        T mBeta;
+        T mGradientClipThreshold;
+        std::map<std::string, ml::Mat<T>> mVelocity; // velocity for each layer
+    };
+
+    // ========== ADAM OPTIMIZER ==========
+    // Adaptive Moment Estimation
+    // Combines momentum (first moment) with RMSprop (second moment)
+    //
+    // m = β1 * m + (1 - β1) * gradient        (first moment - mean)
+    // v = β2 * v + (1 - β2) * gradient²       (second moment - variance)
+    // m_hat = m / (1 - β1^t)                  (bias correction)
+    // v_hat = v / (1 - β2^t)                  (bias correction)
+    // W = W + learningRate * m_hat / (√v_hat + ε)
+    //
+    // Hyperparameters:
+    // - β1: typically 0.9 (first moment decay)
+    // - β2: typically 0.999 (second moment decay)
+    // - ε: typically 1e-8 (small constant for numerical stability)
+    template <typename T>
+    class AdamOptimizer : public IOptimizer<T> {
+    public:
+        AdamOptimizer(T beta1 = 0.9, T beta2 = 0.999, T epsilon = 1e-8, T gradientClipThreshold = 10.0)
+            : mBeta1(beta1), mBeta2(beta2), mEpsilon(epsilon),
+              mGradientClipThreshold(gradientClipThreshold) {}
+
+        virtual ~AdamOptimizer() {}
+
+        virtual void updateWeights(
+            ml::Mat<T>& weights,
+            const ml::Mat<T>& gradients,
+            T learningRate,
+            const std::string& layerKey
+        ) override {
+            if (!weights.IsGood() || !gradients.IsGood()) return;
+            if (weights.size() != gradients.size()) return;
+
+            // Initialize moments for this layer if not exists
+            if (mFirstMoment.find(layerKey) == mFirstMoment.end()) {
+                mFirstMoment[layerKey] = ml::Mat<T>(weights.size(), 0);
+                mSecondMoment[layerKey] = ml::Mat<T>(weights.size(), 0);
+                mTimeStep[layerKey] = 0;
+            }
+
+            ml::Mat<T>& m = mFirstMoment[layerKey];
+            ml::Mat<T>& v = mSecondMoment[layerKey];
+            int& t = mTimeStep[layerKey];
+            t++;
+
+            // Compute bias correction terms
+            T beta1_t = std::pow(mBeta1, t);
+            T beta2_t = std::pow(mBeta2, t);
+            T bias1Correction = 1.0 - beta1_t;
+            T bias2Correction = 1.0 - beta2_t;
+
+            // Update moments and weights
+            for (int row = 0; row < weights.size().cy; ++row) {
+                for (int col = 0; col < weights.size().cx; ++col) {
+                    T gradient = gradients.getAt(row, col);
+
+                    // Clip gradient
+                    if (std::abs(gradient) > mGradientClipThreshold) {
+                        gradient = (gradient > 0) ? mGradientClipThreshold : -mGradientClipThreshold;
+                    }
+
+                    // Update biased first moment estimate: m = β1*m + (1-β1)*g
+                    T m_val = mBeta1 * m.getAt(row, col) + (1.0 - mBeta1) * gradient;
+                    m.setAt(row, col, m_val);
+
+                    // Update biased second moment estimate: v = β2*v + (1-β2)*g²
+                    T v_val = mBeta2 * v.getAt(row, col) + (1.0 - mBeta2) * gradient * gradient;
+                    v.setAt(row, col, v_val);
+
+                    // Compute bias-corrected moment estimates
+                    T m_hat = m_val / bias1Correction;
+                    T v_hat = v_val / bias2Correction;
+
+                    // Update weights: W = W + α * m_hat / (√v_hat + ε)
+                    T currentWeight = weights.getAt(row, col);
+                    T update = learningRate * m_hat / (std::sqrt(v_hat) + mEpsilon);
+                    T newWeight = currentWeight + update;
+                    weights.setAt(row, col, newWeight);
+                }
+            }
+        }
+
+        virtual void reset() override {
+            mFirstMoment.clear();
+            mSecondMoment.clear();
+            mTimeStep.clear();
+        }
+
+        virtual OptimizerType getType() const override {
+            return OptimizerType::ADAM;
+        }
+
+        T getBeta1() const { return mBeta1; }
+        T getBeta2() const { return mBeta2; }
+        T getEpsilon() const { return mEpsilon; }
+
+        void setBeta1(T beta1) { mBeta1 = beta1; }
+        void setBeta2(T beta2) { mBeta2 = beta2; }
+        void setEpsilon(T epsilon) { mEpsilon = epsilon; }
+
+    private:
+        T mBeta1;
+        T mBeta2;
+        T mEpsilon;
+        T mGradientClipThreshold;
+
+        std::map<std::string, ml::Mat<T>> mFirstMoment;   // m - first moment (mean)
+        std::map<std::string, ml::Mat<T>> mSecondMoment;  // v - second moment (variance)
+        std::map<std::string, int> mTimeStep;             // t - timestep for bias correction
+    };
+
+    // ========== OPTIMIZER FACTORY ==========
+    // Helper function to create optimizers
+    template <typename T>
+    IOptimizer<T>* CreateOptimizer(OptimizerType type) {
+        switch (type) {
+            case OptimizerType::SGD:
+                return new SGDOptimizer<T>();
+            case OptimizerType::MOMENTUM:
+                return new MomentumOptimizer<T>();
+            case OptimizerType::ADAM:
+                return new AdamOptimizer<T>();
+            default:
+                return new SGDOptimizer<T>(); // Default fallback
+        }
+    }
+
+} // namespace ml
diff --git a/test_activations_optimizers b/test_activations_optimizers
new file mode 100755
index 0000000..9271556
Binary files /dev/null and b/test_activations_optimizers differ
diff --git a/test_activations_optimizers.cpp b/test_activations_optimizers.cpp
new file mode 100644
index 0000000..5c4a5a6
--- /dev/null
+++ b/test_activations_optimizers.cpp
@@ -0,0 +1,349 @@
+#include <iostream>
+#include <cassert>
+#include <cmath>
+#include "network.h"
+
+using namespace ml;
+using namespace std;
+
+// Helper function to check if two values are approximately equal
+template<typename T>
+bool approxEqual(T a, T b, T epsilon = 1e-5) {
+    return std::abs(a - b) < epsilon;
+}
+
+// Test activation functions
+void testActivationFunctions() {
+    cout << "\n=== Testing Activation Functions ===" << endl;
+
+    // Test Sigmoid
+    {
+        Mat<double> input(1, 3, 0);
+        input.setAt(0, 0, -1.0);
+        input.setAt(0, 1, 0.0);
+        input.setAt(0, 2, 1.0);
+
+        Mat<double> output = Sigmoid(input);
+        assert(approxEqual(output.getAt(0, 0), 0.2689414, 1e-5));
+        assert(approxEqual(output.getAt(0, 1), 0.5, 1e-5));
+        assert(approxEqual(output.getAt(0, 2), 0.7310586, 1e-5));
+
+        Mat<double> grad = SigmoidGrad(output);
+        assert(approxEqual(grad.getAt(0, 0), 0.1966119, 1e-5));
+        assert(approxEqual(grad.getAt(0, 1), 0.25, 1e-5));
+        assert(approxEqual(grad.getAt(0, 2), 0.1966119, 1e-5));
+
+        cout << "✓ Sigmoid activation and gradient test passed" << endl;
+    }
+
+    // Test ReLU
+    {
+        Mat<double> input(1, 4, 0);
+        input.setAt(0, 0, -2.0);
+        input.setAt(0, 1, -0.5);
+        input.setAt(0, 2, 0.0);
+        input.setAt(0, 3, 1.5);
+
+        Mat<double> output = ReLU(input);
+        assert(approxEqual(output.getAt(0, 0), 0.0));
+        assert(approxEqual(output.getAt(0, 1), 0.0));
+        assert(approxEqual(output.getAt(0, 2), 0.0));
+        assert(approxEqual(output.getAt(0, 3), 1.5));
+
+        Mat<double> grad = ReLUGrad(output);
+        assert(approxEqual(grad.getAt(0, 0), 0.0));
+        assert(approxEqual(grad.getAt(0, 1), 0.0));
+        assert(approxEqual(grad.getAt(0, 2), 0.0));
+        assert(approxEqual(grad.getAt(0, 3), 1.0));
+
+        cout << "✓ ReLU activation and gradient test passed" << endl;
+    }
+
+    // Test Leaky ReLU
+    {
+        Mat<double> input(1, 3, 0);
+        input.setAt(0, 0, -1.0);
+        input.setAt(0, 1, 0.0);
+        input.setAt(0, 2, 1.0);
+
+        Mat<double> output = LeakyReLU(input, 0.01);
+        assert(approxEqual(output.getAt(0, 0), -0.01));
+        assert(approxEqual(output.getAt(0, 1), 0.0));
+        assert(approxEqual(output.getAt(0, 2), 1.0));
+
+        Mat<double> grad = LeakyReLUGrad(output, 0.01);
+        assert(approxEqual(grad.getAt(0, 0), 0.01));
+        assert(approxEqual(grad.getAt(0, 1), 0.01));  // gradient is alpha at x=0
+        assert(approxEqual(grad.getAt(0, 2), 1.0));
+
+        cout << "✓ Leaky ReLU activation and gradient test passed" << endl;
+    }
+
+    // Test Tanh
+    {
+        Mat<double> input(1, 3, 0);
+        input.setAt(0, 0, -1.0);
+        input.setAt(0, 1, 0.0);
+        input.setAt(0, 2, 1.0);
+
+        Mat<double> output = Tanh(input);
+        assert(approxEqual(output.getAt(0, 0), -0.7615942, 1e-5));
+        assert(approxEqual(output.getAt(0, 1), 0.0));
+        assert(approxEqual(output.getAt(0, 2), 0.7615942, 1e-5));
+
+        Mat<double> grad = TanhGrad(output);
+        assert(approxEqual(grad.getAt(0, 0), 0.4199743, 1e-5));
+        assert(approxEqual(grad.getAt(0, 1), 1.0));
+        assert(approxEqual(grad.getAt(0, 2), 0.4199743, 1e-5));
+
+        cout << "✓ Tanh activation and gradient test passed" << endl;
+    }
+
+    // Test Softmax
+    {
+        Mat<double> input(1, 3, 0);
+        input.setAt(0, 0, 1.0);
+        input.setAt(0, 1, 2.0);
+        input.setAt(0, 2, 3.0);
+
+        Mat<double> output = Softmax(input);
+        double sum = output.getAt(0, 0) + output.getAt(0, 1) + output.getAt(0, 2);
+        assert(approxEqual(sum, 1.0, 1e-5));
+        assert(approxEqual(output.getAt(0, 0), 0.0900306, 1e-5));
+        assert(approxEqual(output.getAt(0, 1), 0.2447285, 1e-5));
+        assert(approxEqual(output.getAt(0, 2), 0.6652409, 1e-5));
+
+        cout << "✓ Softmax activation test passed" << endl;
+    }
+
+    // Test Linear
+    {
+        Mat<double> input(1, 3, 0);
+        input.setAt(0, 0, -1.5);
+        input.setAt(0, 1, 0.0);
+        input.setAt(0, 2, 2.5);
+
+        Mat<double> output = Linear(input);
+        assert(approxEqual(output.getAt(0, 0), -1.5));
+        assert(approxEqual(output.getAt(0, 1), 0.0));
+        assert(approxEqual(output.getAt(0, 2), 2.5));
+
+        Mat<double> grad = LinearGrad(output);
+        assert(approxEqual(grad.getAt(0, 0), 1.0));
+        assert(approxEqual(grad.getAt(0, 1), 1.0));
+        assert(approxEqual(grad.getAt(0, 2), 1.0));
+
+        cout << "✓ Linear activation and gradient test passed" << endl;
+    }
+
+    // Test unified interface
+    {
+        Mat<double> input(1, 2, 0);
+        input.setAt(0, 0, -1.0);
+        input.setAt(0, 1, 1.0);
+
+        Mat<double> sigmoidOut = Activate(input, ActivationType::SIGMOID);
+        Mat<double> reluOut = Activate(input, ActivationType::RELU);
+        Mat<double> tanhOut = Activate(input, ActivationType::TANH);
+
+        assert(sigmoidOut.getAt(0, 0) > 0.0 && sigmoidOut.getAt(0, 0) < 1.0);
+        assert(reluOut.getAt(0, 0) == 0.0 && reluOut.getAt(0, 1) == 1.0);
+        assert(tanhOut.getAt(0, 0) < 0.0 && tanhOut.getAt(0, 1) > 0.0);
+
+        cout << "✓ Unified activation interface test passed" << endl;
+    }
+
+    cout << "\n✅ All activation function tests passed!\n" << endl;
+}
+
+// Test optimizers
+void testOptimizers() {
+    cout << "\n=== Testing Optimizers ===" << endl;
+
+    // Test SGD
+    {
+        SGDOptimizer<double> sgd;
+        Mat<double> weights(2, 2, 0);
+        weights.setAt(0, 0, 1.0);
+        weights.setAt(0, 1, 2.0);
+        weights.setAt(1, 0, 3.0);
+        weights.setAt(1, 1, 4.0);
+
+        Mat<double> gradients(2, 2, 0);
+        gradients.setAt(0, 0, 0.1);
+        gradients.setAt(0, 1, 0.2);
+        gradients.setAt(1, 0, 0.3);
+        gradients.setAt(1, 1, 0.4);
+
+        sgd.updateWeights(weights, gradients, 0.1, "test_layer");
+
+        // SGD: w = w + lr * grad
+        assert(approxEqual(weights.getAt(0, 0), 1.01));
+        assert(approxEqual(weights.getAt(0, 1), 2.02));
+        assert(approxEqual(weights.getAt(1, 0), 3.03));
+        assert(approxEqual(weights.getAt(1, 1), 4.04));
+
+        cout << "✓ SGD optimizer test passed" << endl;
+    }
+
+    // Test Momentum
+    {
+        MomentumOptimizer<double> momentum(0.9);
+        Mat<double> weights(2, 2, 0);
+        weights.setAt(0, 0, 1.0);
+        weights.setAt(0, 1, 2.0);
+        weights.setAt(1, 0, 3.0);
+        weights.setAt(1, 1, 4.0);
+
+        Mat<double> gradients(2, 2, 0);
+        gradients.setAt(0, 0, 0.1);
+        gradients.setAt(0, 1, 0.2);
+        gradients.setAt(1, 0, 0.3);
+        gradients.setAt(1, 1, 0.4);
+
+        // First update: v = 0*0.9 + grad = grad, w = w + lr * v
+        momentum.updateWeights(weights, gradients, 0.1, "test_layer");
+        assert(approxEqual(weights.getAt(0, 0), 1.01));
+        assert(approxEqual(weights.getAt(0, 1), 2.02));
+
+        // Second update: v = prev_v*0.9 + grad, w = w + lr * v
+        momentum.updateWeights(weights, gradients, 0.1, "test_layer");
+        // v[0,0] = 0.1*0.9 + 0.1 = 0.19, w[0,0] = 1.01 + 0.1*0.19 = 1.029
+        assert(approxEqual(weights.getAt(0, 0), 1.029, 1e-5));
+
+        cout << "✓ Momentum optimizer test passed" << endl;
+    }
+
+    // Test Adam
+    {
+        AdamOptimizer<double> adam(0.9, 0.999, 1e-8);
+        Mat<double> weights(2, 2, 0);
+        weights.setAt(0, 0, 1.0);
+        weights.setAt(0, 1, 2.0);
+        weights.setAt(1, 0, 3.0);
+        weights.setAt(1, 1, 4.0);
+
+        Mat<double> gradients(2, 2, 0);
+        gradients.setAt(0, 0, 0.1);
+        gradients.setAt(0, 1, 0.2);
+        gradients.setAt(1, 0, 0.3);
+        gradients.setAt(1, 1, 0.4);
+
+        double w00_before = weights.getAt(0, 0);
+        adam.updateWeights(weights, gradients, 0.01, "test_layer");
+        double w00_after = weights.getAt(0, 0);
+
+        // Adam should update weights (exact value depends on bias correction)
+        assert(w00_after > w00_before);
+
+        // Second update should also work
+        adam.updateWeights(weights, gradients, 0.01, "test_layer");
+        assert(weights.getAt(0, 0) > w00_after);
+
+        cout << "✓ Adam optimizer test passed" << endl;
+    }
+
+    // Test optimizer reset
+    {
+        MomentumOptimizer<double> momentum(0.9);
+        Mat<double> weights(2, 2, 1.0);
+        Mat<double> gradients(2, 2, 0.1);
+
+        momentum.updateWeights(weights, gradients, 0.1, "test_layer");
+        double w1 = weights.getAt(0, 0);
+
+        momentum.updateWeights(weights, gradients, 0.1, "test_layer");
+        double w2 = weights.getAt(0, 0);
+
+        // Reset and update again
+        momentum.reset();
+        weights.setAt(0, 0, 1.0); // Reset weight
+        momentum.updateWeights(weights, gradients, 0.1, "test_layer");
+        double w3 = weights.getAt(0, 0);
+
+        // After reset, first update should be same as initial first update
+        assert(approxEqual(w1, w3));
+
+        cout << "✓ Optimizer reset test passed" << endl;
+    }
+
+    cout << "\n✅ All optimizer tests passed!\n" << endl;
+}
+
+// Test network with different activations
+void testNetworkWithActivations() {
+    cout << "\n=== Testing Network with Different Activations ===" << endl;
+
+    // Create a simple network with ReLU activation
+    Layer<double>* input = new Layer<double>(2, "input", ActivationType::RELU);
+    Layer<double>* hidden = new Layer<double>(3, "hidden", ActivationType::RELU);
+    Layer<double>* output = new Layer<double>(1, "output", ActivationType::SIGMOID);
+
+    Network<double> net;
+    net.connect(input, hidden);
+    net.connect(hidden, output);
+    net.setInputLayer(input);
+    net.setOutputLayer(output);
+    net.init();
+
+    // Test feed forward
+    Mat<double> testInput(1, 2, 0);
+    testInput.setAt(0, 0, 0.5);
+    testInput.setAt(0, 1, 0.3);
+
+    Mat<double> result = net.feed(testInput);
+    assert(result.IsGood());
+    assert(result.size().cx == 1);  // Output should be 1D
+
+    cout << "✓ Network with ReLU activation test passed" << endl;
+
+    // Test with Adam optimizer
+    net.setOptimizerType(OptimizerType::ADAM);
+    assert(net.getOptimizer()->getType() == OptimizerType::ADAM);
+
+    Mat<double> target(1, 1, 0);
+    target.setAt(0, 0, 1.0);
+
+    result = net.feed(testInput);
+    output->setErrors(ml::Diff(target, result));
+    net.backprop();
+    net.updateWeights(0.01);
+
+    cout << "✓ Network with Adam optimizer test passed" << endl;
+
+    // Test with Momentum optimizer
+    net.setOptimizerType(OptimizerType::MOMENTUM);
+    assert(net.getOptimizer()->getType() == OptimizerType::MOMENTUM);
+
+    result = net.feed(testInput);
+    output->setErrors(ml::Diff(target, result));
+    net.backprop();
+    net.updateWeights(0.01);
+
+    cout << "✓ Network with Momentum optimizer test passed" << endl;
+
+    cout << "\n✅ All network integration tests passed!\n" << endl;
+}
+
+int main() {
+    cout << "\n========================================" << endl;
+    cout << "Testing Activation Functions & Optimizers" << endl;
+    cout << "========================================" << endl;
+
+    try {
+        testActivationFunctions();
+        testOptimizers();
+        testNetworkWithActivations();
+
+        cout << "\n========================================" << endl;
+        cout << "🎉 ALL TESTS PASSED! 🎉" << endl;
+        cout << "========================================\n" << endl;
+        return 0;
+    } catch (const std::exception& e) {
+        cout << "\n❌ TEST FAILED: " << e.what() << endl;
+        return 1;
+    } catch (...) {
+        cout << "\n❌ TEST FAILED: Unknown error" << endl;
+        return 1;
+    }
+}
diff --git a/utility.h b/utility.h
index ece2826..c2219a3 100644
--- a/utility.h
+++ b/utility.h
@@ -7,6 +7,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <stack>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
 
 #ifdef  _DEBUG
 #define DEBUG(t, message) 		(Utility::Debug(t, message))