My neural network works well only with no hidden layers

You may think I am crazy, but I decided to write a neural network from scratch in C# for learning purposes. Please be patient, I still have little experience. English is not my first language, so I apologize for that in advance.

I started with a program for handwritten digit recognition using the MNIST database. I read a book about the underlying algorithms and wrote the following code.

using System;
using System.Collections.Generic;
using System.Linq;
using MathNet.Numerics.LinearAlgebra;

public class NeuralNetwork
{
    private List<Matrix<double>> weights = new List<Matrix<double>>();
    private List<Vector<double>> biases = new List<Vector<double>>();
    private Random random = new Random();
    private List<Image> test_data;
    private int[] layer_sizes;

    public NeuralNetwork(params int[] layers)
    {
        layer_sizes = layers;
        for (int i = 0; i < layers.Length - 1; i++)
        {
            var weightLayer = Matrix<double>.Build.Dense(layers[i + 1], layers[i], (k, j) => random.NextDouble());
            weights.Add(weightLayer);
        }
        for (int i = 1; i < layers.Length; i++)
        {
            var biasesLayer = Vector<double>.Build.Dense(layers[i], (k) => random.NextDouble());
            biases.Add(biasesLayer);
        }
    }

    public Vector<double> FeedForward(Vector<double> a)
    {
        for (int i = 0; i < weights.Count; i++)
        {
            a = Sigmoid(weights[i].Multiply(a) + biases[i]);
        }
        return Sigmoid(a);
    }

    public void SGD(ITrainingDataProvider dataProvider, int epochs, int chunk_size, double eta)
    {
        test_data = new MnistReader().ReadTestData();
        Console.WriteLine("SGD algorithm started");
        var training_data = dataProvider.ReadTrainingData();
        Console.WriteLine("Training data has beeen read");
        Console.WriteLine($"Training data test: {Test(training_data)}%");
        Console.WriteLine($"Test data test: {Test(test_data)}%");
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            training_data = training_data.OrderBy(item => random.Next()).ToList();
            List<List<Image>> chunks = training_data.ChunkBy(chunk_size);
            foreach (List<Image> chunk in chunks)
            {
                ProcessChunk(chunk, eta);
            }
            Console.WriteLine($"Epoch: {epoch + 1}/{epochs}");
            Console.WriteLine($"Training data test: {Test(training_data)}%");
            Console.WriteLine($"Test data test: {Test(test_data)}%");
        }
        Console.WriteLine("Done!");
    }

    private double Test(List<Image> data)
    {
        int count = 0;
        foreach (Image im in data)
        {
            var output = FeedForward(im.DataToVector());
            int number = output.MaximumIndex();
            if (number == (int)im.Label)
            {
                count++;
            }
        }
        return (double)count / data.Count * 100;
    }

    private void ProcessChunk(List<Image> chunk, double eta)
    {
        Delta[] deltas = new Delta[chunk.Count];
        for (int i = 0; i < chunk.Count; i++)
        {
            Image image = chunk[i];
            var input = image.DataToVector();
            var desired_output = Vector<double>.Build.Dense(layer_sizes[layer_sizes.Length - 1]);
            desired_output[(int)image.Label] = 1;
            Delta delta = BackPropagation(input, desired_output);
            deltas[i] = delta;
        }

        Delta sum = deltas[0];
        for (int i = 1; i < deltas.Length; i++)
        {
            sum += deltas[i];
        }
        Delta average_delta = sum / deltas.Length;

        for (int i = 0; i < layer_sizes.Length - 1; i++)
        {
            weights[i] += average_delta.d_weights[i].Multiply(eta);
            biases[i] += average_delta.d_biases[i].Multiply(eta);
        }
    }

    private Delta BackPropagation(Vector<double> input, Vector<double> desired_output)
    {
        List<Vector<double>> activations = new List<Vector<double>>();
        List<Vector<double>> zs = new List<Vector<double>>();

        Vector<double> a = input;
        activations.Add(input);
        for (int i = 0; i < layer_sizes.Length - 1; i++)
        {
            var z = weights[i].Multiply(a) + biases[i];
            zs.Add(z);
            a = Sigmoid(z);
            activations.Add(a);
        }

        List<Vector<double>> errors = new List<Vector<double>>();
        List<Matrix<double>> delta_weights = new List<Matrix<double>>();
        List<Vector<double>> delta_biases = new List<Vector<double>>();

        var error = CDerivative(activations[activations.Count - 1], desired_output).HProd(SigmoidDerivative(zs[^1]));
        errors.Add(error);

        int steps = 0;
        for (int i = layer_sizes.Length - 2; i >= 1; i--)
        {
            var layer_error = weights[i].Transpose().Multiply(errors[steps]).HProd(SigmoidDerivative(zs[i - 1]));
            errors.Add(layer_error);
            steps++;
        }
        errors.Reverse();

        for (int i = layer_sizes.Length - 1; i >= 1; i--)
        {
            var delta_layer_weights = (errors[i - 1].ToColumnMatrix() * activations[i - 1].ToRowMatrix()).Multiply(-1);
            delta_weights.Add(delta_layer_weights);
            var delta_layer_biases = errors[i - 1].Multiply(-1);
            delta_biases.Add(delta_layer_biases);
        }
        delta_biases.Reverse();
        delta_weights.Reverse();

        return new Delta { d_weights = delta_weights, d_biases = delta_biases };
    }

    private Vector<double> CDerivative(Vector<double> x, Vector<double> y)
    {
        return x - y;
    }

    private Vector<double> Sigmoid(Vector<double> x)
    {
        for (int i = 0; i < x.Count; i++)
        {
            x[i] = 1.0 / (1.0 + Math.Exp(-x[i]));
        }
        return x;
    }

    private Vector<double> SigmoidDerivative(Vector<double> x)
    {
        for (int i = 0; i < x.Count; i++)
        {
            x[i] = Math.Exp(-x[i]) / Math.Pow(1.0 + Math.Exp(-x[i]), 2);
        }
        return x;
    }
}
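
ChunkBy is just a small helper extension method that splits a list into mini-batches of a given size; it is not part of the standard library. A minimal sketch of an equivalent implementation (the class name ListExtensions is only for this sketch, my actual helper may differ in details):

using System.Collections.Generic;
using System.Linq;

public static class ListExtensions
{
    // Splits a list into consecutive chunks of at most chunkSize elements.
    public static List<List<T>> ChunkBy<T>(this List<T> source, int chunkSize)
    {
        return source
            .Select((item, index) => new { item, index })
            .GroupBy(x => x.index / chunkSize)
            .Select(g => g.Select(x => x.item).ToList())
            .ToList();
    }
}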

The Delta class is a simple DTO that stores the weight and bias changes in a single object.

public class Delta
{
    public List<Matrix<double>> d_weights { get; set; }
    public List<Vector<double>> d_biases { get; set; }


    public static Delta operator +(Delta d1, Delta d2)
    {
        Delta result = d1;
        for (int i = 0; i < d2.d_weights.Count; i++)
        {
            result.d_weights[i] += d2.d_weights[i];
        }
        for (int i = 0; i < d2.d_biases.Count; i++)
        {
            result.d_biases[i] += d2.d_biases[i];
        }
        return result;
    }

    public static Delta operator /(Delta d1, double d)
    {
        Delta result = d1;
        for (int i = 0; i < d1.d_weights.Count; i++)
        {
            result.d_weights[i] /= d;
        }
        for (int i = 0; i < d1.d_biases.Count; i++)
        {
            result.d_biases[i] /= d;
        }
        return result;
    }
}
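
For completeness, this is roughly how I construct and train the network (a minimal sketch: the layer sizes and hyper-parameters below are just example values, and it assumes MnistReader implements ITrainingDataProvider):

// Example only: 784 input pixels, one hidden layer of 30 neurons, 10 output classes.
var network = new NeuralNetwork(784, 30, 10);

// Example hyper-parameters; MnistReader is assumed here to implement ITrainingDataProvider.
network.SGD(new MnistReader(), epochs: 30, chunk_size: 10, eta: 3.0);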

In the end everything runs, but networks with one or more hidden layers don't show any significant results. They reach at best about 70% accuracy, and then the learning curve drops: the accuracy falls back to 20-30%. Normally the accuracy curve looks like a square-root function, but in my case it looks more like an upside-down parabola.

[Graph: accuracy over epochs for my runs with different numbers of neurons in the first hidden layer]

After a few tries, I found out that without any hidden layers the algorithm works just fine: it learns up to about 90% accuracy and the curve never drops. Apparently, the bug is somewhere in the back-propagation algorithm. It causes no problems with only input and output layers, but it does as soon as I add a hidden layer.
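
For reference, these are the textbook back-propagation equations the code is meant to follow (for a quadratic cost, which matches my CDerivative); I list them only to make comparing against the code easier. Here $\delta^l$ is the error of layer $l$ and $\odot$ the element-wise product:

$$
\begin{aligned}
\delta^{L} &= (a^{L} - y) \odot \sigma'(z^{L}) \\
\delta^{l} &= \big((W^{l+1})^{\top}\,\delta^{l+1}\big) \odot \sigma'(z^{l}) \\
\partial C / \partial b^{l} &= \delta^{l} \\
\partial C / \partial W^{l} &= \delta^{l}\,(a^{l-1})^{\top}
\end{aligned}
$$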

I have been trying to find the problem for a long time, and I hope that someone smarter than me will be able to help.

Thanks in advance!


