InvalidArgumentError(): Incompatible shapes: [4,784] vs [784,4] [Op:Mul]

I am implementing some new ideas, but when I run the code it gives me the error:

InvalidArgumentError(): Incompatible shapes: [4,784] vs [784,4] [Op:Mul]
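As far as I understand, Op:Mul is TensorFlow's elementwise multiplication, which requires the two operand shapes to match (or be broadcastable), and [4,784] and [784,4] are neither. The same failure reproduces in isolation:

import tensorflow as tf

a = tf.ones([4, 784])
b = tf.ones([784, 4])
c = a * b  # InvalidArgumentError: Incompatible shapes: [4,784] vs. [784,4] [Op:Mul]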

The code:

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from tensorflow.python.ops.numpy_ops import np_config

np_config.enable_numpy_behavior()  # enables numpy-style attributes such as .T on tensors

### MNIST dataset
def load_dataset():
    # load in the dataset
    (x_train, y_train), (x_test, y_test) = datasets.mnist.load_data()
    # reshape dataset into a single channel
    x_train = x_train.reshape((x_train.shape[0], 28, 28, 1))
    x_test = x_test.reshape((x_test.shape[0], 28, 28, 1))
    # one hot encode target values
    y_train = tf.keras.utils.to_categorical(y_train)
    y_test = tf.keras.utils.to_categorical(y_test)
    return x_train, y_train, x_test, y_test


X_train, y_train, X_test, y_test = load_dataset()


def normalize(X):
    X = X.astype("float32")
    X = X / 255.0
    return X


### shuffle the data and sample 1024 images
n = 1024
X_train, y_train = shuffle(X_train, y_train)
X_sample = X_train[:n]
# y_train is already one-hot encoded in load_dataset, so slice it directly
y_sample = y_train[:n]

x = tf.constant(tf.reshape(normalize(X_sample), (n, 784)))
y = tf.constant(y_sample)

## Initialize the weights by adding noise to them: weight = A + B*omega
A1_norm = tf.random.normal(shape=[784 * 4], stddev=1e-2)
A2_norm = tf.random.normal(shape=[4 * 4], stddev=1e-2)
A3_norm = tf.random.normal(shape=[4 * 4], stddev=1e-2)
A4_norm = tf.random.normal(shape=[4 * 10], stddev=1e-2)

# set them as variables so we can keep track of their gradients
A1 = tf.Variable(initial_value=tf.reshape(A1_norm, (784, 4)), trainable=True)
B1 = tf.Variable(initial_value=tf.ones([784, 4]), trainable=True)

A2 = tf.Variable(initial_value=tf.reshape(A2_norm, (4, 4)), trainable=True)
B2 = tf.Variable(initial_value=tf.ones([4, 4]), trainable=True)

A3 = tf.Variable(initial_value=tf.reshape(A3_norm, (4, 4)), trainable=True)
B3 = tf.Variable(initial_value=tf.ones([4, 4]), trainable=True)

A4 = tf.Variable(
    initial_value=tf.reshape(A4_norm, (4, 10)), trainable=True
)  # making it trainable so we can calculate the gradient
B4 = tf.Variable(initial_value=tf.ones([4, 10]), trainable=True)

# initialize the covariance P0 and the zero weight states
# (P0 must be 784 x 784 to match the input jacobian below, and float32 to match the rest)
P0 = tf.constant(np.diag(np.repeat(1e-4, 784)), dtype=tf.float32)
W1 = tf.constant(A1.numpy() * 0.0, dtype=tf.float32)
W2 = tf.constant(A2.numpy() * 0.0, dtype=tf.float32)
W3 = tf.constant(A3.numpy() * 0.0, dtype=tf.float32)
W4 = tf.constant(A4.numpy() * 0.0, dtype=tf.float32)
V = tf.constant(A4.numpy() * 0.0, dtype=tf.float32)


def f1(x, A1, W1, B1):
    return tf.sigmoid(tf.matmul(x, (A1 + W1.T * B1)))


def f2(x, A2, W2, B2):
    return tf.sigmoid(tf.matmul(x, (A2 + W2.T * B2)))


def f3(x, A3, W3, B3):
    return tf.sigmoid(tf.matmul(x, (A3 + W3.T * B3)))


def obs(x, V, A4, B4):
    return tf.nn.softmax(tf.matmul(x, A4 + V.T * B4))
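
# shape check (added for clarity): the operands that f1 multiplies
# elementwise do not line up, since W1.T is (4, 784) while B1 is (784, 4)
print(A1.shape, W1.T.shape, B1.shape)  # (784, 4) (4, 784) (784, 4)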


# set a learning rate
learning_rate = 0.1

# the forward propagation
# https://www.tensorflow.org/guide/autodiff
def forward(x, y, W1, A1, B1, W2, A2, B2, W3, A3, B3, V, A4, B4):
    with tf.GradientTape(persistent=True) as g:  # persistent, since we take more than one jacobian from this tape
        g.watch(W1)  # watch the constant weight so the tape records it
        g.watch(x)  # watch the input as well
        h1 = f1(x, A1, W1, B1)  # hidden layer 1 of the forward propagation

        dh1dx = g.batch_jacobian(h1, x)
        dh1dW = tf.reshape(
            g.jacobian(h1, W1), shape=[n, h1.shape[1], -1]
        )  # one flattened parameter-gradient slice per sample
        P1 = tf.matmul(tf.matmul(dh1dx, P0), dh1dx, transpose_b=True) + tf.matmul(
            dh1dW, dh1dW, transpose_b=True
        )  # Q=I

    with tf.GradientTape(persistent=True) as g:
        g.watch(W2)
        g.watch(h1)
        h2 = f2(h1, A2, W2, B2)  # mu.predict

        dh2dh1 = g.batch_jacobian(h2, h1)
        dh2dW = tf.reshape(
            g.jacobian(h2, W2), shape=[n, h2.shape[1], -1]
        )  # batch size = 1024, output size = 4, number of parameters = 4 x 4
        P2 = tf.matmul(tf.matmul(dh2dh1, P1), dh2dh1, transpose_b=True) + tf.matmul(
            dh2dW, dh2dW, transpose_b=True
        )  # Q=I

    with tf.GradientTape(persistent=True) as g:
        g.watch(W3)
        g.watch(h2)
        h3 = f3(h2, A3, W3, B3)
        dh3dh2 = g.batch_jacobian(h3, h2)  # jacobian of h3 with respect to h2
        dh3dW = tf.reshape(
            g.jacobian(h3, W3), shape=[n, h3.shape[1], -1]
        )  # batch size = 1024, output size = 4, number of parameters = 4 x 4
        P3 = tf.matmul(tf.matmul(dh3dh2, P2), dh3dh2, transpose_b=True) + tf.matmul(
            dh3dW, dh3dW, transpose_b=True
        )  # Q=I

    with tf.GradientTape(persistent=True) as g:
        g.watch(V)
        g.watch(h3)
        y3 = obs(h3, V, A4, B4)

        dobsdh3 = g.batch_jacobian(y3, h3)
        dobsdV = tf.reshape(
            g.jacobian(y3, V), shape=[n, y3.shape[1], -1]
        )
        S3 = (
            tf.matmul(tf.matmul(dobsdh3, P3), dobsdh3, transpose_b=True)
            + tf.matmul(dobsdV, dobsdV, transpose_b=True)
            + tf.constant(np.diag(np.repeat(1e-6, 10)), dtype=tf.float32)
        )  # Q=I; the small diagonal keeps S3 well conditioned

    # Categorical.log_prob expects integer class labels, so convert the
    # one-hot targets back to indices
    log_prob = tfp.distributions.Categorical(probs=y3).log_prob(tf.argmax(y, axis=1))

    return log_prob, h1, P1, h2, P2, h3, P3, y3, S3


log_prob, h1, P1, h2, P2, h3, P3, y3, S3 = forward(
    x, y, W1, A1, B1, W2, A2, B2, W3, A3, B3, V, A4, B4
)


def train(x, y, W1, A1, B1, W2, A2, B2, W3, A3, B3, V, A4, B4, learning_rate):
    with tf.GradientTape() as g2:
        log_prob, h1, P1, h2, P2, h3, P3, y3, S3 = forward(
            x, y, W1, A1, B1, W2, A2, B2, W3, A3, B3, V, A4, B4
        )
        neg_log_prob = -tf.reduce_mean(log_prob)
    print(neg_log_prob.numpy())
    # gradient takes a list of variables and returns the gradients in the same order
    dloss = g2.gradient(neg_log_prob, [A1, B1, A2, B2, A3, B3, A4, B4])
    A1.assign_sub(learning_rate * dloss[0])
    B1.assign_sub(learning_rate * dloss[1])
    A2.assign_sub(learning_rate * dloss[2])
    B2.assign_sub(learning_rate * dloss[3])
    A3.assign_sub(learning_rate * dloss[4])
    B3.assign_sub(learning_rate * dloss[5])
    A4.assign_sub(learning_rate * dloss[6])
    B4.assign_sub(learning_rate * dloss[7])


train(x, y, W1, A1, B1, W2, A2, B2, W3, A3, B3, V, A4, B4, learning_rate)

Error:

Traceback (most recent call last):
  File "/home/southern/Documents/Kalman_Project/Kalman_filter.py", line 162, in <module>
    log_prob, h1, P1, h2, P2, h3, P3, y3, S3 = forward(
  File "/home/southern/Documents/Kalman_Project/Kalman_filter.py", line 101, in forward
    h1 = f1(
  File "/home/southern/Documents/Kalman_Project/Kalman_filter.py", line 77, in f1
    return tf.sigmoid(tf.matmul(x, (A1 + W1.T * B1)))
  File "/home/southern/.local/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/southern/.local/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 7186, in raise_from_not_ok_status
    raise core._status_to_exception(e) from None  # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [4,784] vs. [784,4] [Op:Mul]
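The traceback points at f1: W1.T has shape (4, 784) while B1 has shape (784, 4), so the elementwise product W1.T * B1 is exactly the Op:Mul that fails. W1, A1 and B1 are all created with the same (784, 4) shape (and the later layers follow the same pattern), so one plausible fix, assuming the intent is the elementwise perturbation weight = A + B*omega from the comment above, is to drop the transpose:

def f1(x, A1, W1, B1):
    # A1, W1 and B1 all have shape (784, 4); the elementwise product
    # and the addition now line up, and matmul maps the (n, 784)
    # input to an (n, 4) activation
    return tf.sigmoid(tf.matmul(x, A1 + W1 * B1))

The same change would apply to f2, f3 and obs, which use the identical W.T * B pattern.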


