'How to access dynamically allocated array in CUDA

Here I'm trying to access a dynamically allocated array in CUDA. However, after running the output is c[0][0] = 0. Am I accessing the allocated array correctly? I think the way I'm copying the arrays is probably correct and for some reason, the value of C has not been changed on the device.

#include<iostream>
using namespace std;

__global__ void add_matrix(float *A, float *B, float *C, int n) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i < n) && (j < n)){
        C[i*n+j] = A[i*n+j] + B[i*n+j];
    }
}

int main(){
    const size_t N = 1024;
    const size_t size = N * N * sizeof(float);
    float *A, *B, *C;
    A = (float*) malloc(size);
    B = (float*) malloc(size);
    C = (float*) malloc(size);
    for (size_t i=0; i<N*N; i++){
        A[i] = 5.0;
        B[i] = 6.0;
    }
    float *A_d, *B_d, *C_d;
    cudaMalloc((void**)&A_d, size);
    cudaMalloc((void**)&B_d, size);
    cudaMalloc((void**)&C_d, size);
    auto code = cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying A to device" << endl;
    }
    code = cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice);
    if (code != cudaSuccess){
        cout << "Error copying B to device" << endl;
    }

    dim3 threads(N, N);
    dim3 blocks(1,1);
    add_matrix<<<blocks, threads>>>(A_d, B_d, C_d, N);
    code = cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost);
    if (code != cudaSuccess){
        cout << "Error copying C from device" << endl;
    }

    std::cout << "C[0][0] : " << C[0] << std::endl;

    free(A); free(B); free(C);
    cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
    return 0;
}


Solution 1:[1]

The problem was arranging the blocks. I totally forgot each block can have a limited number of threads. we can obtain the maximum threads per block by getting maxThreadsPerBlock property using cudaDeviceGetAttribute. It seems the Colab GPU supports 1024 threads in each block. so I changed the arrangement this way:

dim3 threads(32,32);
dim3 blocks(32,32);

And it worked

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Amirabbas asadi