'Cuda kernel not running on if statements c++

I have a small program that has two arrays of vectors (pos, vel) representing particles floating in space, and I'm using the GPU to update them and will soon run the GPU to render them as well. The kernel adding the vectors together runs as it is supposed to and updates the particles, however running the kernel that does a simple check to see if the particle is on screen or not, and wraps around the edge accordingly, simply doesn't change the particle's position even if the conditions are met. Not going to include the screen drawing since it mostly clutters up the code and I've already narrowed the problem to the kernel functions.

Vec is just a struct with a float x, y variables and has the basic operator() functions for adding, subtracting, multiplying with scalars, etc. Every functions is made to be compatible with CUDA with the CUDA_CALLABLE_MEMBER macro

Includes and CUDA kernels:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define CUDA_CALLABLE_MEMBER __host__ __device__


CUDA_CALLABLE_MEMBER
void kernel(vec* a , vec* b, vec* c, int i)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < i)
    {
        c[idx] =  a[idx] + b[idx];
    }
}

CUDA_CALLABLE_MEMBER
void check_positions(vec* pos, vec* res) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < sizeof(pos) / sizeof(vec)) {
        // Wrap around
        if (pos[idx].x < 0) {
            pos[idx].x = pos[idx].x + res->x;
        }
        if (pos[idx].x > res->x) {
            pos[idx].x = pos[idx].x - res->x;
        }
        if (pos[idx].y < 0) {
            pos[idx].y = pos[idx].y + res->y;
        }
        if (pos[idx].y > res->y) {
            pos[idx].y = pos[idx].y - res->y;
        }
    }
    
}

The function that runs the kernels and does the memory allocation and copying from host to device and vice versa:

vec* res = new vec(800, 800); // Resolution vector for use by the gpu

vec* addWithCuda(vec* a, vec* b, int n) {
    // Making and allocating space for arrays
    vec *c = new vec[n];
    vec *da = new vec[n];
    vec *db = new vec[n];
    vec *d_res = new vec;
    vec *dc = new vec[n];
    cudaMalloc((void**)&da, n * sizeof(vec));
    cudaMalloc((void**)&db, n * sizeof(vec));
    cudaMalloc((void**)&dc, n * sizeof(vec));
    cudaMalloc((void**)&d_res, sizeof(vec));
    cudaMemcpy(da, a, n * sizeof(vec), cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, n * sizeof(vec), cudaMemcpyHostToDevice);
    cudaMemcpy(d_res, res, sizeof(vec), cudaMemcpyHostToDevice);
    // Run the kernels
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize; 
    check_positions << <gridSize, blockSize >> > (da, d_res);
    kernel<<<gridSize, blockSize>>>(da, db, dc, n);
    // Copy data back to host and free the data
    cudaMemcpy(c, dc, n * sizeof(vec), cudaMemcpyDeviceToHost);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    cudaFree(d_res);
    return c;
}

Main code:


int main(){
// Make an array of vectors
    printf("Making an array of vectors\n");
    int n = 1<<15;
    vec* pos = new vec[n];
    vec* vel = new vec[n];
    for (int i = 0; i < n; i++) {
        pos[i].x = rand() % 800;
        pos[i].y = rand() % 800;
        vel[i].x = rand() % 5 - 2.5f;
        vel[i].y = rand() % 5 - 2.5f;
    }
    printf("Done making an array of vectors\n");
    vec* newPos;
    while(not_exited){ // Sample loop
        newPos = addWithCuda(pos, vel, n);
        // Copy pos to newPos
        for (int i = 0; i < n; i++) {
            pos[i] = newPos[i];
        }
    }
    // Print resulting positions:
    int i = -1;
    while(++i < n)
        printf("%d: (%f, %f)\n", i, pos[i].x, pos[i].y);

}

Once run, you can see that after a certain period of time every position will be outside of the "window":

32746: (-3262.000000, 989.000000)
32747: (2953.000000, -267.000000)
32748: (-2002.000000, -2314.000000)
32749: (-3400.000000, 2725.000000)
32750: (-2051.000000, 1165.000000)
32751: (1065.000000, 3004.000000)
32752: (-3700.000000, -223.000000)
32753: (-3904.000000, -440.000000)
32754: (2691.000000, -301.000000)
32755: (3065.000000, -470.000000)
32756: (-3647.000000, -703.000000)
32757: (-224.000000, -80.000000)
32758: (-2237.000000, 888.000000)
32759: (-585.000000, -620.000000)
32760: (-3926.000000, -426.000000)
32761: (-3453.000000, -1921.000000)

Any suggestions?



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source