'incorrect cuda kernel output

I am accelerating a big application, part of which relies on basic indexing as shown below:

#include <iostream>

void kernel_cpu() {
    for (size_t i=0; i<3; i++) {
        for (size_t j = i+1; j<4; j++) {
            printf("i:%d j:%d\n", i, j);
        }
    }
}

__global__ void kernel_gpu() {
    size_t i = blockIdx.x*blockDim.x+threadIdx.x;
    if(i < 3) {
        for (size_t j = i+1; j<4; j++) {
            printf("i:%d j:%d\n", i, j);
        }
    }
}

int main() {
    printf("kernel_cpu() \n");
    kernel_cpu();

    printf("kernel_gpu() \n");
    kernel_gpu<<<1,4>>>();
    cudaDeviceSynchronize();

    return 0;
}

I am compiling this code with nvcc -o test test.cu. Here is the output I get:

kernel_cpu() 
i:0 j:1
i:0 j:2
i:0 j:3
i:1 j:2
i:1 j:3
i:2 j:3
kernel_gpu() 
i:0 j:0
i:1 j:0
i:2 j:0
i:0 j:0
i:1 j:0
i:0 j:0

kernel_gpu() should produce same output as kernel_cpu(), however, I cannot understand why it is producing different indexes. Any help will be highly appreciated.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source

'incorrect cuda kernel output

Sources

Related Questions