'incorrect cuda kernel output
I am accelerating a big application, part of which relies on basic indexing as shown below:
#include <iostream>
void kernel_cpu() {
for (size_t i=0; i<3; i++) {
for (size_t j = i+1; j<4; j++) {
printf("i:%d j:%d\n", i, j);
}
}
}
__global__ void kernel_gpu() {
size_t i = blockIdx.x*blockDim.x+threadIdx.x;
if(i < 3) {
for (size_t j = i+1; j<4; j++) {
printf("i:%d j:%d\n", i, j);
}
}
}
int main() {
printf("kernel_cpu() \n");
kernel_cpu();
printf("kernel_gpu() \n");
kernel_gpu<<<1,4>>>();
cudaDeviceSynchronize();
return 0;
}
I am compiling this code with nvcc -o test test.cu
. Here is the output I get:
kernel_cpu()
i:0 j:1
i:0 j:2
i:0 j:3
i:1 j:2
i:1 j:3
i:2 j:3
kernel_gpu()
i:0 j:0
i:1 j:0
i:2 j:0
i:0 j:0
i:1 j:0
i:0 j:0
kernel_gpu()
should produce same output as kernel_cpu()
, however, I cannot understand why it is producing different indexes. Any help will be highly appreciated.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|