'CUDA limit seems to be reached, but what limit is that?

I have a CUDA program that seems to be hitting some sort of limit of some resource, but I can't figure out what that resource is. Here is the kernel function:

__global__ void DoCheck(float2* points, int* segmentToPolylineIndexMap, 
                        int segmentCount, int* output)
{
    int segmentIndex = threadIdx.x + blockIdx.x * blockDim.x;
    int pointCount = segmentCount + 1;

    if(segmentIndex >= segmentCount)
        return;

    int polylineIndex = segmentToPolylineIndexMap[segmentIndex];
    int result = 0;
    if(polylineIndex >= 0)
    {
        float2 p1 = points[segmentIndex];
        float2 p2 = points[segmentIndex+1];
        float2 A = p2;
        float2 a;
        a.x = p2.x - p1.x;
        a.y = p2.y - p1.y;

        for(int i = segmentIndex+2; i < segmentCount; i++)
        {
            int currentPolylineIndex = segmentToPolylineIndexMap[i];

            // if not a different segment within out polyline and
            // not a fake segment
            bool isLegit = (currentPolylineIndex != polylineIndex && 
                currentPolylineIndex >= 0);      

            float2 p3 = points[i];
            float2 p4 = points[i+1];
            float2 B = p4;
            float2 b;
            b.x = p4.x - p3.x;
            b.y = p4.y - p3.y;

            float2 c;
            c.x = B.x - A.x;
            c.y = B.y - A.y;

            float2 b_perp;
            b_perp.x = -b.y;
            b_perp.y = b.x;

            float numerator = dot(b_perp, c);
            float denominator = dot(b_perp, a);
            bool isParallel = (denominator == 0.0);

            float quotient = numerator / denominator;
            float2 intersectionPoint;
            intersectionPoint.x = quotient * a.x + A.x;
            intersectionPoint.y = quotient * a.y + A.y;

            result = result | (isLegit && !isParallel && 
                intersectionPoint.x > min(p1.x, p2.x) && 
                intersectionPoint.x > min(p3.x, p4.x) && 
                intersectionPoint.x < max(p1.x, p2.x) && 
                intersectionPoint.x < max(p3.x, p4.x) && 
                intersectionPoint.y > min(p1.y, p2.y) && 
                intersectionPoint.y > min(p3.y, p4.y) && 
                intersectionPoint.y < max(p1.y, p2.y) && 
                intersectionPoint.y < max(p3.y, p4.y));
        }
    }

    output[segmentIndex] = result;
}

Here is the call to execute the kernel function:

DoCheck<<<702, 32>>>(
    (float2*)devicePoints, 
    deviceSegmentsToPolylineIndexMap, 
    numSegments, 
    deviceOutput);

The sizes of the parameters are as follows:

  • devicePoints = 22,464 float2s = 179,712 bytes
  • deviceSegmentsToPolylineIndexMap = 22,463 ints = 89,852 bytes
  • numSegments = 1 int = 4 bytes
  • deviceOutput = 22,463 ints = 89,852 bytes

When I execute this kernel, it crashes the video card. It would appear that I am hitting some sort of limit, because if I execute the kernel using DoCheck<<<300, 32>>>(...);, it works. Just to be clear, the parameters are the same, just the number of blocks is different.

Any idea why one crashes the video driver, and the other doesn't? The one that fail seems to be still within the card's limit on number of blocks.

Update More information on my system configuration:

  • Video Card: nVidia 8800GT
  • CUDA Version: 1.1
  • OS: Windows Server 2008 R2

I also tried it on a laptop with the following configuration, but got the same results:

  • Video Card: nVidia Quadro FX 880M
  • CUDA Version: 1.2
  • OS: Windows 7 64-bit


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source