'why less calculation cost much more running time

I'm working with sparse voxel octree and now optimizing octreee mipmaping code.

vec4 MipmapCell(uint nodeInd, uint idx)
{
    if ((locationList[nodeInd].y & 0x000000ffu) == 0x000000ffu)
        return vec4(0.0f);
    uint brickInd = octrees[nodeInd].y & 0x3fffffffu;
    uint blockInd = GetBrickBlock(brickInd);
    ivec3 globalCoord = GetBrickCoord(brickInd, ivec3(0));
    ivec3 startLoc = ivec3(idx & 0x00000001u, (idx >> 1u) & 0x00000001u, (idx >> 2u) & 0x00000001u);
    vec4 colors[8] = { vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f) };
    for (int ind = 0; ind < 8; ++ind)
    {
        ivec3 xyz = ivec3(int(ind & 0x1), int((ind >> 1) & 0x1), int((ind >> 2) & 0x1));
        ivec3 coord = globalCoord + startLoc + xyz;
        colors[ind] = imageLoad(brickPool[blockInd].lights, coord);
    }
    uint cur = 0u, nInd = 0u, ind = 0u;
    for (ind = 0u; ind < 4u; ++ind)
    {
        cur = ind << 1u;
        nInd = cur + 0x00000001u;
        colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
    }

    for (ind = 0u; ind < 2u; ++ind)
    {
        cur = ind << 2u;
        nInd = cur + 0x00000002u;
        colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
    }

    {
        ind = 0u;
        nInd = ind + 0x00000004u;
        colors[ind] = BlendColorWithAvg(colors[ind], colors[nInd]);
    }
    return colors[0];
}

uvec3 shrinkvec3 = uvec3(0u, 0u, 0u);
void shrinkInd(uint ind , out uint exInd, out uint exIdx)
{
    shrinkvec3 = uvec3((ind >> 1u) & 0x1u, (ind >> 3u) & 0x1u, (ind >> 5u) & 0x1u);
    exInd = shrinkvec3.x | (shrinkvec3.y << 1u) | (shrinkvec3.z << 2u);
    shrinkvec3 = uvec3(ind & 0x1u, (ind >> 2u) & 0x1u, (ind >> 4u) & 0x1u);
    exIdx = shrinkvec3.x | (shrinkvec3.y << 1u) | (shrinkvec3.z << 2u);
}

const ivec3 brickLocList[27] = 
{
    ivec3(0, 0, 0),
    ivec3(1, 0, 0),
    ivec3(2, 0, 0),
    ivec3(0, 1, 0),
    ivec3(1, 1, 0),
    ivec3(2, 1, 0),
    ivec3(0, 2, 0),
    ivec3(1, 2, 0),
    ivec3(2, 2, 0),
    ivec3(0, 0, 1),
    ivec3(1, 0, 1),
    ivec3(2, 0, 1),
    ivec3(0, 1, 1),
    ivec3(1, 1, 1),
    ivec3(2, 1, 1),
    ivec3(0, 2, 1),
    ivec3(1, 2, 1),
    ivec3(2, 2, 1),
    ivec3(0, 0, 2),
    ivec3(1, 0, 2),
    ivec3(2, 0, 2),
    ivec3(0, 1, 2),
    ivec3(1, 1, 2),
    ivec3(2, 1, 2),
    ivec3(0, 2, 2),
    ivec3(1, 2, 2),
    ivec3(2, 2, 2)
};

void MipmapBrickSecond(uint nodeInd)
{
    if ((octrees[nodeInd].x & 0x80000000) == 0u)
        return;
    vec4 cell[64] = { 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),  
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),  
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), 
                        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f) };

    uint childInd = octrees[nodeInd].x & 0x3fffffffu;
    uint blockInd = GetBrickBlock(octrees[nodeInd].y & 0x3fffffffu);
    ivec3 globalCoord = GetBrickCoord(octrees[nodeInd].y & 0x3fffffffu, ivec3(0));
    for (uint ind = 0u; ind < 64u; ++ind)
    {
        uint exInd, exIdx;
        shrinkInd(ind, exInd, exIdx);
        cell[ind] = MipmapCell(childInd + exInd, exIdx);
    }
    for (int brickInd = 0; brickInd < 27; ++brickInd)
    {
        ivec3 grid = brickLocList[brickInd];
        ivec3 coord = globalCoord + grid;
        vec4 colors[8] = { vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f) };
        ivec3 startLoc = grid * 2 + ivec3(-1);
        for (int cellInd = 0; cellInd < 8; ++cellInd)
        {
            ivec3 xyz = ivec3(cellInd & 0x1, (cellInd >> 1) & 0x1, (cellInd >> 2) & 0x1);
            xyz += startLoc;
            if (xyz.x < 0 || xyz.y < 0 || xyz.z < 0 || xyz.x > 3 || xyz.y > 3 || xyz.z > 3)
                continue;
            int ind = xyz.x | (xyz.y << 2) | (xyz.z << 4);
            colors[cellInd] = cell[ind];
        }
        for (uint ind = 0u; ind < 4u; ++ind)
        {
            uint cur = ind << 1u;
            uint nInd = cur + 0x00000001u;
            if (grid.x == 1)
                colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
            else 
                colors[cur] = max(colors[nInd], colors[cur]);
        }

        for (uint ind = 0u; ind < 2u; ++ind)
        {
            uint cur = ind << 2u;
            uint nInd = cur + 0x00000002u;
            if (grid.y == 1)
                colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
            else 
                colors[cur] = max(colors[nInd], colors[cur]);
        }
        if (grid.z == 1)
        {
            colors[0] = BlendColorWithAvg(colors[0], colors[4]);
        }
        else 
            colors[0] = max(colors[0], colors[4]);
        imageStore(brickPool[blockInd].lights, coord, colors[0]);
    }
}

above is the old mipmapping code, there are 1240 calculations. As you can see, this algorithm has many redundant calculations and texture fetching operations. So I make some optimization, here is the new algorithm, it just costs 535 calculations and texture fetching operation. It should be two time faster than the above algorithm..

void MipmapBrickThird(uint nodeInd)
{
    if ((octrees[nodeInd].x & 0x80000000u) == 0u)
        return;
    vec4 cell[125] = {
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
        vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f)
    };

    ivec3 brickLoc;
    uint brickInd;
    uint childInd = octrees[nodeInd].x & 0x3fffffffu;
    ivec3 globalCoord;
    ivec3 baseCoord;
    int cellInd;
    uint blockInd;
    for (uint childOffset = 0u; childOffset < 8u; ++childOffset)
    {
        if ((locationList[childInd + childOffset].y & 0x000000ffu) == 0x000000ffu)
            continue;
        brickInd = octrees[childInd + childOffset].y & 0x3fffffffu;
        globalCoord = GetBrickCoord(brickInd, ivec3(0));
        baseCoord = ivec3((childOffset & 0x1) << 1, ((childOffset >> 1) & 0x1) << 1, (childOffset >> 2) << 1);
        blockInd = GetBrickBlock(brickInd);
        for (uint ind = 0u; ind < 27u; ++ind)
        {
            brickLoc = brickLocList[ind];
            cellInd = brickLoc.x + baseCoord.x + (brickLoc.y + baseCoord.y) * 5 + (brickLoc.z + baseCoord.z) * 25;
            cell[cellInd] = max(cell[cellInd], imageLoad(brickPool[blockInd].lights, globalCoord + brickLoc));
        }
    }
    for (int ind = 0; ind < 100; ++ind)
    {
        cellInd = (ind & 0x3) + (ind >> 2) * 5;
        BlendColorWithAlphaInout(cell[cellInd], cell[cellInd + 1]);
    }

    for (int ind = 0; ind < 80; ++ind)
    {
        cellInd = (ind & 0x3) * 25 + (ind >> 4) * 5 + ((ind >> 2) & 0x3);
        BlendColorWithAlphaInout(cell[cellInd], cell[cellInd + 25]);
    }

    for (int ind = 0; ind < 64; ++ind)
    {
        cellInd = (ind & 0x3) * 5 + (ind >> 4) * 25 + ((ind >> 2) & 0x3);
        BlendColorWithAvgInout(cell[cellInd], cell[cellInd + 5]);
    }

    for (int ind = 0; ind < 16; ++ind)
    {
        cellInd = (ind >> 2) * 25 + (ind & 0x3) * 5;
        BlendColorWithAlphaInout(cell[cellInd + 1], cell[cellInd + 2]);
    }

    for (int ind = 0; ind < 16; ++ind)
    {
        cellInd = (ind & 0x3) + (ind >> 2) * 5;
        BlendColorWithAlphaInout(cell[25 + cellInd], cell[50 + cellInd]);
    }

    for (int ind = 0; ind < 16; ++ind)
    {
        cellInd = (ind & 0x3) + (ind >> 2) * 25;
        BlendColorWithAvgInout(cell[cellInd + 5], cell[cellInd + 10]);
    }

    brickInd = octrees[nodeInd].y & 0x3fffffffu;
    globalCoord = GetBrickCoord(brickInd, ivec3(0));
    blockInd = GetBrickBlock(brickInd);
    for (int ind = 0; ind < 27; ++ind)
    {
        brickLoc = brickLocList[ind];
        cellInd = cellMapping[brickLoc.z] * 25 + cellMapping[brickLoc.y] * 5 + cellMapping[brickLoc.x];
        imageStore(brickPool[blockInd].lights, globalCoord + brickLoc, cell[cellInd]);
    }
}

But actually, MipmapBrickThird costs much more time than MipmapBrickSecond(about five times). Why the less calculation costs more running time. MipmapBrickThird has less "if-else" judgment, less texture fetching and less calculations, but it costs more time.



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source