'why less calculation cost much more running time
I'm working with sparse voxel octree and now optimizing octreee mipmaping code.
vec4 MipmapCell(uint nodeInd, uint idx)
{
if ((locationList[nodeInd].y & 0x000000ffu) == 0x000000ffu)
return vec4(0.0f);
uint brickInd = octrees[nodeInd].y & 0x3fffffffu;
uint blockInd = GetBrickBlock(brickInd);
ivec3 globalCoord = GetBrickCoord(brickInd, ivec3(0));
ivec3 startLoc = ivec3(idx & 0x00000001u, (idx >> 1u) & 0x00000001u, (idx >> 2u) & 0x00000001u);
vec4 colors[8] = { vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f) };
for (int ind = 0; ind < 8; ++ind)
{
ivec3 xyz = ivec3(int(ind & 0x1), int((ind >> 1) & 0x1), int((ind >> 2) & 0x1));
ivec3 coord = globalCoord + startLoc + xyz;
colors[ind] = imageLoad(brickPool[blockInd].lights, coord);
}
uint cur = 0u, nInd = 0u, ind = 0u;
for (ind = 0u; ind < 4u; ++ind)
{
cur = ind << 1u;
nInd = cur + 0x00000001u;
colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
}
for (ind = 0u; ind < 2u; ++ind)
{
cur = ind << 2u;
nInd = cur + 0x00000002u;
colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
}
{
ind = 0u;
nInd = ind + 0x00000004u;
colors[ind] = BlendColorWithAvg(colors[ind], colors[nInd]);
}
return colors[0];
}
uvec3 shrinkvec3 = uvec3(0u, 0u, 0u);
void shrinkInd(uint ind , out uint exInd, out uint exIdx)
{
shrinkvec3 = uvec3((ind >> 1u) & 0x1u, (ind >> 3u) & 0x1u, (ind >> 5u) & 0x1u);
exInd = shrinkvec3.x | (shrinkvec3.y << 1u) | (shrinkvec3.z << 2u);
shrinkvec3 = uvec3(ind & 0x1u, (ind >> 2u) & 0x1u, (ind >> 4u) & 0x1u);
exIdx = shrinkvec3.x | (shrinkvec3.y << 1u) | (shrinkvec3.z << 2u);
}
const ivec3 brickLocList[27] =
{
ivec3(0, 0, 0),
ivec3(1, 0, 0),
ivec3(2, 0, 0),
ivec3(0, 1, 0),
ivec3(1, 1, 0),
ivec3(2, 1, 0),
ivec3(0, 2, 0),
ivec3(1, 2, 0),
ivec3(2, 2, 0),
ivec3(0, 0, 1),
ivec3(1, 0, 1),
ivec3(2, 0, 1),
ivec3(0, 1, 1),
ivec3(1, 1, 1),
ivec3(2, 1, 1),
ivec3(0, 2, 1),
ivec3(1, 2, 1),
ivec3(2, 2, 1),
ivec3(0, 0, 2),
ivec3(1, 0, 2),
ivec3(2, 0, 2),
ivec3(0, 1, 2),
ivec3(1, 1, 2),
ivec3(2, 1, 2),
ivec3(0, 2, 2),
ivec3(1, 2, 2),
ivec3(2, 2, 2)
};
void MipmapBrickSecond(uint nodeInd)
{
if ((octrees[nodeInd].x & 0x80000000) == 0u)
return;
vec4 cell[64] = {
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f) };
uint childInd = octrees[nodeInd].x & 0x3fffffffu;
uint blockInd = GetBrickBlock(octrees[nodeInd].y & 0x3fffffffu);
ivec3 globalCoord = GetBrickCoord(octrees[nodeInd].y & 0x3fffffffu, ivec3(0));
for (uint ind = 0u; ind < 64u; ++ind)
{
uint exInd, exIdx;
shrinkInd(ind, exInd, exIdx);
cell[ind] = MipmapCell(childInd + exInd, exIdx);
}
for (int brickInd = 0; brickInd < 27; ++brickInd)
{
ivec3 grid = brickLocList[brickInd];
ivec3 coord = globalCoord + grid;
vec4 colors[8] = { vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f), vec4(0.0f) };
ivec3 startLoc = grid * 2 + ivec3(-1);
for (int cellInd = 0; cellInd < 8; ++cellInd)
{
ivec3 xyz = ivec3(cellInd & 0x1, (cellInd >> 1) & 0x1, (cellInd >> 2) & 0x1);
xyz += startLoc;
if (xyz.x < 0 || xyz.y < 0 || xyz.z < 0 || xyz.x > 3 || xyz.y > 3 || xyz.z > 3)
continue;
int ind = xyz.x | (xyz.y << 2) | (xyz.z << 4);
colors[cellInd] = cell[ind];
}
for (uint ind = 0u; ind < 4u; ++ind)
{
uint cur = ind << 1u;
uint nInd = cur + 0x00000001u;
if (grid.x == 1)
colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
else
colors[cur] = max(colors[nInd], colors[cur]);
}
for (uint ind = 0u; ind < 2u; ++ind)
{
uint cur = ind << 2u;
uint nInd = cur + 0x00000002u;
if (grid.y == 1)
colors[cur] = BlendColorWithAlpha(colors[cur], colors[nInd]);
else
colors[cur] = max(colors[nInd], colors[cur]);
}
if (grid.z == 1)
{
colors[0] = BlendColorWithAvg(colors[0], colors[4]);
}
else
colors[0] = max(colors[0], colors[4]);
imageStore(brickPool[blockInd].lights, coord, colors[0]);
}
}
above is the old mipmapping code, there are 1240 calculations. As you can see, this algorithm has many redundant calculations and texture fetching operations. So I make some optimization, here is the new algorithm, it just costs 535 calculations and texture fetching operation. It should be two time faster than the above algorithm..
void MipmapBrickThird(uint nodeInd)
{
if ((octrees[nodeInd].x & 0x80000000u) == 0u)
return;
vec4 cell[125] = {
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f),
vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f), vec4(0.f)
};
ivec3 brickLoc;
uint brickInd;
uint childInd = octrees[nodeInd].x & 0x3fffffffu;
ivec3 globalCoord;
ivec3 baseCoord;
int cellInd;
uint blockInd;
for (uint childOffset = 0u; childOffset < 8u; ++childOffset)
{
if ((locationList[childInd + childOffset].y & 0x000000ffu) == 0x000000ffu)
continue;
brickInd = octrees[childInd + childOffset].y & 0x3fffffffu;
globalCoord = GetBrickCoord(brickInd, ivec3(0));
baseCoord = ivec3((childOffset & 0x1) << 1, ((childOffset >> 1) & 0x1) << 1, (childOffset >> 2) << 1);
blockInd = GetBrickBlock(brickInd);
for (uint ind = 0u; ind < 27u; ++ind)
{
brickLoc = brickLocList[ind];
cellInd = brickLoc.x + baseCoord.x + (brickLoc.y + baseCoord.y) * 5 + (brickLoc.z + baseCoord.z) * 25;
cell[cellInd] = max(cell[cellInd], imageLoad(brickPool[blockInd].lights, globalCoord + brickLoc));
}
}
for (int ind = 0; ind < 100; ++ind)
{
cellInd = (ind & 0x3) + (ind >> 2) * 5;
BlendColorWithAlphaInout(cell[cellInd], cell[cellInd + 1]);
}
for (int ind = 0; ind < 80; ++ind)
{
cellInd = (ind & 0x3) * 25 + (ind >> 4) * 5 + ((ind >> 2) & 0x3);
BlendColorWithAlphaInout(cell[cellInd], cell[cellInd + 25]);
}
for (int ind = 0; ind < 64; ++ind)
{
cellInd = (ind & 0x3) * 5 + (ind >> 4) * 25 + ((ind >> 2) & 0x3);
BlendColorWithAvgInout(cell[cellInd], cell[cellInd + 5]);
}
for (int ind = 0; ind < 16; ++ind)
{
cellInd = (ind >> 2) * 25 + (ind & 0x3) * 5;
BlendColorWithAlphaInout(cell[cellInd + 1], cell[cellInd + 2]);
}
for (int ind = 0; ind < 16; ++ind)
{
cellInd = (ind & 0x3) + (ind >> 2) * 5;
BlendColorWithAlphaInout(cell[25 + cellInd], cell[50 + cellInd]);
}
for (int ind = 0; ind < 16; ++ind)
{
cellInd = (ind & 0x3) + (ind >> 2) * 25;
BlendColorWithAvgInout(cell[cellInd + 5], cell[cellInd + 10]);
}
brickInd = octrees[nodeInd].y & 0x3fffffffu;
globalCoord = GetBrickCoord(brickInd, ivec3(0));
blockInd = GetBrickBlock(brickInd);
for (int ind = 0; ind < 27; ++ind)
{
brickLoc = brickLocList[ind];
cellInd = cellMapping[brickLoc.z] * 25 + cellMapping[brickLoc.y] * 5 + cellMapping[brickLoc.x];
imageStore(brickPool[blockInd].lights, globalCoord + brickLoc, cell[cellInd]);
}
}
But actually, MipmapBrickThird costs much more time than MipmapBrickSecond(about five times). Why the less calculation costs more running time. MipmapBrickThird has less "if-else" judgment, less texture fetching and less calculations, but it costs more time.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
