DirectX 12 - Root Descriptor not working properly
In my test application, I passed the model, view and projection matrices as 32-bit constants to the shaders. Now I wanted to switch to root descriptors in order to reduce my root signature size. I want to pass two constant buffers to the shader. The first one contains the model matrix (one 4x4 matrix) and the second one contains the view and projection matrices (two 4x4 matrices). The view and projection matrices work absolutely fine using a root descriptor. However, as soon as I switched the model matrix from 32-bit constants to a root descriptor, the scene did not render anymore, although the procedures are exactly the same for both of the constant buffers. DirectX shows no errors, not even in the debug layer.
Root Parameters Code
// Root Parameter: "CB_ModelMatrix"
// Root parameter 0: a root-level CBV descriptor, visible to the vertex shader only,
// bound to shader register 0 in register space 0.
rootParameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
rootParameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_VERTEX;
rootParameters[0].Descriptor.ShaderRegister = 0;
rootParameters[0].Descriptor.RegisterSpace = 0;
// Root Parameter: "CB_ViewProjectionMatrices"
// Root parameter 1: same kind and visibility as parameter 0, but bound to
// shader register 1 in register space 0.
rootParameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
rootParameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_VERTEX;
rootParameters[1].Descriptor.ShaderRegister = 1;
rootParameters[1].Descriptor.RegisterSpace = 0;
Resource Creation Code
// model matrix resource
// Heap properties: upload heap type; CPU page property and memory pool are left
// as UNKNOWN; creation and visible node masks are both 1.
D3D12_HEAP_PROPERTIES heapProperties = {};
heapProperties.Type = D3D12_HEAP_TYPE_UPLOAD;
heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProperties.CreationNodeMask = 1;
heapProperties.VisibleNodeMask = 1;
// Resource description for a plain buffer (row-major layout, unknown format, no flags).
D3D12_RESOURCE_DESC resourceDescription = {};
resourceDescription.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
resourceDescription.Alignment = 0;
// Width is sizeof(t_ConstantBufferData_ModelMatrix) rounded up to the next multiple of 256.
resourceDescription.Width = (sizeof(t_ConstantBufferData_ModelMatrix) + 255) & ~255;
resourceDescription.Height = 1;
resourceDescription.DepthOrArraySize = 1;
resourceDescription.MipLevels = 1;
resourceDescription.Format = DXGI_FORMAT_UNKNOWN;
resourceDescription.SampleDesc.Count = 1;
resourceDescription.SampleDesc.Quality = 0;
resourceDescription.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
resourceDescription.Flags = D3D12_RESOURCE_FLAG_NONE;
// Create the committed resource in the GENERIC_READ state, passing nullptr for the clear value.
ThrowIfFailed(g_GraphicsDevice->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE,
&resourceDescription, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&g_ConstantBuffer_ModelMatrix)));
// Map subresource 0 (nullptr read range) and keep the CPU pointer in
// g_ConstantBufferPointer_ModelMatrix for later memcpy updates.
ThrowIfFailed(g_ConstantBuffer_ModelMatrix->Map(0, nullptr, reinterpret_cast<void**>(&g_ConstantBufferPointer_ModelMatrix)));
// view and projection matrices resource
// NOTE(review): heapProperties and resourceDescription are declared again here;
// presumably this snippet lives in a different scope than the model matrix one - confirm.
// Heap properties: upload heap type; CPU page property and memory pool are left
// as UNKNOWN; creation and visible node masks are both 1.
D3D12_HEAP_PROPERTIES heapProperties = {};
heapProperties.Type = D3D12_HEAP_TYPE_UPLOAD;
heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProperties.CreationNodeMask = 1;
heapProperties.VisibleNodeMask = 1;
// Resource description for a plain buffer (row-major layout, unknown format, no flags).
D3D12_RESOURCE_DESC resourceDescription = {};
resourceDescription.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
resourceDescription.Alignment = 0;
// Width is sizeof(t_ConstantBufferData_ViewProjectionMatrices) rounded up to the next multiple of 256.
resourceDescription.Width = (sizeof(t_ConstantBufferData_ViewProjectionMatrices) + 255) & ~255;
resourceDescription.Height = 1;
resourceDescription.DepthOrArraySize = 1;
resourceDescription.MipLevels = 1;
resourceDescription.Format = DXGI_FORMAT_UNKNOWN;
resourceDescription.SampleDesc.Count = 1;
resourceDescription.SampleDesc.Quality = 0;
resourceDescription.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
resourceDescription.Flags = D3D12_RESOURCE_FLAG_NONE;
// Create the committed resource in the GENERIC_READ state, passing nullptr for the clear value.
ThrowIfFailed(g_GraphicsDevice->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE,
&resourceDescription, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&g_ConstantBuffer_ViewProjectionMatrices)));
// Map subresource 0 (nullptr read range) and keep the CPU pointer in
// g_ConstantBufferPointer_ViewProjectionMatrices for later memcpy updates.
ThrowIfFailed(g_ConstantBuffer_ViewProjectionMatrices->Map(0, nullptr, reinterpret_cast<void**>(&g_ConstantBufferPointer_ViewProjectionMatrices)));
Resource Updating (Model Matrix)
// CPU-side copy of the model matrix constant buffer, zero-initialized.
t_ConstantBufferData_ModelMatrix CB_ModelMatrix = {};
// ...
// Copy the struct to the start of the mapped buffer memory.
std::memcpy(g_ConstantBufferPointer_ModelMatrix, &CB_ModelMatrix, sizeof(CB_ModelMatrix));
// Bind the buffer's base GPU virtual address to root parameter 0.
g_CommandList->SetGraphicsRootConstantBufferView(0, g_ConstantBuffer_ModelMatrix->GetGPUVirtualAddress());
Resource Updating (View and Projection Matrices)
// CPU-side copy of the view/projection constant buffer, zero-initialized.
t_ConstantBufferData_ViewProjectionMatrices CB_ViewProjectionMatrices = {};
// ...
// Copy the struct to the start of the mapped buffer memory.
std::memcpy(g_ConstantBufferPointer_ViewProjectionMatrices, &CB_ViewProjectionMatrices, sizeof(CB_ViewProjectionMatrices));
// Bind the buffer's base GPU virtual address to root parameter 1.
g_CommandList->SetGraphicsRootConstantBufferView(1, g_ConstantBuffer_ViewProjectionMatrices->GetGPUVirtualAddress());
Constant Buffers in the Vertex Shader
// Layout of the model matrix constant buffer: a single 4x4 float matrix.
struct t_ConstantBufferData_ModelMatrix
{
float4x4 ModelMatrix;
};
// Layout of the view/projection constant buffer: two 4x4 float matrices,
// view first, then projection.
struct t_ConstantBufferData_ViewProjectionMatrices
{
float4x4 ViewMatrix;
float4x4 ProjectionMatrix;
};
// Bound to register b0 in space0.
ConstantBuffer<t_ConstantBufferData_ModelMatrix> CB_ModelMatrix : register(b0, space0);
// Bound to register b1 in space0.
ConstantBuffer<t_ConstantBufferData_ViewProjectionMatrices> CB_ViewProjectionMatrices : register(b1, space0);
The resources are created and mapped at the beginning of the program and are unmapped at the end of it. The view and projection matrices resource is updated once per frame. The model matrix resource is updated multiple times per frame since I have multiple game objects with different transformations.
I can't figure out why a root descriptor works for the view and projection matrices but won't work for the model matrix. Maybe I overlooked a basic mechanic of root descriptors? I would really appreciate it if someone could tell me what I missed.
By the way, a side question: Is it safe to keep the resources mapped during the whole lifetime of the application? I assume I can achieve better performance with that instead of mapping and unmapping the resources every time they are updated.
EDIT:
The scene consists of 27 cubes that are rendered in a 3x3x3 grid at the center of the scene. Some of those cubes are rendered behind the camera. After I explored the scene with the camera more precisely and with the help of graphics debugging tools, I realized that only the last cube is rendered. To render the grid of cubes I use the following loop in the render function:
// Record one draw per cube (27 iterations).
for (unsigned char t = 0; t < 27; t++)
{
// Build this cube's model matrix from its transform.
t_ConstantBufferData_ModelMatrix CB_ModelMatrix = {};
CB_ModelMatrix.ModelMatrix = createModelMatrix(g_Transforms[t]);
// Overwrite the start of the mapped buffer; every iteration writes to the same CPU address.
std::memcpy(g_ConstantBufferPointer_ModelMatrix, &CB_ModelMatrix, sizeof(CB_ModelMatrix));
// Bind the buffer's base GPU virtual address to root parameter 0; the same resource
// (and therefore the same address) is bound on every iteration.
g_CommandList->SetGraphicsRootConstantBufferView(0, g_ConstantBuffer_ModelMatrix->GetGPUVirtualAddress());
// Record an indexed draw of 36 indices, 1 instance.
g_CommandList->DrawIndexedInstanced(36, 1, 0, 0, 0);
}
I create a model matrix from the transformation of the current cube and copy the matrix to the constant buffer. After that, the cube is drawn. The model matrix is created correctly and the memory of the constant buffer changes as intended. However, only the last cube is drawn.
My assumption is, that the draw commands in the command list only store a pointer to the constant buffer and right before the command list is executed, only the model matrix of the last cube is present in the buffer. Could this be the cause of the issue? If so, what could be done to resolve it?
Solution 1:[1]
I did a bit of research again assuming, that my approach is not the right one. I searched about techniques to render the same set of vertices multiple times with different transformations. Finally, I stumbled upon "Instancing" or "Instanced Drawing". I knew, that such a technique existed but in most of the tutorials I read, it was classified as "advanced graphics programming", that will be discussed later. With the help of this tutorial (which was originally written for DirectX 11 but could be ported really easily to DirectX 12) I was able to render the scene correctly using multiple instances of my 3D object. In addition to that, I could get rid of passing my model matrices as root parameters to the graphics pipeline. Also the rendering performance increases when instancing is utilized.
For those who are not familiar with this concept, here is a short summary:
Instancing is used, if multiple objects, that share the same set of vertices but have different transformations (and some other properties according to the tutorial), need to be drawn. Instead of rendering each object with a separate draw call, all of these objects can be rendered with only a single draw call. This requires to pass some extra information (like transformations) to the shader via the input data layout of the pipeline. These layout parameters can be configured, to be received by the shaders not per vertex but per instance. When many objects need to be rendered (hundreds or thousands of objects), instancing will increase the performance drastically.
Technical Details
When implementing this concept, multiple techniques are available.
The instance data can be calculated directly in the shader. This is done by manipulating the vertex data according to the instance number, which can be retrieved using the SV_InstanceID shader semantic. This approach may not be very flexible.
Additional data can also be passed to the shader using a constant buffer.
The last approach is to use an "instance buffer". An instance buffer is created exactly the same way as a vertex buffer. It is a resource that must be created, filled with data (e.g. transformation data) and uploaded to the GPU. Also, the instance buffer needs to be referenced in the input layout of the pipeline. When a frame is rendered, this buffer must be set in the pipeline using the ID3D12GraphicsCommandList::IASetVertexBuffers function. Finally, ID3D12GraphicsCommandList::DrawIndexedInstanced must be called, which takes the instance count as a parameter. The further processing of the data is then up to the shader implementation.
Solution 2:[2]
The simple solution would be to have each object keep its own constant buffer. Bear in mind that you would need NUMBER_OF_OBJECTS * FRAME_COUNT cbuffers, since otherwise there would be a possibility of overwriting an old cbuffer that is still in use by the GPU. (That's only the case if you have more than 1 frame in flight, i.e. you are not waiting at the end of each frame for the GPU to finish.)
A better solution would be to allocate, each time you need one, from a large global upload buffer. Here you have two strategies: a linear allocator or a ring allocator. I am going to explain the linear one here, but you can see the link at the end of the post for both implementations.
It could look something like this:
/// Returns sizeof(CBuffer) rounded up to the next multiple of
/// D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT.
/// The add-then-mask trick relies on that alignment being a power of two.
template<typename CBuffer>
inline constexpr uint32_t GetCBufferSize()
{
    constexpr auto alignment_mask = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1;
    return (sizeof(CBuffer) + alignment_mask) & ~alignment_mask;
}
/// Describes one sub-range handed out from a larger upload buffer.
/// Holds only raw pointers/addresses; nothing here releases the underlying resource.
struct Allocation
{
    ID3D12Resource* buffer = nullptr;           // resource the range belongs to
    void* cpu_address = nullptr;                // CPU write pointer to the start of the range
    D3D12_GPU_VIRTUAL_ADDRESS gpu_address = 0;  // GPU virtual address of the start of the range
    size_t offset = 0;                          // byte offset of the range inside the buffer
    size_t size = 0;                            // size of the range in bytes

    /// Copies `size_in_bytes` bytes from `data` to the start of the range.
    /// The source is taken as `const void*` so read-only data can be passed;
    /// the parameter is named differently from the `size` member to avoid shadowing it.
    /// No bounds check against `size` is performed.
    void Update(const void* data, size_t size_in_bytes)
    {
        memcpy(cpu_address, data, size_in_bytes);
    }

    /// Copies the object representation of `data` (sizeof(T) bytes) to the start of the range.
    template<typename T>
    void Update(T const& data)
    {
        memcpy(cpu_address, &data, sizeof(T));
    }
};
/// A persistently mapped upload-heap buffer from which per-draw sub-ranges
/// (e.g. constant buffer data) are carved out via a linear allocator.
class UploadBuffer
{
public:
    /// Creates a committed buffer of `max_size_in_bytes` bytes on an upload heap,
    /// maps it, and caches its CPU pointer and GPU virtual address.
    /// NOTE(review): `max_size_in_bytes` is not forwarded to the LinearAllocator, which is
    /// default-constructed here; presumably the real allocator needs the capacity - confirm.
    UploadBuffer(ID3D12Device* device, SIZE_T max_size_in_bytes)
    {
        auto heap_properties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
        auto buffer_desc = CD3DX12_RESOURCE_DESC::Buffer(max_size_in_bytes);
        BREAK_IF_FAILED(device->CreateCommittedResource(
            &heap_properties,
            D3D12_HEAP_FLAG_NONE,
            &buffer_desc,
            D3D12_RESOURCE_STATE_GENERIC_READ,
            nullptr,
            IID_PPV_ARGS(&buffer)));
        // Empty read range (0, 0) is passed to Map; the buffer is only written through cpu_address here.
        CD3DX12_RANGE read_range(0, 0);
        BREAK_IF_FAILED(buffer->Map(0, &read_range, reinterpret_cast<void**>(&cpu_address)));
        gpu_address = buffer->GetGPUVirtualAddress();
    }

    /// Reserves `size_in_bytes` bytes with the given alignment and returns the
    /// CPU/GPU addresses of that sub-range.
    /// NOTE(review): the offset returned by the allocator is used unchecked; how
    /// LinearAllocator reports exhaustion is not visible here - confirm.
    Allocation Allocate(SIZE_T size_in_bytes, SIZE_T alignment)
    {
        const auto offset_in_buffer = linear_allocator.Allocate(size_in_bytes, alignment);
        Allocation allocation{};
        allocation.buffer = buffer.Get();
        // cpu_address is already a byte pointer, so plain pointer arithmetic is enough.
        allocation.cpu_address = cpu_address + offset_in_buffer;
        allocation.gpu_address = gpu_address + offset_in_buffer;
        allocation.offset = offset_in_buffer;
        allocation.size = size_in_bytes;
        return allocation;
    }

    /// Resets the linear allocator so the buffer can be reused.
    void Clear()
    {
        linear_allocator.Clear();
    }

private:
    LinearAllocator linear_allocator;
    ComPtr<ID3D12Resource> buffer;
    uint8_t* cpu_address = nullptr;
    D3D12_GPU_VIRTUAL_ADDRESS gpu_address = 0;
};
Example of usage:
// initialization: one upload buffer per frame in flight
for (size_t i = 0; i < FRAMES_IN_FLIGHT; ++i)
{
    upload_buffers[i] = UploadBuffer(device, MAX_UPLOAD_BUFFER_SIZE);
}
// frame
// Bind by reference so Clear()/Allocate() act on the per-frame buffer itself
// rather than on a copy (GetUploadBufferForThisFrame is expected to return a reference).
UploadBuffer& upload_buffer = GetUploadBufferForThisFrame();
upload_buffer.Clear();
//...
for (auto&& object : scene)
{
    //...
    model_matrix_cbuf.model_matrix = object.model_matrix;
    // Each object gets its own sub-range, so each bind below uses a distinct GPU address.
    object_allocation = upload_buffer.Allocate(GetCBufferSize<ModelMatrixCBuffer>(), D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT);
    object_allocation.Update(model_matrix_cbuf);
    cmd_list->SetGraphicsRootConstantBufferView(0, object_allocation.gpu_address); //or whatever root parameter index your cbuffer is
}
Bear in mind this is somewhat simplified so it's easier to understand.
Also, if your application is multithreaded, you would need to protect allocator calls with a mutex or use an atomic. For an example implementation, you can see this repo; some, but not all, of the relevant files would be: LinearUploadBuffer.h/cpp, RingUploadBuffer.h/cpp, LinearAllocator.h/cpp, RingAllocator.h/cpp, DynamicAllocation.h.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Erik So |
| Solution 2 | |