From ba1b0f949a2b6e3b2832073e4e64b3793939883d Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Thu, 17 Aug 2023 10:30:33 +0800 Subject: [PATCH] Test int vs uint overhead for div/mod --- D3D12ComputeAdd/D3D12Sample.cpp | 18 +++++--- D3D12ComputeAdd/D3D12Sample.h | 4 +- D3D12ComputeAdd/SLM_4X4_16X16.hlsl | 72 ++++++++++++++++++++++++++---- 3 files changed, 77 insertions(+), 17 deletions(-) diff --git a/D3D12ComputeAdd/D3D12Sample.cpp b/D3D12ComputeAdd/D3D12Sample.cpp index 95e8f60..74cbe54 100644 --- a/D3D12ComputeAdd/D3D12Sample.cpp +++ b/D3D12ComputeAdd/D3D12Sample.cpp @@ -16,6 +16,7 @@ //#define USE_STRUCTURED_BUFFERS //#define USE_VEC4 +#define USE_INT #define PRINT_DATA namespace @@ -43,7 +44,7 @@ D3D12Sample::D3D12Sample() : m_cbSrvDescriptorSize(0), m_constantBufferData{}, m_dataSize(1024*1024), - m_workGroupSizeX(128), + m_workGroupSizeX(64), m_componentSize(1) { #ifdef USE_VEC4 @@ -202,6 +203,9 @@ void D3D12Sample::LoadAssets() #endif #ifdef USE_VEC4 "USE_VEC4", "1", +#endif +#ifdef USE_INT + "USE_INT", "1", #endif nullptr, nullptr }; @@ -290,7 +294,7 @@ void D3D12Sample::LoadSizeDependentResources() const UINT elementCount = m_dataSize; for ( int i = 0; i < elementCount; ++i ) { - buf1Data.push_back((float) rand() / float(RAND_MAX)); + buf1Data.push_back(rand() % 200); } const UINT bufferSize = buf1Data.size() * sizeof(float); @@ -342,7 +346,7 @@ void D3D12Sample::LoadSizeDependentResources() const UINT elementCount = m_dataSize; for ( int i = 0; i < elementCount; ++i ) { - buf2Data.push_back((float) rand() / float(RAND_MAX)); + buf2Data.push_back(rand() % 200); } const UINT bufferSize = buf2Data.size() * sizeof(float); @@ -548,7 +552,7 @@ void D3D12Sample::RunCompute() float result = 0.0; int m = rand() % m_dataSize; D3D12_RANGE readbackBufferRange{ 0, outputBufferSize }; - FLOAT * pReadbackBufferData{}; + int * pReadbackBufferData{}; ThrowIfFailed(readbackBuffer->Map( 0, &readbackBufferRange, @@ -556,12 +560,12 @@ void D3D12Sample::RunCompute() bool 
hasError = false; for (int i = 0; i < m_dataSize; i++) { - float gpuResult = pReadbackBufferData[i]; - float cpuResult = buf1Data[i] + buf2Data[i]; + int gpuResult = pReadbackBufferData[i]; + int cpuResult = buf1Data[i] + buf2Data[i]; if (abs(gpuResult - cpuResult) > 0.003) { hasError = true; - printf("The result is not correct at %d. Expected %f, actual %f", i, cpuResult, gpuResult); + printf("The result is not correct at %d. Expected %d, actual %d", i, cpuResult, gpuResult); break; } } diff --git a/D3D12ComputeAdd/D3D12Sample.h b/D3D12ComputeAdd/D3D12Sample.h index 79f62c5..41e1f2d 100644 --- a/D3D12ComputeAdd/D3D12Sample.h +++ b/D3D12ComputeAdd/D3D12Sample.h @@ -94,8 +94,8 @@ class D3D12Sample UINT m_workGroupSizeX; UINT m_componentSize; UINT m_computeCount = 2000; - std::vector<float> buf1Data; - std::vector<float> buf2Data; + std::vector<int> buf1Data; + std::vector<int> buf2Data; void GetHardwareAdapter(IDXGIFactory2* pFactory, IDXGIAdapter1** ppAdapter); void CreateDevice(const ComPtr& factory); diff --git a/D3D12ComputeAdd/SLM_4X4_16X16.hlsl b/D3D12ComputeAdd/SLM_4X4_16X16.hlsl index 4649844..aaf017d 100644 --- a/D3D12ComputeAdd/SLM_4X4_16X16.hlsl +++ b/D3D12ComputeAdd/SLM_4X4_16X16.hlsl @@ -74,23 +74,59 @@ void mm_write(int index, float4 value) { dst.Store4(4 * (index * 4), asuint(value)); } #else -float mm_readA(int index) { - float result = asfloat(src0.Load(4 * index)); +#ifdef USE_INT +int mm_readA(int index) { + int result = asint(src0.Load(4 * index)); return result; } -float mm_readB(int index) { - float result = asfloat(src1.Load(4 * index)); +int mm_readB(int index) { + int result = asint(src1.Load(4 * index)); return result; } -void mm_write(int index, float value) { +void mm_write(int index, int value) { dst.Store(4 * index, asuint(value)); } + +int tint_div(int lhs, int rhs) { + return (lhs / (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 
1 : rhs)); +} +int tint_mod(int lhs, int rhs) { + const int rhs_or_one = (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 1 : rhs); + if (any(((uint((lhs | rhs_or_one)) & 2147483648u) != 0u))) { + return (lhs - ((lhs / rhs_or_one) * rhs_or_one)); + } + else { + return (lhs % rhs_or_one); + } +} +#else +uint mm_readA(int index) { + uint result = asuint(src0.Load(4 * index)); + return result; +} + +uint mm_readB(int index) { + uint result = asuint(src1.Load(4 * index)); + return result; +} + +void mm_write(int index, uint value) { + dst.Store(4 * index, asuint(value)); +} +uint tint_div(uint lhs, uint rhs) { + return (lhs / ((rhs == 0u) ? 1u : rhs)); +} + +uint tint_mod(uint lhs, uint rhs) { + return (lhs % ((rhs == 0u) ? 1u : rhs)); +} +#endif // USE_INT #endif // USE_VEC4 #endif // USE_STRUCTURED_BUFFERS -[numthreads(128, 1, 1)] +[numthreads(64, 1, 1)] void main(CS_INPUT input) { initGLBuiltins(input); @@ -98,7 +134,27 @@ void main(CS_INPUT input) #ifdef USE_VEC4 float4 result = mm_readA(index) + mm_readB(index); #else - float result = mm_readA(index) + mm_readB(index); +#ifdef USE_INT + const int a = mm_readA(index); + const int b = mm_readB(index); + int c = 0; + { + for (int i = 1; (i < 200); i = (i + 1)) { + c = (c + (tint_div(a, i) + tint_mod(a, i))); + c = (c + (tint_div(b, i) + tint_mod(b, i))); + } + } +#else + const uint a = mm_readA(index); + const uint b = mm_readB(index); + uint c = 0u; + { + for (uint i = 1u; (i < 200u); i = (i + 1u)) { + c = (c + (tint_div(a, i) + tint_mod(a, i))); + c = (c + (tint_div(b, i) + tint_mod(b, i))); + } + } +#endif // USE_INT #endif // USE_VEC4 - mm_write(index, result); + mm_write(index, c); }