From ba1b0f949a2b6e3b2832073e4e64b3793939883d Mon Sep 17 00:00:00 2001
From: Qin Jiajia <jiajia.qin@intel.com>
Date: Thu, 17 Aug 2023 10:30:33 +0800
Subject: [PATCH] Test int vs uint overhead for div/mod

---
 D3D12ComputeAdd/D3D12Sample.cpp    | 18 +++++---
 D3D12ComputeAdd/D3D12Sample.h      |  4 +-
 D3D12ComputeAdd/SLM_4X4_16X16.hlsl | 72 ++++++++++++++++++++++++++----
 3 files changed, 77 insertions(+), 17 deletions(-)
diff --git a/D3D12ComputeAdd/D3D12Sample.cpp b/D3D12ComputeAdd/D3D12Sample.cpp
index 95e8f60..74cbe54 100644
--- a/D3D12ComputeAdd/D3D12Sample.cpp
+++ b/D3D12ComputeAdd/D3D12Sample.cpp
@@ -16,6 +16,7 @@
 
 //#define USE_STRUCTURED_BUFFERS
 //#define USE_VEC4
+#define USE_INT  // Benchmark the signed-int div/mod shader path; comment out to use the uint path.
 #define PRINT_DATA
 
 namespace
@@ -43,7 +44,7 @@ D3D12Sample::D3D12Sample() :
     m_cbSrvDescriptorSize(0),
     m_constantBufferData{},
     m_dataSize(1024*1024),
-    m_workGroupSizeX(128),
+    m_workGroupSizeX(64),  // NOTE(review): presumably has to match [numthreads(64, 1, 1)] in SLM_4X4_16X16.hlsl - confirm
     m_componentSize(1)
 {
 #ifdef USE_VEC4
@@ -202,6 +203,9 @@ void D3D12Sample::LoadAssets()
 #endif
 #ifdef USE_VEC4
 		"USE_VEC4", "1",
+#endif
+#ifdef USE_INT
+        "USE_INT", "1",  // NOTE(review): presumably forwarded to the shader compiler so the HLSL sees USE_INT - confirm
 #endif
         nullptr, nullptr
     };
@@ -290,7 +294,7 @@ void D3D12Sample::LoadSizeDependentResources()
         const UINT elementCount = m_dataSize;
         for ( int i = 0; i < elementCount; ++i )
         {
-            buf1Data.push_back((float) rand() / float(RAND_MAX));
+            buf1Data.push_back(rand() % 200);  // small non-negative ints in [0, 200)
         }
-        const UINT bufferSize = buf1Data.size() * sizeof(float);
+        const UINT bufferSize = buf1Data.size() * sizeof(buf1Data[0]);  // follow the vector's element type (int now) instead of hard-coding float
 
@@ -342,7 +346,7 @@ void D3D12Sample::LoadSizeDependentResources()
         const UINT elementCount = m_dataSize;
         for ( int i = 0; i < elementCount; ++i )
         {
-            buf2Data.push_back((float) rand() / float(RAND_MAX));
+            buf2Data.push_back(rand() % 200);  // small non-negative ints in [0, 200)
         }
-        const UINT bufferSize = buf2Data.size() * sizeof(float);
+        const UINT bufferSize = buf2Data.size() * sizeof(buf2Data[0]);  // follow the vector's element type (int now) instead of hard-coding float
 
@@ -548,7 +552,7 @@ void D3D12Sample::RunCompute()
     float result = 0.0;
     int m = rand() % m_dataSize;
     D3D12_RANGE readbackBufferRange{ 0, outputBufferSize };
-    FLOAT * pReadbackBufferData{};
+    int * pReadbackBufferData{};  // read back as 32-bit integers: the shader's mm_write now stores int/uint values, not floats
     ThrowIfFailed(readbackBuffer->Map(
         0,
         &readbackBufferRange,
@@ -556,12 +560,21 @@
 	bool hasError = false;
 	for (int i = 0; i < m_dataSize; i++)
 	{
-		float gpuResult = pReadbackBufferData[i];
-		float cpuResult = buf1Data[i] + buf2Data[i];
+		int gpuResult = pReadbackBufferData[i];
+		// CPU reference for the shader's div/mod benchmark loop: the kernel no
+		// longer writes a + b, it accumulates a / d + a % d + b / d + b % d for
+		// every divisor d in [1, 200). Inputs are in [0, 200), so the guarded
+		// tint_div/tint_mod helpers reduce to plain / and % here.
+		int cpuResult = 0;
+		for (int divisor = 1; divisor < 200; ++divisor)
+		{
+			cpuResult += buf1Data[i] / divisor + buf1Data[i] % divisor;
+			cpuResult += buf2Data[i] / divisor + buf2Data[i] % divisor;
+		}
 		if (abs(gpuResult - cpuResult) > 0.003)
 		{
 			hasError = true;
-			printf("The result is not correct at %d. Expected %f, actual %f", i, cpuResult, gpuResult);
+			printf("The result is not correct at %d. Expected %d, actual %d", i, cpuResult, gpuResult);
 			break;
 		}
 	}
diff --git a/D3D12ComputeAdd/D3D12Sample.h b/D3D12ComputeAdd/D3D12Sample.h
index 79f62c5..41e1f2d 100644
--- a/D3D12ComputeAdd/D3D12Sample.h
+++ b/D3D12ComputeAdd/D3D12Sample.h
@@ -94,8 +94,8 @@ class D3D12Sample
 	UINT m_workGroupSizeX;
 	UINT m_componentSize;
 	UINT m_computeCount = 2000;
-	std::vector<float> buf1Data;
-	std::vector<float> buf2Data;
+	std::vector<int> buf1Data;  // host-side input values: filled in LoadSizeDependentResources, reused for the CPU check in RunCompute
+	std::vector<int> buf2Data;  // second input vector, same role as buf1Data
 
 	void GetHardwareAdapter(IDXGIFactory2* pFactory, IDXGIAdapter1** ppAdapter);
     void CreateDevice(const ComPtr<IDXGIFactory4>& factory);
diff --git a/D3D12ComputeAdd/SLM_4X4_16X16.hlsl b/D3D12ComputeAdd/SLM_4X4_16X16.hlsl
index 4649844..aaf017d 100644
--- a/D3D12ComputeAdd/SLM_4X4_16X16.hlsl
+++ b/D3D12ComputeAdd/SLM_4X4_16X16.hlsl
@@ -74,23 +74,59 @@ void mm_write(int index, float4 value) {
     dst.Store4(4 * (index * 4), asuint(value));
 }
 #else
-float mm_readA(int index) {
-    float result = asfloat(src0.Load(4 * index));
+#ifdef USE_INT  // signed 32-bit variant of the buffer accessors and div/mod helpers
+int mm_readA(int index) {  // element `index` of src0, bits reinterpreted as int
+    int result = asint(src0.Load(4 * index));  // 4 * index = byte offset of a 4-byte element
     return result;
 }
 
-float mm_readB(int index) {
-    float result = asfloat(src1.Load(4 * index));
+int mm_readB(int index) {  // element `index` of src1, bits reinterpreted as int
+    int result = asint(src1.Load(4 * index));  // 4 * index = byte offset of a 4-byte element
     return result;
 }
 
-void mm_write(int index, float value) {
+void mm_write(int index, int value) {  // stores the raw bits of `value` at element `index` of dst
     dst.Store(4 * index, asuint(value));
 }
+
+int tint_div(int lhs, int rhs) {  // signed division with the divisor guarded against the two problematic cases
+    return (lhs / (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 1 : rhs));  // rhs == 0 or INT_MIN / -1: divide by 1 instead, i.e. return lhs
+}
+int tint_mod(int lhs, int rhs) {  // signed remainder using the same divisor guard as tint_div
+    const int rhs_or_one = (((rhs == 0) | ((lhs == -2147483648) & (rhs == -1))) ? 1 : rhs);  // rhs == 0 or INT_MIN with -1: use 1, which makes the result 0
+    if (any(((uint((lhs | rhs_or_one)) & 2147483648u) != 0u))) {  // sign bit set in either operand, i.e. at least one is negative
+        return (lhs - ((lhs / rhs_or_one) * rhs_or_one));  // remainder rebuilt from the quotient; presumably avoids relying on % with negative operands
+    }
+    else {
+        return (lhs % rhs_or_one);  // both operands are non-negative here
+    }
+}
+#else  // !USE_INT: unsigned 32-bit variant of the accessors and div/mod helpers
+uint mm_readA(int index) {  // element `index` of src0 as uint
+    uint result = asuint(src0.Load(4 * index));  // 4 * index = byte offset of a 4-byte element
+    return result;
+}
+
+uint mm_readB(int index) {  // element `index` of src1 as uint
+    uint result = asuint(src1.Load(4 * index));  // 4 * index = byte offset of a 4-byte element
+    return result;
+}
+
+void mm_write(int index, uint value) {  // stores `value` at element `index` of dst
+    dst.Store(4 * index, asuint(value));
+}
+uint tint_div(uint lhs, uint rhs) {  // unsigned division; a zero divisor is replaced by 1, so the result is lhs
+    return (lhs / ((rhs == 0u) ? 1u : rhs));
+}
+
+uint tint_mod(uint lhs, uint rhs) {  // unsigned remainder; a zero divisor is replaced by 1, so the result is 0
+    return (lhs % ((rhs == 0u) ? 1u : rhs));
+}
+#endif  // USE_INT
 #endif  // USE_VEC4
 #endif  // USE_STRUCTURED_BUFFERS
 
-[numthreads(128, 1, 1)]
+[numthreads(64, 1, 1)]  // NOTE(review): presumably has to equal m_workGroupSizeX in D3D12Sample.cpp (also set to 64) - confirm
 void main(CS_INPUT input)
 {
     initGLBuiltins(input);
@@ -98,7 +134,27 @@ void main(CS_INPUT input)
 #ifdef USE_VEC4
 	float4 result = mm_readA(index) + mm_readB(index);
 #else
-	float result = mm_readA(index) + mm_readB(index);
+#ifdef USE_INT
+    const int a = mm_readA(index);
+    const int b = mm_readB(index);
+    int result = 0;  // named `result` in every branch so the shared mm_write below also compiles with USE_VEC4
+    {
+        for (int i = 1; (i < 200); i = (i + 1)) {  // divisors 1..199; the work being measured is the guarded div/mod
+            result = (result + (tint_div(a, i) + tint_mod(a, i)));
+            result = (result + (tint_div(b, i) + tint_mod(b, i)));
+        }
+    }
+#else
+    const uint a = mm_readA(index);
+    const uint b = mm_readB(index);
+    uint result = 0u;  // same name as the int and float4 branches
+    {
+        for (uint i = 1u; (i < 200u); i = (i + 1u)) {  // divisors 1..199, unsigned flavour of the same loop
+            result = (result + (tint_div(a, i) + tint_mod(a, i)));
+            result = (result + (tint_div(b, i) + tint_mod(b, i)));
+        }
+    }
+#endif  // USE_INT
 #endif  // USE_VEC4
 	mm_write(index, result);
 }