diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/CMakeLists.txt new file mode 100755 index 0000000000..cdba19e90b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/CMakeLists.txt @@ -0,0 +1,20 @@ +if(UNIX) + # Direct CMake to use dpcpp rather than the default C++ compiler/linker + set(CMAKE_CXX_COMPILER dpcpp) +else() # Windows + # Force CMake to use dpcpp rather than the default C++ compiler/linker + # (needed on Windows only) + include (CMakeForceCompiler) + CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP) + include (Platform/Windows-Clang) +endif() + +cmake_minimum_required (VERSION 3.4) + +project(ACInt CXX) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/License.txt new file mode 100755 index 0000000000..7c8b8a36c6 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/License.txt @@ -0,0 +1,23 @@ +Copyright Intel Corporation + +SPDX-License-Identifier: MIT +https://opensource.org/licenses/MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/README.md new file mode 100755 index 0000000000..7c6d24f051 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/README.md @@ -0,0 +1,249 @@ +# Using the Algorithmic C Integer Data-type 'ac_int' + +This FPGA tutorial demonstrates how to use the Algorithmic C (AC) Data-type `ac_int` and some best practices. + +***Documentation***: The [DPC++ FPGA Code Samples Guide](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of DPC++ for FPGA.
+The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) is the reference manual for targeting FPGAs through DPC++.
+The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA
Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX)
Intel® FPGA 3rd party / custom platforms with oneAPI support
*__Note__: Intel® FPGA PAC hardware is only compatible with Ubuntu 18.04* +| Software | Intel® oneAPI DPC++ Compiler
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Using the `ac_int` data-type for basic operations
Efficiently using the left shift operation
Setting and reading certain bits of an `ac_int` number +| Time to complete | 20 minutes + + + +## Purpose + +This FPGA tutorial shows how to use the `ac_int` type with some simple examples. + +This data-type can be used in place of native integer types to generate area efficient and optimized designs for the FPGA. For example, operations which do not utilize all of the bits the native integer types are good candidates for replacement with `ac_int` type. + + +### Simple Code Example + +An `ac_int` number can be defined as follows: +```cpp +ac_int a; +``` +Here W is the width and S is the sign of the number. Signed numbers use one of the W bits to store the sign information. + +To use this type in your code, you must include the following header: + +```cpp +#include +``` +Additionally, you must use the flag `-qactypes` in order to ensure that the headers are correctly included. + +For convenience, the following are predefined under the `ac_intN` namespace: +``` +ac_int are type defined as intN up to 63. +ac_int are type defined as uintN up to 63. +``` + +For example, a 14 bit signed `ac_int` can be defined by using +```cpp +ac_intN::int14 a; +``` + +### Understanding the Tutorial Design + +The tutorial consists of several functions, each of which contains a SYCL kernel that demonstrates a specific operation. The operations we will see are: +* Addition +* Division +* Multiplication +* Left shift +* Setting a bit of an `ac_int` number +* Reading a bit of an `ac_int` number + +#### Basic Operations and Promotion Rules + +When using `ac_int`, we can write Addition, Division, Multiplication operations to use precisely as many bits as are needed to store the results. This is demonstrated by the kernels `Add`, `Div` and `Mult`. + +`ac_int` automatically promotes the result of all operations to the number of bits needed to represent all possible results without overflowing. For example, the addition of two 8-bit integers results in a 9-bit result to support overflow. + +However, if the user attempts to store the result in an 8-bit container, `ac_int` will let the user do this, but this leads to the discard of the extra carry bit. The responsibility lies on the user to use the correct datatype. + +These promotions rules are consistent across all architectures so the behavior should be equivalent on x86 or on FPGA. + +#### Shift Operation + +The behavior of a shift operation with an `ac_int` is slightly different from its behavior with native integer types. For full details, see the `ac_int` documentation in the file `ac_data_types_ref.pdf`. Some key points to remember are as follows: + - If the datatype of the shift amount is not explicitly `unsigned` (either using `ac_int` or using the `unsigned` keyword), then the compiler will generate a more complex shifter that allows negative shifts and positive shifts. A right-shift by a negative amount is equivalent to a positive left-shift. + - Normally, you will not want to enable negative shifting, so you should use an `unsigned` datatype for the shift value to obtain a more resource efficient design. + - Shift values greater than the width of the data types are treated as a shift equal to the width of the datatype. + - The shift operation can be done more efficiently by specifying the amount to shift with the smallest possible `ac_int`. + +For example, in the tutorial, two kernels perform the left shift operation: `ShiftLeft` and `EfficientShiftLeft`. Both operate on an 14 bits wide `ac_int`. The former stores the shift amount in an `ac_int` which is 14 bits wide and the latter stores it in an `ac_int` which is 4 bits wide. The latter will generate simpler hardware. + +#### Bit Slice Operations + +The kernels `GetBitSlice` and `SetBitSlice` show how to read from and write to specific bits of an `ac_int` number. Note that only static bit widths are supported with such "slice" operations. + +For detailed documentation on the `set_slc` and `slc` APIs please see the file `ac_data_types_ref.pdf` + +## Key Concepts +* The `ac_int` data-type can be used to generate hardware for only as many bits as is needed by the operation as compared to native integer types which generate hardware for the entire type width. +* The left shift operation on `ac_int` can be implemented more efficiently when the amount to shift with is stored in a minimally sized ac_int``. +* The `ac_int` data-type offers functions for several useful operations including reading and writing of certain bits of an `ac_int` number. This can be very useful in creating bit masks. + +## License + +Code samples are licensed under the MIT license. See +[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details. + +## Building the `ac_int` Tutorial + +### Include Files + +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Install the design in `build` directory from the design directory by running `cmake`: + + ```bash + mkdir build + cd build + ``` + + If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + + ```bash + cmake .. + ``` + + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ```bash + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ```bash + cmake .. -DFPGA_BOARD=: + ``` + +2. Compile the design using the generated `Makefile`. The following four build targets are provided that match the recommended development flow: + + * Compile and run for emulation (fast compile time, targets emulates an FPGA device) using: + + ```bash + make fpga_emu + ``` + + * Generate HTML optimization reports using: + + ```bash + make report + ``` + + * Compile and run on FPGA hardware (longer compile time, targets an FPGA device) using: + + ```bash + make fpga + ``` + +3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded here. + +### On a Windows* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. + ``` + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ``` + cmake -G "NMake Makefiles" .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. -DFPGA_BOARD=: + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + nmake fpga_emu + ``` + * Generate the optimization report: + ``` + nmake report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + nmake fpga + ``` + +*Note:* The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support. + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). +For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports + +Locate the pair of `report.html` files in either: + +* **Report-only compile**: `ac_int_report.prj` +* **FPGA hardware compile**: `ac_int.prj` + +Navigate to the *System Viewer* report (*Views* > *System Viewer*) and step through the clusters generated for `ShiftLeft` by clicking on the cluster entires on the left hand side pane under `ShiftLeft` until you find the one that contains the left shift node (`<<`). Similarly locate the cluster containing the left shift node for `EfficientShiftLeft`. Observe that the compiler needs to generate extra logic to deal with the signedness of the b operand for the `ShiftLeft` kernel and hence generates more hardware than for the `EfficientShiftLeft` kernel. + +## Running the Sample + +1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + + ```bash + ./ac_int.fpga_emu # Linux + ac_int.fpga_emu.exe # Windows + ``` + +2. Run the sample on the FPGA device + + ```bash + ./ac_int.fpga # Linux + ``` + +### Example of Output + +```txt +Arithmetic Operations: +ac_int: +1383 + +966 = +2349 +int: 1383 + 966 = 2349 +ac_int: +6249 * +966 = +6036534 +int: 6249 * 966 = 6036534 +ac_int: +2163 / +43 = +50 +int: 2163 / 43 = 50 + +Bitwise Operations: +ac_int: +7423 << +2 = -3076 +int: 7423 << 2 = -3076 +ac_int: +6380 << 1 = -3624 +int: 6380 << 1 = -3624 +(+7373).slc<4>(5) = 6 +Running these two ops on +7373 + (+7373).set_slc(6, 10) = +7808 + a[3] = 0; a[2] = 0; a[1] = 0; a[0] = 0; + Result = +7808 +PASSED +``` + +### Discussion of Results + +`ac_int` can help minimize the generated hardware and achieve the same numerical result as standard integer types. This can be very useful when the logic does not need to utilize all of the bits provided by the standard integer type. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_datatypes_ref.pdf b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_datatypes_ref.pdf new file mode 100644 index 0000000000..898ec876d3 Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_datatypes_ref.pdf differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.sln new file mode 100755 index 0000000000..6ad4ad3928 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ac_int", "ac_int.vcxproj", "{73FCAD5C-4C93-4786-B662-A7273C515E22}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.ActiveCfg = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.Build.0 = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.ActiveCfg = Release|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {DE911CD1-4F98-4391-BD43-B02212357F5E} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.vcxproj new file mode 100755 index 0000000000..1e31ad82cd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/ac_int.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {73fcad5c-4c93-4786-b662-a7273c515e22} + Win32Proj + ac_int + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ac_int.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ac_int.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/sample.json new file mode 100755 index 0000000000..d49bf55b3b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/sample.json @@ -0,0 +1,55 @@ +{ + "guid": "394849F1-EDB6-4234-9276-B5A2A9DD6D5A", + "name": "AC Int", + "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Tutorials/Features"], + "description": "An Intel® FPGA tutorial demonstrating how to use the Algorithmic C Integer (AC Int) ", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./ac_int.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ..", + "nmake fpga_emu", + "ac_int.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ..", + "nmake report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/CMakeLists.txt new file mode 100755 index 0000000000..a2e41f93dd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/CMakeLists.txt @@ -0,0 +1,80 @@ +# To see a Makefile equivalent of this build system: +# https://github.com/oneapi-src/oneAPI-samples/blob/master/DirectProgramming/DPC++/ProjectTemplates/makefile-fpga + +set(SOURCE_FILE ac_int.cpp) +set(TARGET_NAME ac_int) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# FPGA board selection +if(NOT DEFINED FPGA_BOARD) + set(FPGA_BOARD "intel_a10gx_pac:pac_a10") + message(STATUS "FPGA_BOARD was not specified.\ + \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). \ + \nPlease refer to the README for information on board selection.") +else() + message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}") +endif() + +# These are Windows-specific flags: +# 1. /EHsc This is a Windows-specific flag that enables exception handling in host code +# 2. /Qactypes Include ac_types headers and link against ac_types emulation libraries +if(WIN32) + set(WIN_FLAG "/EHsc") + set(AC_TYPES_FLAG "/Qactypes") +else() + set(AC_TYPES_FLAG "-qactypes") +endif() + +# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. +# For this reason, FPGA backend flags must be passed as link flags in CMake. +set(EMULATOR_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -Wall") +set(EMULATOR_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG}") +set(HARDWARE_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -Wall") +set(HARDWARE_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# We do not need to supply the AC_TYPES_FLAG for the 'report' target's linking stage. +set(REPORT_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation + +############################################################################### +### FPGA Emulator +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR fpga_compile.cpp -o fpga_compile.fpga_emu +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -o fpga_compile.cpp.o -c fpga_compile.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} fpga_compile.cpp.o -o fpga_compile.fpga_emu +add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}") +set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}") +add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + +############################################################################### +### Generate Report +############################################################################### +# To compile manually: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= -fsycl-link=early ac_int.cpp -o ac_int_report.a +set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a) +# The compile output is not an executable, but an intermediate compilation result unique to DPC++. +add_executable(${FPGA_EARLY_IMAGE} ${SOURCE_FILE}) +add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE}) +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${REPORT_LINK_FLAGS} -fsycl-link=early") +# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus® + +############################################################################### +### FPGA Hardware +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ac_int.cpp -o ac_int.fpga +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -o ac_int.cpp.o -c ac_int.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ac_int.cpp.o -o ac_int.fpga +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) +add_custom_target(fpga DEPENDS ${FPGA_TARGET}) +set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_TARGET}") +# The -reuse-exe flag enables rapid recompilation of host-only code changes. +# See DPC++FPGA/GettingStarted/fast_recompile for details. \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/ac_int.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/ac_int.cpp new file mode 100644 index 0000000000..b7cbabbcd3 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ac_int/src/ac_int.cpp @@ -0,0 +1,339 @@ +// clang-format off +#include +#include +#include +// clang-format on + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; + +// Forward declare the kernel name in the global scope. +// This is a FPGA best practice that reduces name mangling in the optimization +// reports. +class Add; +class Div; +class Mult; +class ShiftLeft; +class EfficientShiftLeft; +class GetBitSlice; +class SetBitSlice; + +using ac_int4 = ac_intN::int4; +using ac_int14 = ac_intN::int14; +using ac_int15 = ac_intN::int15; +using ac_int28 = ac_intN::int28; +using ac_uint4 = ac_intN::uint4; + +void TestAdd(queue &q, const ac_int14 &a, const ac_int14 &b, ac_int15 &c) { + buffer inp1(&a, 1); + buffer inp2(&b, 1); + buffer result(&c, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task([=] { res[0] = x[0] + y[0]; }); + }); +} + +void TestDiv(queue &q, const ac_int14 &a, const ac_int14 &b, ac_int15 &c) { + buffer inp1(&a, 1); + buffer inp2(&b, 1); + buffer result(&c, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task
([=] { res[0] = x[0] / y[0]; }); + }); +} + +void TestMult(queue &q, const ac_int14 &a, const ac_int14 &b, ac_int28 &c) { + buffer inp1(&a, 1); + buffer inp2(&b, 1); + buffer result(&c, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task([=] { res[0] = x[0] * y[0]; }); + }); +} + +void TestShiftLeft(queue &q, const ac_int14 &a, const ac_int14 &b, + ac_int14 &c) { + buffer inp1(&a, 1); + buffer inp2(&b, 1); + buffer result(&c, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task([=] { res[0] = x[0] << y[0]; }); + }); +} + +// Note how the shift amount is specified with a smaller ac_int than in +// TestShiftLeft above +void TestEfficientShiftLeft(queue &q, const ac_int14 &a, const ac_uint4 &b, + ac_int14 &c) { + buffer inp1(&a, 1); + buffer inp2(&b, 1); + buffer result(&c, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task([=] { res[0] = x[0] << y[0]; }); + }); +} + +// The method +// x = y.slc(n) +// is equivalent to the VHDL behavior of +// x := y((M+n-1) downto n); +// Note that only static bit widths are supported +void TestGetBitSlice(queue &q, const ac_int14 &a, ac_uint4 &b, const int lsb) { + buffer inp1(&a, 1); + buffer inp2(&lsb, 1); + buffer result(&b, 1); + + q.submit([&](handler &h) { + accessor x{inp1, h, read_only}; + accessor y{inp2, h, read_only}; + accessor res{result, h, write_only, no_init}; + h.single_task([=] { res[0] = x[0].slc<4>(y[0]); }); + }); +} + +// There is a set_slc(int lsb, const ac_int &slc) which allows the user to +// set a bit slice as shown in the example below. +void TestSetBitSlice(queue &q, ac_int14 &a, const ac_int4 &b, const int lsb) { + buffer buff_a(&a, 1); + buffer buff_b(&b, 1); + buffer buff_lsb(&lsb, 1); + + q.submit([&](handler &h) { + accessor x{buff_a, h, write_only, no_init}; + accessor y{buff_b, h, read_only}; + accessor lsb_accessor{buff_lsb, h, read_only}; + + h.single_task([=] { + // The set_slc method does not need to have a width specified as a + // template argument since the width is inferred from the width of the + // argument x + x[0].set_slc(lsb_accessor[0], y[0]); + + // Bits can also be individually set as follows: + x[0][3] = 0; + x[0][2] = 0; + x[0][1] = 0; + x[0][0] = 0; + }); + }); +} + +int main() { +#if defined(FPGA_EMULATOR) + ext::intel::fpga_emulator_selector selector; +#else + ext::intel::fpga_selector selector; +#endif + bool passed = true; + + try { + // create the SYCL device queue + queue q(selector, dpc_common::exception_handler); + + // Use a fixed initial seed + srand(123); + + // Initialize two random ints + int t1 = rand(); + int t2 = rand(); + // Truncate each of the two ints to 13 bits. The 14th bit is the sign bit. + // In this testbench, even though the datatypes are signed, we will be + // storing unsigned values to make the testing process simpler. + t1 &= (1 << 13) - 1; + t2 &= (1 << 13) - 1; + + std::cout << "Arithmetic Operations:\n"; + // Test adder + { + // ac_int offers type casting from and to native datatypes + ac_int14 a = t1; + ac_int14 b = t2; + ac_int15 c; + TestAdd(q, a, b, c); + int c_golden = t1 + t2; + // We can check the result of the ac_int addition with the native C int + // addition + if (c != c_golden) { + passed = false; + std::cerr << "Addition failed\n"; + } + std::cout << "ac_int: " << a << " + " << b << " = " << c << "\n"; + std::cout << "int: " << t1 << " + " << t2 << " = " << c_golden << "\n"; + } + + // Test multiplier + { + t1 = rand() & ((1 << 13) - 1); + ac_int14 a = t1; + ac_int14 b = t2; + ac_int28 c; + TestMult(q, a, b, c); + int c_golden = t1 * t2; + // We can check the result of the ac_int multiplication with the native C + // int multiplication + if (c != c_golden) { + passed = false; + std::cerr << "Multiplier failed\n"; + } + std::cout << "ac_int: " << a << " * " << b << " = " << c << "\n"; + std::cout << "int: " << t1 << " * " << t2 << " = " << c_golden << "\n"; + } + + // Test divider + { + t1 = rand() & ((1 << 13) - 1); + t2 = rand() % 50; // Use a small value for the divisor so that the result + // is not 0 or 1 + ac_int14 a = t1; + ac_int14 b = t2; + ac_int15 c; + TestDiv(q, a, b, c); + int c_golden = t1 / t2; + // We can check the result of the ac_int division with the native C + // int division + if (c != c_golden) { + passed = false; + std::cerr << "divider failed\n"; + } + std::cout << "ac_int: " << a << " / " << b << " = " << c << "\n"; + std::cout << "int: " << t1 << " / " << t2 << " = " << c_golden << "\n"; + } + + std::cout << "\nBitwise Operations:\n"; + // Shift operator + { + t1 = rand() & ((1 << 13) - 1); + t2 = rand() % 8; // Use a small value for the shift + ac_int14 a = t1; + ac_int14 b = t2; + ac_int14 c; + // b can be positive or negative. If (b > 0), a will be shifted to the + // left. Else, a will be shifted to the right + TestShiftLeft(q, a, b, c); + + // Note that the left shift in ac_int is logical so to check the result, + // we need to do a little bit manipulation to get the correct signed value + int c_golden = (t1 << t2) & ((1 << 14) - 1); + if ((t1 << t2) & (1 << 13)) c_golden |= (~((1 << 14) - 1)); + if (c != c_golden) { + passed = false; + std::cerr << "left_shift failed\n"; + } + std::cout << "ac_int: " << a << " << " << b << " = " << c << "\n"; + std::cout << "int: " << t1 << " << " << t2 << " = " << c_golden + << "\n"; + } + + // Efficient left shift operator + { + t1 = rand() & ((1 << 13) - 1); + t2 = rand() % 14; // Use a small value for the shift + ac_int14 a = t1; + ac_uint4 b = t2; + ac_int14 c; + // b is always positive in this case. This shift is more efficient in HW + // than the one above. If the direction of the shift is known at compile + // time, this is recommended. Note that the two datatypes need not be the + // same. Note that the datatype of b here is just 4 bits since 4 bits can + // completely contain the value of the full width of a. This will generate + // a more efficient datapath. + TestEfficientShiftLeft(q, a, b, c); + + // Note that the left shift in ac_int is logical so to check the result, + // we need to do a little bit manipulation to get the correct signed value + int c_golden = (t1 << t2) & ((1 << 14) - 1); + if ((t1 << t2) & (1 << 13)) c_golden |= (~((1 << 14) - 1)); + if (c != c_golden) { + passed = false; + std::cerr << "efficient_left_shift failed\n"; + } + std::cout << "ac_int: " << a << " << " << b << " = " << c << "\n"; + std::cout << "int: " << t1 << " << " << t2 << " = " << c_golden + << "\n"; + } + + // Slice operations + { + t1 = rand() & ((1 << 13) - 1); + ac_int14 a = t1; + ac_uint4 b; + TestGetBitSlice(q, a, b, 5); + + // Replicate the same operation using bitwise operation + t2 = (t1 >> 5) & 0xF; + // Compare the CPU result with the FPGA result + if (b != t2) { + passed = false; + std::cerr << "GetBitSlice failed\n"; + } + std::cout << "(" << a << ").slc<4>(5) = " << b << "\n"; + + int t2 = 10; + ac_uint4 d = t2; + ac_int14 c = a; + // Sets the bits (3, 2, 1, 0) as 0 and sets bits (9, 8, 7, 6) with d + TestSetBitSlice(q, c, d, 6); + + // Replicate the same operation using bitwise operation + int mask = -1; + mask ^= (0xF << 6) | (0xF); + int mask2 = (t2 << 6); + int t3 = (t1 & mask) | mask2; + // Compare the CPU result with the FPGA result + if (c != t3) { + passed = false; + std::cerr << "SetBitSlice failed\n"; + } + std::cout << "Running these two ops on " << a << "\n"; + std::cout << "\t(" << a << ").set_slc(6, " << d << ") = " << c << "\n"; + std::cout << "\ta[3] = 0; a[2] = 0; a[1] = 0; a[0] = 0;\n"; + std::cout << "\tResult = " << c << "\n"; + } + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + if (passed) { + std::cout << "PASSED\n"; + } else { + std::cout << "FAILED\n"; + return 1; + } + + return 0; +}