@psychocoderHPC
Last active October 24, 2025 14:08
alpaka3 webinar 2025
This file has been truncated.
#pragma once
// ============================================================================
// == ./include/alpaka/alpaka.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/CVec.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/Vec.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/cast.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/core/common.hpp ==
// ==
/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/core/config.hpp ==
// ==
/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Jeffrey Kelling,
* Bernhard Manfred Gruber, Jan Stephan, Mehmet Yusufoglu
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/core/PP.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
#define ALPAKA_PP_CAT(left, right) left##right
#define ALPAKA_PP_REMOVE_FIRST_COMMA_DO(ignore, ...) __VA_ARGS__
#define ALPAKA_PP_REMOVE_FIRST_COMMA(...) ALPAKA_PP_REMOVE_FIRST_COMMA_DO(0 __VA_ARGS__)
/** solution from https://stackoverflow.com/a/62984543
* @{
*/
#define ALPAKA_PP_REMOVE_BRACKETS_DO(X) ALPAKAESC(ISHALPAKA X)
#define ISHALPAKA(...) ISHALPAKA __VA_ARGS__
#define ALPAKAESC(...) ALPAKAESC_(__VA_ARGS__)
#define ALPAKAESC_(...) VAN##__VA_ARGS__
#define VANISHALPAKA
/** @} */
#define ALPAKA_PP_REMOVE_BRACKETS(x) ALPAKA_PP_REMOVE_BRACKETS_DO(x)
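// For illustration, how the two helpers above expand (a minimal sketch):
//   ALPAKA_PP_REMOVE_BRACKETS((int, float))   // -> int, float
//   ALPAKA_PP_REMOVE_FIRST_COMMA(, a, b)      // -> a, b   (drops a leading comma produced by other expansions)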
/* version number encoding
* 4 digits for major version (max 9999)
* 3 digits for minor version (max 999)
* 5 digits for patch version (max 99999)
* example: version 1.2.3 -> 0001 002 00003
*/
#define ALPAKA_VERSION_NUMBER(major, minor, patch) \
((((major) % 10000llu) * 100'000'000llu) + (((minor) % 1000llu) * 100000llu) + ((patch) % 100000llu))
#define ALPAKA_VERSION_NUMBER_NOT_AVAILABLE ALPAKA_VERSION_NUMBER(0llu, 0llu, 0llu)
#define ALPAKA_VERSION_NUMBER_UNKNOWN ALPAKA_VERSION_NUMBER(9999llu, 999llu, 99999llu)
// version number conversion from vendor format to ALPAKA_VERSION_NUMBER
#define ALPAKA_YYYYMMDD_TO_VERSION(V) ALPAKA_VERSION_NUMBER(((V) / 10000llu), ((V) / 100llu) % 100llu, (V) % 100llu)
#define ALPAKA_YYYYMM_TO_VERSION(V) ALPAKA_VERSION_NUMBER(((V) / 100llu) % 10000llu, (V) % 100llu, 0llu)
#define ALPAKA_VVRRP_TO_VERSION(V) \
ALPAKA_VERSION_NUMBER(((V) / 1000llu) % 10000llu, ((V) / 10llu) % 100llu, (V) % 10llu)
#define ALPAKA_VRP_TO_VERSION(V) ALPAKA_VERSION_NUMBER(((V) / 100llu) % 10000llu, ((V) / 10llu) % 10llu, (V) % 10llu)
#define ALPAKA_VRRPP_TO_VERSION(V) \
ALPAKA_VERSION_NUMBER(((V) / 10000llu) % 10000llu, ((V) / 100llu) % 100llu, (V) % 100llu)
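// A few concrete encodings for illustration; these follow directly from the macros above.
static_assert(ALPAKA_VERSION_NUMBER(1llu, 2llu, 3llu) == 100'200'003llu);
static_assert(ALPAKA_VRP_TO_VERSION(750llu) == ALPAKA_VERSION_NUMBER(7llu, 5llu, 0llu)); // sm_75
static_assert(ALPAKA_VRRPP_TO_VERSION(90010llu) == ALPAKA_VERSION_NUMBER(9llu, 0llu, 10llu)); // gfx90a
static_assert(ALPAKA_YYYYMM_TO_VERSION(202011llu) == ALPAKA_VERSION_NUMBER(2020llu, 11llu, 0llu)); // OpenMP 5.1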
// ==
// == ./include/alpaka/core/PP.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/hipConfig.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// We cannot use ALPAKA_LANG_HIP because this file is required by core/config.hpp where ALPAKA_LANG_HIP is defined.
#if defined(__HIP__)
# include <hip/hip_version.h>
// version numbers are only defined on the device side
# if !defined(ALPAKA_AMDGPU_ARCH) && defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__) \
&& __HIP_DEVICE_COMPILE__ == 1
/* Map AMDGPU arch macro -> ALPAKA_VRRPP_TO_VERSION(wrapped code)
* Rules:
* - gfx9xy (numeric): 9xy -> 90x0y (e.g., 908->90008, 906->90006, 942->90402)
* - gfx10xy / gfx11xy: stxy -> st0x0y (e.g., 1036->100306, 1103->110003)
* - Suffix: a == 10 (90a->90010), b == 11, c == 12
*
* An overview of AMD GPU architectures can be found here:
* https://llvm.org/docs/AMDGPUUsage.html#processors
*/
# if defined(__gfx1153__)
/* RDNA 3.5 iGPU (Medusa Point / Strix Halo successor) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110503)
# elif defined(__gfx1152__)
/* RDNA 3.5 iGPU (Krackan Point) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110502)
# elif defined(__gfx1151__)
/* RDNA 3.5 iGPU (Strix Halo) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110501)
# elif defined(__gfx1150__)
/* RDNA 3.5 iGPU (Radeon 890M on Strix Point) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110500)
# elif defined(__gfx1103__)
/* RDNA 3 APU (Radeon 780M, 760M, ROG Ally Extreme) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110003)
# elif defined(__gfx1102__)
/* RDNA 3 Desktop (RX 7600 / 7600 XT) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110002)
# elif defined(__gfx1101__)
/* RDNA 3 Desktop (RX 7700 / 7700 XT, Pro W7700 / V710) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110001)
# elif defined(__gfx1100__)
/* RDNA 3 Desktop (RX 7900 XT, XTX, Pro W7900) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110000)
# elif defined(__gfx1036__)
/* RDNA 2 APU (Radeon Graphics 128-SP iGPU) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100306)
# elif defined(__gfx1035__)
/* RDNA 2 APU (Radeon 660M, 680M) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100305)
# elif defined(__gfx1034__)
/* RDNA 2 Mobile (Pro W6300/W6400, RX 6400-6500) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100304)
# elif defined(__gfx1033__)
/* RDNA 2 APU (Steam Deck) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100303)
# elif defined(__gfx1032__)
/* RDNA 2 Desktop (RX 6600 XT, 6650 XT/S, 6700S) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100302)
# elif defined(__gfx1031__)
/* RDNA 2 Desktop (RX 6700 series, 6750/6850M XT) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100301)
# elif defined(__gfx1030__)
/* RDNA 2 Desktop (RX 6800 / 6900 XT, Pro W6800) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100300)
# elif defined(__gfx1013__)
/* RDNA 1 Mobile (RX 5300M / 5500M) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100103)
# elif defined(__gfx1012__)
/* RDNA 1 Desktop (RX 5500 / 5500 XT) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100102)
# elif defined(__gfx1011__)
/* RDNA 1 Desktop (Pro V520) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100101)
# elif defined(__gfx1010__)
/* RDNA 1 Desktop (RX 5700 / 5700 XT, Pro 5600 XT/M) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100100)
# elif defined(__gfx942__)
/* CDNA 3 (Instinct MI300 series: MI300/MI300A/MI300X) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90402)
# elif defined(__gfx941__)
/* CDNA 3 (early Instinct MI300 variant) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90401)
# elif defined(__gfx940__)
/* CDNA 3 (early Instinct MI300 variant) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90400)
# elif defined(__gfx90c__)
/* Vega-based APU graphics (Renoir / Cezanne), c -> 12 */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90012)
# elif defined(__gfx90b__)
/* (If present) b -> 11 */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90011)
# elif defined(__gfx90a__)
/* CDNA 2 (Instinct MI250 / MI250X), a -> 10 */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90010)
# elif defined(__gfx908__)
/* CDNA 1 (Instinct MI100) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90008)
# elif defined(__gfx906__)
/* Vega 20 (Radeon VII, Instinct MI50/60) */
# define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90006)
# else
# warning \
"Unknown AMDGPU architecture, please define __gfxXXX__ macro for your target. Until alpaka is updated you can define the macro ALPAKA_AMDGPU_ARCH to avoid this warning."
# define ALPAKA_AMDGPU_ARCH ALPAKA_VERSION_NUMBER_UNKNOWN
# endif
# endif
#endif
// ==
// == ./include/alpaka/core/hipConfig.hpp ==
// ============================================================================
// guard cmake target alpaka
#if defined(ALPAKA_CMAKE_TARGET_ALPAKA) && !defined(ALPAKA_CMAKE_TARGET_ALPAKA_FINALIZE_CALLED)
# error "After adding the cmake target alpaka or alpaka::alpaka you should call 'alpaka_finalize(targetName)'"
#endif
// guard cmake target alpaka::headers
#if defined(ALPAKA_CMAKE_TARGET_HEADERS) && !defined(ALPAKA_CMAKE_TARGET_HEADERS_FINALIZE_CALLED)
# error "After adding the cmake target alpaka::headers you should call 'alpaka_finalize(targetName)'"
#endif
// guard cmake target alpaka::cuda
#if defined(ALPAKA_CMAKE_TARGET_CUDA) && !defined(ALPAKA_CMAKE_TARGET_CUDA_FINALIZE_CALLED)
# error "After adding the cmake target alpaka::cuda you should call 'alpaka_finalize(targetName)'"
#endif
// guard cmake target alpaka::hip
#if defined(ALPAKA_CMAKE_TARGET_HIP) && !defined(ALPAKA_CMAKE_TARGET_HIP_FINALIZE_CALLED)
# error "After adding the cmake target alpaka::hip you should call 'alpaka_finalize(targetName)'"
#endif
// guard cmake target alpaka::oneapi
#if defined(ALPAKA_CMAKE_TARGET_ONEAPI) && !defined(ALPAKA_CMAKE_TARGET_ONEAPI_FINALIZE_CALLED)
# error "After adding the cmake target alpaka::oneapi you should call 'alpaka_finalize(targetName)'"
#endif
// guard cmake target alpaka::host
#if defined(ALPAKA_CMAKE_TARGET_HOST) && !defined(ALPAKA_CMAKE_TARGET_HOST_FINALIZE_CALLED)
# error "After adding the cmake target alpaka::host you should call 'alpaka_finalize(targetName)'"
#endif
#ifdef __INTEL_COMPILER
# warning \
"The Intel Classic compiler (icpc) is no longer supported. Please upgrade to the Intel LLVM compiler (ipcx)."
#endif
// ######## detect operating systems ########
// WINDOWS
#if !defined(ALPAKA_OS_WINDOWS)
# if defined(_WIN64) || defined(__MINGW64__)
# define ALPAKA_OS_WINDOWS 1
# else
# define ALPAKA_OS_WINDOWS 0
# endif
#endif
// Linux
#if !defined(ALPAKA_OS_LINUX)
# if defined(__linux) || defined(__linux__) || defined(__gnu_linux__)
# define ALPAKA_OS_LINUX 1
# else
# define ALPAKA_OS_LINUX 0
# endif
#endif
// Apple
#if !defined(ALPAKA_OS_IOS)
# if defined(__APPLE__)
# define ALPAKA_OS_IOS 1
# else
# define ALPAKA_OS_IOS 0
# endif
#endif
// Cygwin
#if !defined(ALPAKA_OS_CYGWIN)
# if defined(__CYGWIN__)
# define ALPAKA_OS_CYGWIN 1
# else
# define ALPAKA_OS_CYGWIN 0
# endif
#endif
// ### architectures
// X86
#if !defined(ALPAKA_ARCH_X86)
# if defined(__x86_64__) || defined(_M_X64)
# define ALPAKA_ARCH_X86 1
# else
# define ALPAKA_ARCH_X86 0
# endif
#endif
// RISCV
#if !defined(ALPAKA_ARCH_RISCV)
# if defined(__riscv)
# define ALPAKA_ARCH_RISCV 1
# else
# define ALPAKA_ARCH_RISCV 0
# endif
#endif
// ARM
#if !defined(ALPAKA_ARCH_ARM)
# if defined(__ARM_ARCH) || defined(__arm__) || defined(__arm64)
# define ALPAKA_ARCH_ARM 1
# else
# define ALPAKA_ARCH_ARM 0
# endif
#endif
/** NVIDIA device compile
*
* The version on the host side will always be ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
*
* Rules:
* - sm75 -> ALPAKA_VERSION_NUMBER(7,5,0)
* - sm91 -> ALPAKA_VERSION_NUMBER(9,1,0)
*/
#if !defined(ALPAKA_ARCH_PTX)
# if defined(__CUDA_ARCH__)
# define ALPAKA_ARCH_PTX ALPAKA_VRP_TO_VERSION(__CUDA_ARCH__)
# else
# define ALPAKA_ARCH_PTX ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
/** HIP device compile
*
* The version on the host side will always be ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
* On the device side unknown version will be set to ALPAKA_VERSION_NUMBER_UNKNOWN.
*
* Rules:
* - the last two digits will be handled as HEX values and support 0-9 and a-f
* - gfx9xy (numeric): 9xy -> ALPAKA_VERSION_NUMBER(9,x,y)
* - gfx10xy / gfx11xy: stxy -> ALPAKA_VERSION_NUMBER(st,x,y)
* - Suffix: a == 10, b == 11, c == 12
* - gfx90a -> ALPAKA_VERSION_NUMBER(9,0,10)
* - gfx90c -> ALPAKA_VERSION_NUMBER(9,0,12)
*/
#if !defined(ALPAKA_ARCH_AMD)
# if defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1
# define ALPAKA_ARCH_AMD ALPAKA_AMDGPU_ARCH
# else
# define ALPAKA_ARCH_AMD ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// ######## compiler ########
// HIP compiler detection
#if !defined(ALPAKA_COMP_HIP)
# if defined(__HIP__) // Defined by hip-clang and vanilla clang in HIP mode.
# include <hip/hip_version.h>
// HIP doesn't give us a patch level for the last entry, just a gitdate
# define ALPAKA_COMP_HIP ALPAKA_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
# else
# define ALPAKA_COMP_HIP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// nvcc compiler
#if defined(__NVCC__)
# define ALPAKA_COMP_NVCC ALPAKA_VERSION_NUMBER(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, __CUDACC_VER_BUILD__)
#else
# define ALPAKA_COMP_NVCC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// clang compiler
#if defined(__clang__)
# define ALPAKA_COMP_CLANG ALPAKA_VERSION_NUMBER(__clang_major__, __clang_minor__, __clang_patchlevel__)
#else
# define ALPAKA_COMP_CLANG ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// MSVC compiler
#if defined(_MSC_VER)
# define ALPAKA_COMP_MSVC \
ALPAKA_VERSION_NUMBER((_MSC_FULL_VER) / 10'000'000, ((_MSC_FULL_VER) / 100000) % 100, (_MSC_FULL_VER) % 100000)
#else
# define ALPAKA_COMP_MSVC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// gnu compiler (excluding compilers which emulates gnu compiler like clang)
#if defined(__GNUC__) && !defined(__clang__)
# if defined(__GNUC_PATCHLEVEL__)
# define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
# else
# define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, 0)
# endif
#else
# define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// IBM compiler
// only clang based is supported
#if defined(__ibmxl__)
# define ALPAKA_COMP_IBM ALPAKA_VERSION_NUMBER(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
#else
# define ALPAKA_COMP_IBM ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// clang CUDA compiler detection
// Currently __CUDA__ is only defined by clang when compiling CUDA code.
#if defined(__clang__) && defined(__CUDA__)
# define ALPAKA_COMP_CLANG_CUDA ALPAKA_VERSION_NUMBER(__clang_major__, __clang_minor__, __clang_patchlevel__)
#else
# define ALPAKA_COMP_CLANG_CUDA ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// PGI and NV HPC SDK compiler detection
#if defined(__PGI)
# define ALPAKA_COMP_PGI ALPAKA_VERSION_NUMBER(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
#else
# define ALPAKA_COMP_PGI ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
#endif
// Intel LLVM compiler detection
#if !defined(ALPAKA_COMP_ICPX)
# if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0).
# define ALPAKA_COMP_ICPX ALPAKA_YYYYMMDD_TO_VERSION(__INTEL_LLVM_COMPILER)
# else
# define ALPAKA_COMP_ICPX ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// ######## C++ language ########
//---------------------------------------HIP-----------------------------------
// __HIP__ is defined by both hip-clang and vanilla clang in HIP mode.
// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#compiler-defines-summary
#if !defined(ALPAKA_LANG_HIP)
# if defined(__HIP__)
# include <hip/hip_version.h>
// HIP doesn't give us a patch level for the last entry, just a gitdate
# define ALPAKA_LANG_HIP ALPAKA_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
# else
# define ALPAKA_LANG_HIP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// CUDA
#if !defined(ALPAKA_LANG_CUDA)
# if defined(__CUDACC__) || defined(__CUDA__)
# include <cuda.h>
// CUDA doesn't give us a patch level for the last entry, just zero.
# define ALPAKA_LANG_CUDA ALPAKA_VVRRP_TO_VERSION(CUDART_VERSION)
# else
# define ALPAKA_LANG_CUDA ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// Intel OneAPI Sycl GPU
#if !defined(ALPAKA_LANG_SYCL)
# if defined(SYCL_LANGUAGE_VERSION)
# define ALPAKA_LANG_SYCL ALPAKA_YYYYMMDD_TO_VERSION(SYCL_LANGUAGE_VERSION)
# else
# define ALPAKA_LANG_SYCL ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
# if (ALPAKA_COMP_ICPX)
// oneAPI must be detected via the ICPX compiler, see
// https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2023-2/use-predefined-macros-to-specify-intel-compilers.html
# define ALPAKA_LANG_ONEAPI ALPAKA_COMP_ICPX
# endif
#endif
// OpenMP
#if !defined(ALPAKA_OMP)
# if defined(_OPENMP)
# include <omp.h>
# endif
# if defined(_OPENMP)
# define ALPAKA_OMP ALPAKA_YYYYMM_TO_VERSION(_OPENMP)
# else
# define ALPAKA_OMP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// oneTBB
// Use __has_include to detect the oneTBB version if available; there is no predefined macro like OpenMP's _OPENMP.
// When the header is available we define ALPAKA_TBB to the real version, otherwise it falls back to
// ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
#if !defined(ALPAKA_TBB)
# // TBB does not provide a macro we can check, therefore we need to load the headers first to set ALPAKA_TBB
# if defined(__has_include)
# // alpaka assumes if the TBB headers can be found, TBB can be activated for usage.
# // If CMake is not used e.g. in compiler explorers or other build engines, the macro ALPAKA_DISABLE_TBB
# // must be set if the TBB headers are available but linker flags for TBB are not passed.
# // This is especially relevant if icpx is used, since oneAPI usually ships TBB directly.
# if __has_include(<oneapi/tbb/version.h>) && !defined(ALPAKA_DISABLE_TBB)
# include <oneapi/tbb/version.h>
# endif
# endif
# // TBB headers define TBB_VERSION_* when present; otherwise we fall back to NOT_AVAILABLE.
# if defined(TBB_VERSION_MAJOR)
# if defined(TBB_VERSION_PATCH)
# define ALPAKA_TBB ALPAKA_VERSION_NUMBER(TBB_VERSION_MAJOR, TBB_VERSION_MINOR, TBB_VERSION_PATCH)
# else
# define ALPAKA_TBB ALPAKA_VERSION_NUMBER(TBB_VERSION_MAJOR, TBB_VERSION_MINOR, 0)
# endif
# else
# define ALPAKA_TBB ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
# endif
#endif
// ==
// == ./include/alpaka/core/config.hpp ==
// ============================================================================
#include <type_traits>
// Boost.Uuid errors with VS2017 when intrin.h is not included
#if defined(_MSC_VER) && _MSC_VER >= 1910
# include <intrin.h>
#endif
#if ALPAKA_LANG_HIP
// HIP defines some keywords like __forceinline__ in header files.
# include <hip/hip_runtime.h>
#endif
//! All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
//!
//! \code{.cpp}
//! Usage:
//! ALPAKA_FN_ACC
//! auto add(std::int32_t a, std::int32_t b)
//! -> std::int32_t;
//! \endcode
//! @{
#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
# define ALPAKA_FN_ACC __device__ __host__
# define ALPAKA_FN_HOST_ACC __device__ __host__
# define ALPAKA_FN_HOST __host__
#else
# define ALPAKA_FN_ACC
# define ALPAKA_FN_HOST_ACC
# define ALPAKA_FN_HOST
#endif
//! @}
//! All functions marked with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC that are exported to / imported from different
//! translation units have to be attributed with ALPAKA_FN_EXTERN. Note that this needs to be applied to both the
//! declaration and the definition.
//!
//! Usage:
//! ALPAKA_FN_ACC ALPAKA_FN_EXTERN auto add(std::int32_t a, std::int32_t b) -> std::int32_t;
//!
//! Warning: If this is used together with the SYCL back-end make sure that your SYCL runtime supports generic
//! address spaces. Otherwise it is forbidden to use pointers as parameter or return type for functions marked
//! with ALPAKA_FN_EXTERN.
#if ALPAKA_LANG_SYCL
/*
This is required by the SYCL standard, section 5.10.1 "SYCL functions and member functions linkage":
The default behavior in SYCL applications is that all the definitions and declarations of the functions and member
functions are available to the SYCL compiler, in the same translation unit. When this is not the case, all the
symbols that need to be exported to a SYCL library or from a C++ library to a SYCL application need to be defined
using the macro: SYCL_EXTERNAL.
*/
# define ALPAKA_FN_EXTERN SYCL_EXTERNAL
#else
# define ALPAKA_FN_EXTERN
#endif
//! Disable nvcc warning:
//! 'calling a __host__ function from __host__ __device__ function.'
//! Usage:
//! ALPAKA_NO_HOST_ACC_WARNING
//! ALPAKA_FN_HOST_ACC function_declaration()
//! WARNING: Only use this method if there is no other way.
//! Most cases can be solved by #if ALPAKA_ARCH_PTX or #if ALPAKA_LANG_CUDA.
#if (ALPAKA_LANG_CUDA && !ALPAKA_COMP_CLANG_CUDA)
# if ALPAKA_COMP_MSVC
# define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
# else
# define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
# endif
#else
# define ALPAKA_NO_HOST_ACC_WARNING
#endif
//! Macro defining the inline function attribute.
//!
//! The macro should stay on the left hand side of keywords, e.g. 'static', 'constexpr', 'explicit' or the return type.
#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
# define ALPAKA_FN_INLINE __forceinline__
#elif ALPAKA_COMP_MSVC
// TODO: With C++20 [[msvc::forceinline]] can be used.
# define ALPAKA_FN_INLINE __forceinline
#else
// For gcc, clang, and clang-based compilers like Intel icpx
# define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
#endif
//! This macro defines a variable lying in global accelerator device memory.
//!
//! Example:
//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> variable;
//!
//! Those variables behave like ordinary variables when used in file-scope,
//! but inside kernels the get() method must be used to access the variable.
//! They are declared inline to resolve to a single instance across multiple
//! translation units.
//! Like ordinary variables, only one definition is allowed (ODR);
//! violating this might lead to linker errors.
//!
//! In contrast to ordinary variables, you cannot define such variables
//! as static compilation-unit-local variables with internal linkage
//! because this is forbidden by CUDA.
//!
//! \attention It is not allowed to initialize the variable together with the declaration.
//! To initialize the variable alpaka::memcpy must be used.
//! \code{.cpp}
//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> foo;
//!
//! struct DeviceMemoryKernel
//! {
//! ALPAKA_NO_HOST_ACC_WARNING
//! template<typename TAcc>
//! ALPAKA_FN_ACC void operator()(TAcc const& acc) const
//! {
//! auto a = foo<TAcc>.get();
//! }
//! }
//!
//! void initFoo() {
//! auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
//! int initialValue = 42;
//! alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
//! alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
//! }
//! \endcode
#if ( \
(ALPAKA_LANG_CUDA && ALPAKA_COMP_CLANG_CUDA) || (ALPAKA_LANG_CUDA && ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX) \
|| ALPAKA_LANG_HIP)
# if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
# define ALPAKA_STATIC_ACC_MEM_GLOBAL \
template<typename TAcc> \
__device__ inline
# else
# define ALPAKA_STATIC_ACC_MEM_GLOBAL \
template<typename TAcc> \
__device__ static
# endif
#else
# define ALPAKA_STATIC_ACC_MEM_GLOBAL \
template<typename TAcc> \
inline
#endif
//! This macro defines a variable lying in constant accelerator device memory.
//!
//! Example:
//! ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> variable;
//!
//! Those variables behave like ordinary variables when used in file-scope,
//! but inside kernels the get() method must be used to access the variable.
//! They are declared inline to resolve to a single instance across multiple
//! translation units.
//! Like ordinary variables, only one definition is allowed (ODR);
//! violating this might lead to linker errors.
//!
//! In contrast to ordinary variables, you cannot define such variables
//! as static compilation-unit-local variables with internal linkage
//! because this is forbidden by CUDA.
//!
//! \attention It is not allowed to initialize the variable together with the declaration.
//! To initialize the variable alpaka::memcpy must be used.
//! \code{.cpp}
//! ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> foo;
//!
//! struct DeviceMemoryKernel
//! {
//! ALPAKA_NO_HOST_ACC_WARNING
//! template<typename TAcc>
//! ALPAKA_FN_ACC void operator()(TAcc const& acc) const
//! {
//! auto a = foo<TAcc>.get();
//! }
//! }
//!
//! void initFoo() {
//! auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
//! int initialValue = 42;
//! alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
//! alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
//! }
//! \endcode
#if ( \
(ALPAKA_LANG_CUDA && ALPAKA_COMP_CLANG_CUDA) || (ALPAKA_LANG_CUDA && ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX) \
|| ALPAKA_LANG_HIP)
# if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
# define ALPAKA_STATIC_ACC_MEM_CONSTANT \
template<typename TAcc> \
__constant__ inline
# else
# define ALPAKA_STATIC_ACC_MEM_CONSTANT \
template<typename TAcc> \
__constant__ static
# endif
#else
# define ALPAKA_STATIC_ACC_MEM_CONSTANT \
template<typename TAcc> \
inline
#endif
//! This macro disables memory optimizations for annotated device memory.
//!
//! Example:
//! ALPAKA_DEVICE_VOLATILE float* ptr;
//!
//! This is useful for pointers, (shared) variables and shared memory which are used in combination with
//! the alpaka::mem_fence() function. It ensures that memory annotated with this macro will always be written directly
//! to memory (and not to a register or cache because of compiler optimizations).
#if (ALPAKA_LANG_CUDA && ALPAKA_ARCH_PTX) \
|| (ALPAKA_LANG_HIP && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1)
# define ALPAKA_DEVICE_VOLATILE volatile
#else
# define ALPAKA_DEVICE_VOLATILE
#endif
#define ALPAKA_FORWARD(instance) std::forward<decltype(instance)>(instance)
#define ALPAKA_TYPEOF(...) std::decay_t<decltype(__VA_ARGS__)>
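// For illustration (a minimal sketch; consume() and vec are hypothetical placeholders):
//   auto relay = [](auto&& v) { return consume(ALPAKA_FORWARD(v)); }; // perfect forwarding without spelling the type
//   using Elem = ALPAKA_TYPEOF(vec[0]);                               // decayed type of an arbitrary expression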
// ==
// == ./include/alpaka/core/common.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/trait.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/utility.hpp ==
// ==
/* Copyright 2024 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
#include <algorithm>
#include <bit>
#include <climits>
#include <concepts>
#include <type_traits>
#include <utility>
namespace alpaka
{
namespace core
{
//! convert any type to a reference type
//
// This function is equivalent to std::declval() but can be used
// within an alpaka accelerator kernel too.
// This function can be used only within decltype().
#if ALPAKA_LANG_CUDA && ALPAKA_COMP_CLANG_CUDA || ALPAKA_COMP_HIP
template<class T>
ALPAKA_FN_HOST_ACC std::add_rvalue_reference_t<T> declval();
#else
using std::declval;
#endif
} // namespace core
/// Returns the ceiling of a / b, as integer.
template<std::integral Integral>
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
{
return (a + b - Integral{1}) / b;
}
/// Returns the max(a / b, 1) as integer.
template<std::integral Integral>
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
{
return std::max(a / b, Integral{1});
}
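// A few concrete values, for illustration:
//   divCeil(10, 4) == 3    divCeil(8, 4) == 2
//   divExZero(3, 8) == 1   (the result is clamped so it never becomes zero)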
/// Computes the nth power of base, in integers.
template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto intPow(Integral base, Integral n) -> Integral
{
if(n == 0)
return 1;
auto r = base;
for(Integral i = 1; i < n; i++)
r *= base;
return r;
}
/// Computes the floor of the nth root of value, in integers.
template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
{
// adapted from: https://en.wikipedia.org/wiki/Integer_square_root
Integral L = 0;
Integral R = value + 1;
while(L != R - 1)
{
Integral const M = (L + R) / 2;
if(intPow(M, n) <= value)
L = M;
else
R = M;
}
return L;
}
template<std::integral T>
inline constexpr T firstSetBit(T value)
{
using UnsignedValueType = std::make_unsigned_t<T>;
return sizeof(T) * CHAR_BIT - 1 - std::countl_zero(static_cast<UnsignedValueType>(value));
}
/** round down to the nearest power of two which is equal to or lower than the value
*
* @param value input value >0
*/
template<std::integral T>
inline constexpr T roundDownToPowerOfTwo(T value)
{
return T{1} << firstSetBit(value);
}
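// A few concrete values, for illustration:
//   nthRootFloor(26, 3) == 2          nthRootFloor(27, 3) == 3
//   firstSetBit(12u) == 3             (index of the most significant set bit of 0b1100)
//   roundDownToPowerOfTwo(12u) == 8u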
/** checks if T is an instance of U
*
* @tparam T full type specialization
* @tparam U unspecialized template type
*
* @return true if T is a specialization of U
*
* @{
*/
template<typename T, template<typename...> typename U>
inline constexpr bool isSpecializationOf_v = std::false_type{};
template<template<typename...> typename U, typename... Vs>
inline constexpr bool isSpecializationOf_v<U<Vs...>, U> = std::true_type{};
/** @} */
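// For illustration:
//   isSpecializationOf_v<std::tuple<int, float>, std::tuple> == true
//   isSpecializationOf_v<int, std::tuple> == false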
/**
* @brief Helper function calculating the integer power for the given base and exponent.
*/
constexpr auto ipow(std::integral auto const base, std::integral auto const exponent)
requires std::same_as<ALPAKA_TYPEOF(base), ALPAKA_TYPEOF(exponent)>
{
using T_Res = ALPAKA_TYPEOF(base);
T_Res result = T_Res{1};
if(exponent == T_Res{0})
return result;
result = ipow(base, exponent / T_Res{2});
result *= result;
if(exponent % T_Res{2})
result *= base;
return result;
}
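// For illustration: ipow(2, 10) == 1024, ipow(3, 4) == 81 (exponentiation by squaring)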
} // namespace alpaka
// ==
// == ./include/alpaka/utility.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/vecConcepts.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <concepts> // amalgamate: file already included
#include <string>
#include <type_traits>
namespace alpaka
{
namespace concepts
{
namespace detail
{
// integral to integral
template<typename T_From, typename T_To>
constexpr bool integralIntegralLossless
= std::is_integral_v<T_From> && std::is_integral_v<T_To>
&& ((std::is_signed_v<T_From> == std::is_signed_v<T_To>
&& std::numeric_limits<T_From>::digits <= std::numeric_limits<T_To>::digits)
|| (std::is_unsigned_v<T_From> && std::is_signed_v<T_To>
&& std::numeric_limits<T_From>::digits < std::numeric_limits<T_To>::digits));
// floating-point to floating-point
template<typename T_From, typename T_To>
constexpr bool floatFloatLossless
= std::is_floating_point_v<T_From> && std::is_floating_point_v<T_To>
&& std::numeric_limits<T_From>::radix == std::numeric_limits<T_To>::radix
&& std::numeric_limits<T_From>::digits <= std::numeric_limits<T_To>::digits
&& std::numeric_limits<T_From>::max_exponent <= std::numeric_limits<T_To>::max_exponent
&& std::numeric_limits<T_From>::min_exponent >= std::numeric_limits<T_To>::min_exponent;
// integral to floating-point
// numeric_limits::digits for integers excludes the sign bit
template<typename T_From, typename T_To>
constexpr bool integralFloatLossless = std::is_integral_v<T_From> && std::is_floating_point_v<T_To>
&& (std::numeric_limits<T_From>::digits + std::is_signed_v<T_From>)
<= std::numeric_limits<T_To>::digits;
} // namespace detail
/** Concept to check if a type can be lossless converted to another type.
*
* This concept ensures that a type `T_From` can be converted to a type `T_To` without any loss of information.
* It checks for implicit convertibility, signedness compatibility, and precision preservation for both integer
* and floating-point types.
*
* @tparam T_From The source type to be converted.
* @tparam T_To The target type to which the source type is converted.
*/
template<typename T_From, typename T_To>
concept LosslesslyConvertible
= std::convertible_to<T_From, T_To>
&& (detail::integralIntegralLossless<T_From, T_To> || detail::floatFloatLossless<T_From, T_To>
|| detail::integralFloatLossless<T_From, T_To>);
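// For illustration (these follow from the rules above):
//   LosslesslyConvertible<std::int32_t, std::int64_t> // true: widening integral
//   LosslesslyConvertible<float, double>              // true: widening floating-point
//   LosslesslyConvertible<std::int64_t, std::int32_t> // false: narrowing
//   LosslesslyConvertible<std::uint64_t, double>      // false: 64 value bits do not fit into a 53-bit mantissa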
template<typename T_From, typename T_To>
concept Convertible = std::is_convertible_v<T_From, T_To>;
}; // namespace concepts
} // namespace alpaka
// ==
// == ./include/alpaka/vecConcepts.hpp ==
// ============================================================================
// #include <concepts> // amalgamate: file already included
#include <cstdint>
#include <limits>
namespace alpaka
{
/** This type is used in cases where a template type parameter is not required and can optionally be passed to a
* trait or concept.
*/
struct NotRequired
{
};
constexpr uint32_t notRequiredDim = std::numeric_limits<uint32_t>::max();
constexpr uint32_t notRequiredWidth = notRequiredDim;
namespace trait
{
template<typename T>
struct GetDim
{
static constexpr uint32_t value = T::dim();
};
template<std::integral T>
struct GetDim<T>
{
static constexpr uint32_t value = 1u;
};
template<typename T>
constexpr uint32_t getDim_v = GetDim<T>::value;
template<typename T>
struct GetValueType
{
using type = typename T::value_type;
};
template<typename T>
requires(std::is_fundamental_v<T>)
struct GetValueType<T>
{
using type = T;
};
// resolve handles
template<typename T>
requires requires() { typename T::element_type; }
struct GetValueType<T>
{
using type = typename GetValueType<typename T::element_type>::type;
};
template<typename T>
using GetValueType_t = typename GetValueType<T>::type;
// true for alpaka MdSpan implementations
template<typename T>
struct IsMdSpan : std::false_type
{
};
/** Check if a type used as kernel argument is trivially copyable
*
* @attention In case this trait is specialized for a user type, the user should be sure that the result of
* calling the copy constructor is equivalent to using memcpy to duplicate the object. An existing destructor
* must be free of side effects.
*
* It is implementation defined whether the closure type of a lambda is trivially copyable.
* Therefore, the default implementation is true for trivially copyable or empty (stateless) types.
*
* @tparam T type to check
*/
template<typename T, typename = void>
struct IsKernelArgumentTriviallyCopyable
: std::bool_constant<std::is_empty_v<T> || std::is_trivially_copyable_v<T>>
{
};
/** Check if the kernel type is trivially copyable
*
* @attention In case this trait is specialized for a user type, the user should be sure that the result of
* calling the copy constructor is equivalent to using memcpy to duplicate the object. An existing destructor
* must be free of side effects.
*
* The default implementation is true for trivially copyable types (or for extended lambda expressions for
* CUDA).
*
* @tparam T type to check
* @{
*/
template<typename T, typename = void>
struct IsKernelTriviallyCopyable
#if ALPAKA_LANG_CUDA && ALPAKA_COMP_NVCC
: std::bool_constant<
std::is_trivially_copyable_v<T> || __nv_is_extended_device_lambda_closure_type(T)
|| __nv_is_extended_host_device_lambda_closure_type(T)>
#else
: std::is_trivially_copyable<T>
#endif
{
};
} // namespace trait
template<typename T>
inline constexpr bool isKernelArgumentTriviallyCopyable_v = trait::IsKernelArgumentTriviallyCopyable<T>::value;
template<typename T>
inline constexpr bool isKernelTriviallyCopyable_v = trait::IsKernelTriviallyCopyable<T>::value;
template<typename T>
[[nodiscard]] consteval uint32_t getDim([[maybe_unused]] T const& any)
{
return trait::getDim_v<T>;
}
template<typename T_From, typename T_To>
constexpr bool isLosslesslyConvertible_v = concepts::LosslesslyConvertible<T_From, T_To>;
template<typename T_From, typename T_To>
constexpr bool isConvertible_v = concepts::Convertible<T_From, T_To>;
template<typename T>
constexpr bool isMdSpan_v = trait::IsMdSpan<T>::value;
namespace concepts
{
/** @brief Concept to check for a kernel function object
*
* @details
* The kernel function object must be trivially copyable.
*/
template<typename T>
concept KernelFn = isKernelArgumentTriviallyCopyable_v<T>;
/** @brief Concept to check for a kernel argument object
*
* @details
* A kernel call requires that its arguments are trivially copyable, which this concept requires.
*/
template<typename T>
concept KernelArg = isKernelArgumentTriviallyCopyable_v<T>;
} // namespace concepts
} // namespace alpaka
// ==
// == ./include/alpaka/trait.hpp ==
// ============================================================================
namespace alpaka
{
namespace internal
{
struct PCast
{
template<typename T_To, typename T_Input>
struct Op
{
decltype(auto) operator()(auto&& any) const;
};
};
struct LPCast
{
template<typename T_To, typename T_Input>
struct Op
{
decltype(auto) operator()(auto&& any) const
{
return PCast::Op<T_To, T_Input>{}(any);
}
};
};
} // namespace internal
/** Performs a precision cast of the value type to another.
*
* @tparam T_To The target type to which the input is cast.
* @param input The input value to be cast. Its value_type must be convertible to `T_To`.
* @return input with the value_type exchanged
*/
template<typename T_To>
constexpr decltype(auto) pCast(auto&& input) requires(isConvertible_v<typename ALPAKA_TYPEOF(input)::type, T_To>)
{
return internal::PCast::Op<T_To, ALPAKA_TYPEOF(input)>{}(input);
}
/** Performs a precision cast of the value type to another.
*
* It ensures that the conversion is lossless by requiring that the value_type of the input is losslessly
* convertible to the target type `T_To`.
*
* @tparam T_To The target type to which the input is cast.
* @param input The input value to be cast. Its value_type must be losslessly convertible to `T_To`.
* @return input with the value_type exchanged
*/
template<typename T_To>
constexpr decltype(auto) lpCast(auto&& input)
requires(isLosslesslyConvertible_v<typename ALPAKA_TYPEOF(input)::type, T_To>)
{
return internal::LPCast::Op<T_To, ALPAKA_TYPEOF(input)>{}(input);
}
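// A minimal usage sketch (Vec is defined later in this header; the Op specializations for Vec are provided
// elsewhere in the library, so this is illustrative only):
//   auto v  = alpaka::Vec{1.0, 2.0, 3.0};   // Vec<double, 3>
//   auto vf = alpaka::pCast<float>(v);      // value type becomes float (possibly lossy)
//   auto vd = alpaka::lpCast<double>(vf);   // float -> double is lossless, therefore allowed
//   // alpaka::lpCast<float>(v) would be rejected: double -> float is not losslessly convertible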
} // namespace alpaka
// ==
// == ./include/alpaka/cast.hpp ==
// ============================================================================
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/core/util.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
#include <cstdio>
#include <tuple>
// #include <utility> // amalgamate: file already included
namespace alpaka
{
template<typename T>
constexpr decltype(auto) unWrapp(T&& value)
{
using WrappedType = std::unwrap_reference_t<std::decay_t<decltype(value)>>;
return std::unwrap_reference_t<WrappedType>(std::forward<T>(value));
}
template<typename T>
using RemoveVolatileFromPointer_t = std::add_pointer_t<std::remove_volatile_t<std::remove_pointer_t<T>>>;
/**
* @brief Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
*
* Useful for freeing the memory.
*
* @param inPtr The pointer to convert.
* @tparam T The type of the given pointer.
*/
template<typename T>
auto* toVoidPtr(T inPtr)
{
static_assert(std::is_pointer_v<T>);
using DataType = std::remove_pointer_t<T>;
using VoidPtrType = std::conditional_t<std::is_const_v<DataType>, void const*, void*>;
return reinterpret_cast<VoidPtrType>(const_cast<RemoveVolatileFromPointer_t<T>>(inPtr));
}
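// For illustration (p and cp are hypothetical pointers):
//   volatile int* p = /* ... */;
//   void* raw = toVoidPtr(p);          // volatile is stripped, result is void*
//   int const volatile* cp = /* ... */;
//   void const* craw = toVoidPtr(cp);  // constness is preserved, result is void const*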
} // namespace alpaka
// ==
// == ./include/alpaka/core/util.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
#include <array>
// #include <concepts> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
#include <iostream>
#include <ranges>
#include <sstream>
#include <type_traits>
namespace alpaka
{
namespace trait
{
template<typename T>
struct IsVector : std::false_type
{
};
template<typename T>
struct IsCVector : std::false_type
{
};
} // namespace trait
template<typename T>
constexpr bool isVector_v = trait::IsVector<T>::value;
template<typename T>
constexpr bool isCVector_v = trait::IsCVector<T>::value;
namespace concepts
{
/** Concept to check if a type is a vector
*
* @tparam T Type to check
* @tparam T_ValueType enforce a value type of the vector, if not provided the value type is not checked
* @tparam T_dim enforce a dimensionality of the vector, if not provided the value is not checked
*/
template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
concept Vector = isVector_v<T>
&& (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
|| std::same_as<T_ValueType, alpaka::NotRequired>)
&& ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));
/** Concept to check if a type is a vector or scalar variable
*
* @tparam T Type to check
* @tparam T_ValueType enforce a value type of T, if not provided the value type is not checked
*/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept VectorOrScalar = (isVector_v<T> || std::integral<T>)
&& (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
|| std::same_as<T_ValueType, alpaka::NotRequired>);
/** Concept to check if a type is a CVector
*
* @details
* Checks whether the given type is a CVector. For more information, refer to the implementation alpaka::CVec.
*/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept CVector = isCVector_v<T>
&& (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
|| std::same_as<T_ValueType, alpaka::NotRequired>);
/** Concept to check if a type is a vector or a specific other type
*
* @tparam T Type to check
* @tparam T_RequiredComponent enforce that T is a vector or a specific other type
*/
template<typename T, typename T_RequiredComponent>
concept TypeOrVector = (isVector_v<T> || std::is_same_v<T, T_RequiredComponent>);
template<typename T, typename T_RequiredComponent>
concept VectorOrConvertibleType = (isVector_v<T> || std::is_convertible_v<T, T_RequiredComponent>);
} // namespace concepts
/** Array storage for vector data
*
* This class is a workaround and simply wraps std::array. It is required because the extent in std::array's
* template signature is size_t. This produces template deduction issues for math::Vec if we use
* std::array as default storage without this wrapper.
*/
template<typename T_Type, uint32_t T_dim>
struct ArrayStorage : protected std::array<T_Type, T_dim>
{
using type = T_Type;
using BaseType = std::array<T_Type, T_dim>;
using BaseType::operator[];
// constructor is required because exposing the array constructors does not work
template<typename... T_Args>
constexpr ArrayStorage(T_Args&&... args) : BaseType{std::forward<T_Args>(args)...}
{
}
constexpr ArrayStorage(std::array<T_Type, T_dim> const& data) : BaseType{data}
{
}
};
namespace detail
{
struct GetValue
{
constexpr auto operator()(auto idx, auto value) const
{
return value;
}
};
template<typename T, T... T_values>
struct CVec
{
using type = T;
static consteval uint32_t dim()
{
return sizeof...(T_values);
}
constexpr T operator[](std::integral auto const idx) const
{
// default initializes with first value
T result = std::get<0>(std::forward_as_tuple(T_values...));
if constexpr(dim() > 1u)
{
[[maybe_unused]] bool _ = std::apply(
[idx, &result](auto&&, auto&&... values) constexpr
{
using IdxType = ALPAKA_TYPEOF(idx);
IdxType i{1u};
return ((idx == i++ && (result = values, true)) || ...);
},
std::forward_as_tuple(T_values...));
}
return result;
}
template<T T_value>
static constexpr auto all()
{
using IotaSeq = std::make_integer_sequence<T, dim()>;
return integerSequenceToCVec(IotaSeq{}, [](auto&&) constexpr { return T_value; });
}
private:
template<T... T_indices>
static constexpr auto integerSequenceToCVec(
std::integer_sequence<T, T_indices...>,
auto const op = std::identity{})
{
return CVec<T, op(T_indices)...>{};
};
};
template<typename T>
struct TemplateSignatureStorage : std::false_type
{
};
template<typename T_Type, T_Type... T_values>
struct TemplateSignatureStorage<CVec<T_Type, T_values...>> : std::true_type
{
};
template<typename T>
constexpr bool TemplateSignatureStorage_v = TemplateSignatureStorage<T>::value;
} // namespace detail
template<typename T_Type, uint32_t T_dim, typename T_Storage = ArrayStorage<T_Type, T_dim>>
struct Vec : private T_Storage
{
using Storage = T_Storage;
using type = T_Type;
using ParamType = type;
using index_type = uint32_t;
using size_type = uint32_t;
using rank_type = uint32_t;
// universal vec used as fallback if T_Storage is holding the state in the template signature
using UniVec = Vec<T_Type, T_dim>;
/*Vecs without elements are not allowed*/
static_assert(T_dim > 0u);
constexpr Vec() = default;
/** Initialize via a generator expression
*
* The generator must return the value for the corresponding index of the component which is passed to the
* generator.
*/
template<
typename F,
std::enable_if_t<std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>, uint32_t> = 0u>
constexpr explicit Vec(F&& generator)
: Vec(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_dim>{})
{
}
private:
template<typename F, uint32_t... Is>
constexpr explicit Vec(F&& generator, std::integer_sequence<uint32_t, Is...>)
: Storage{generator(std::integral_constant<uint32_t, Is>{})...}
{
}
public:
/** Constructor for N-dimensional vector
*
* @attention This constructor allows implicit casts.
*
* @param args value of each dimension, x,y,z,...
*
* A constexpr vector should be initialized with {} instead of () because at least
* CUDA 11.6 has problems in cases where a compile time evaluation is required.
* @code{.cpp}
* constexpr auto vec1 = Vec{ 1 };
* constexpr auto vec2 = Vec{ 1, 2 };
* //or explicit
* constexpr auto vec3 = Vec<int, 3u>{ 1, 2, 3 };
* constexpr auto vec4 = Vec<int, 3u>{ {1, 2, 3} };
* @endcode
*/
template<typename... T_Args>
requires(std::is_convertible_v<T_Args, T_Type> && ...)
constexpr Vec(T_Args const&... args) : Storage(static_cast<T_Type>(args)...)
{
}
constexpr Vec(Vec const& other) = default;
constexpr Vec(T_Storage const& other) : T_Storage{other}
{
}
/** constructor allows changing the storage policy
*/
template<typename T_OtherStorage>
constexpr Vec(Vec<T_Type, T_dim, T_OtherStorage> const& other)
: Vec([&](uint32_t const i) constexpr { return other[i]; })
{
}
/** Allow static_cast / explicit cast to member type for 1D vector */
template<uint32_t T_deferDim = T_dim, typename = typename std::enable_if<T_deferDim == 1u>::type>
constexpr explicit operator type()
{
return (*this)[0];
}
static consteval uint32_t dim()
{
return T_dim;
}
/**
* Creates a Vec where all dimensions are set to the same value
*
* @param value Value which is set for all dimensions
* @return new Vec<...>
*/
static constexpr auto all(concepts::Convertible<T_Type> auto const& value)
{
if constexpr(detail::TemplateSignatureStorage_v<T_Storage>)
{
UniVec result([=](uint32_t const) { return static_cast<T_Type>(value); });
return result;
}
else
{
Vec result([=](uint32_t const) { return static_cast<T_Type>(value); });
return result;
}
}
template<auto T_v>
requires(isConvertible_v<ALPAKA_TYPEOF(T_v), T_Type>)
static constexpr auto all() requires requires { T_Storage::template all<T_v>(); }
{
return Vec<T_Type, T_dim, ALPAKA_TYPEOF(T_Storage::template all<static_cast<T_Type>(T_v)>())>{};
}
constexpr Vec toRT() const
{
return *this;
}
constexpr Vec revert() const
{
Vec invertedVec{};
for(uint32_t i = 0u; i < T_dim; i++)
invertedVec[T_dim - 1 - i] = (*this)[i];
return invertedVec;
}
constexpr Vec& operator=(Vec const&) = default;
constexpr Vec& operator=(Vec&&) = default;
constexpr Vec operator-() const
{
return Vec([this](uint32_t const i) constexpr { return -(*this)[i]; });
}
/** assign operator
* @{
*/
#define ALPAKA_VECTOR_ASSIGN_OP(op) \
template<typename T_OtherStorage> \
constexpr Vec& operator op(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) \
{ \
for(uint32_t i = 0u; i < T_dim; i++) \
{ \
if constexpr(requires { unWrapp((*this)[i]) op rhs[i]; }) \
{ \
unWrapp((*this)[i]) op rhs[i]; \
} \
else \
{ \
(*this)[i] op rhs[i]; \
} \
} \
return *this; \
} \
constexpr Vec& operator op(concepts::LosslesslyConvertible<T_Type> auto const value) \
{ \
for(uint32_t i = 0u; i < T_dim; i++) \
{ \
if constexpr(requires { unWrapp((*this)[i]) op value; }) \
{ \
unWrapp((*this)[i]) op value; \
} \
else \
{ \
(*this)[i] op value; \
} \
} \
return *this; \
}
ALPAKA_VECTOR_ASSIGN_OP(+=)
ALPAKA_VECTOR_ASSIGN_OP(-=)
ALPAKA_VECTOR_ASSIGN_OP(/=)
ALPAKA_VECTOR_ASSIGN_OP(*=)
ALPAKA_VECTOR_ASSIGN_OP(=)
#undef ALPAKA_VECTOR_ASSIGN_OP
/** @} */
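// For illustration:
//   Vec<int, 3> a{1, 2, 3};
//   a += Vec<int, 3>{1, 1, 1}; // a == {2, 3, 4}
//   a *= 2;                    // a == {4, 6, 8}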
constexpr decltype(auto) operator[](std::integral auto const idx)
{
return Storage::operator[](idx);
}
constexpr decltype(auto) operator[](std::integral auto const idx) const
{
return Storage::operator[](idx);
}
/** named member access
*
* index -> name [0->x,1->y,2->z,3->w]
* @{
*/
#define ALPAKA_NAMED_ARRAY_ACCESS(functionName, dimValue) \
constexpr decltype(auto) functionName() requires(T_dim >= dimValue + 1) \
{ \
return (*this)[T_dim - 1u - dimValue]; \
} \
constexpr decltype(auto) functionName() const requires(T_dim >= dimValue + 1) \
{ \
return (*this)[T_dim - 1u - dimValue]; \
}
ALPAKA_NAMED_ARRAY_ACCESS(x, 0u)
ALPAKA_NAMED_ARRAY_ACCESS(y, 1u)
ALPAKA_NAMED_ARRAY_ACCESS(z, 2u)
ALPAKA_NAMED_ARRAY_ACCESS(w, 3u)
#undef ALPAKA_NAMED_ARRAY_ACCESS
/** @} */
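// For illustration (the mapping above counts from the back of the vector):
//   Vec<int, 3> v{1, 2, 3};
//   v.z() == v[0] == 1,  v.y() == v[1] == 2,  v.x() == v[2] == 3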
constexpr decltype(auto) back()
{
return (*this)[T_dim - 1u];
}
constexpr decltype(auto) back() const
{
return (*this)[T_dim - 1u];
}
/** Shrink the number of elements of a vector.
*
* Highest indices kept alive.
*
* @tparam T_numElements New dimension of the vector.
* @return Last T_numElements elements of the origin vector
*/
template<uint32_t T_numElements>
constexpr Vec<T_Type, T_numElements> rshrink() const
{
static_assert(T_numElements <= T_dim);
Vec<T_Type, T_numElements> result{};
for(uint32_t i = 0u; i < T_numElements; i++)
result[T_numElements - 1u - i] = (*this)[T_dim - 1u - i];
return result;
}
/** Shrink the vector
*
* Removes the last value.
*/
constexpr Vec<T_Type, T_dim - 1u> eraseBack() const requires(T_dim > 1u)
{
constexpr auto reducedDim = T_dim - 1u;
Vec<T_Type, reducedDim> result{};
for(uint32_t i = 0u; i < reducedDim; i++)
result[i] = (*this)[i];
return result;
}
/** Shrink the number of elements of a vector.
*
* @tparam T_numElements New dimension of the vector.
* @param startIdx Index within the origin vector which will be the last element in the result.
* @return T_numElements elements of the origin vector starting with the index startIdx.
* Indexing will wrap around when the beginning of the origin vector is reached.
*/
template<uint32_t T_numElements>
constexpr Vec<type, T_numElements> rshrink(std::integral auto const startIdx) const
{
static_assert(T_numElements <= T_dim);
Vec<type, T_numElements> result;
for(uint32_t i = 0u; i < T_numElements; i++)
result[T_numElements - 1u - i] = (*this)[(T_dim + startIdx - i) % T_dim];
return result;
}
/** Assign a value to the given index position
*
* @tparam T_elementIdx Index of the element from the beginning which shall be replaced; range: [ 0; T_dim - 1 ]
* @param value value to assign to the element at the given index position
* @return copy of the vector where the given index position is updated with value
*/
template<uint32_t T_elementIdx = 0>
constexpr Vec<T_Type, T_dim> assign(T_Type const& value) const requires(T_elementIdx < T_dim)
{
auto result = *this;
result[T_elementIdx] = value;
return result;
}
/** Assign a value to the given index positions
*
* @param selection CVec with the indices of the elements which shall be replaced; indices range must be
* [0; T_dim - 1]
* @param value value to assign to the elements at the given index positions
* @return copy of the vector where the selected index positions are updated with value
*/
constexpr Vec<T_Type, T_dim> assign(
concepts::CVector auto const selection,
concepts::Vector<T_Type> auto const& value) const requires(ALPAKA_TYPEOF(value)::dim() <= T_dim)
{
auto result = *this;
result.ref(selection) = value;
return result;
}
/** Assign a value to the given index position
*
* @tparam T_elementIdx Index of the element from the back which shall be replaced; range: [ 0; T_dim - 1 ]
* @param value value to assign to the element at the given index position
* @return copy of the vector where the given index position is updated with value
*/
template<uint32_t T_elementIdx = T_dim - 1u>
constexpr Vec<T_Type, T_dim> rAssign(T_Type const& value) const requires(T_elementIdx < T_dim)
{
auto result = *this;
result[T_elementIdx] = value;
return result;
}
/** Removes a component
*
* It is not allowed to call this method on a vector with the dimensionality of one.
*
* @tparam dimToRemove index which shall be removed; range: [ 0; T_dim - 1 ]
* @return vector with `T_dim - 1` elements
*/
template<std::integral auto dimToRemove>
constexpr Vec<type, T_dim - 1u> remove() const requires(T_dim >= 2u)
{
Vec<type, T_dim - 1u> result{};
for(int i = 0u; i < static_cast<int>(T_dim - 1u); ++i)
{
// skip component which must be deleted
int const sourceIdx = i >= static_cast<int>(dimToRemove) ? i + 1 : i;
result[i] = (*this)[sourceIdx];
}
return result;
}
/** Returns product of all components.
*
* @return product of components
*/
constexpr type product() const
{
type result = (*this)[0];
for(uint32_t i = 1u; i < T_dim; i++)
result *= (*this)[i];
return result;
}
/** Returns sum of all components.
*
* @return sum of components
*/
constexpr type sum() const
{
type result = (*this)[0];
for(uint32_t i = 1u; i < T_dim; i++)
result += (*this)[i];
return result;
}
/**
* == comparison operator.
*
* Compares the components of two vectors.
*
* @param rhs Vec to compare to
* @return true if all components in both vectors are equal, else false
*/
template<typename T_OtherStorage>
constexpr bool operator==(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
bool result = true;
for(uint32_t i = 0u; i < T_dim; i++)
result = result && ((*this)[i] == rhs[i]);
return result;
}
/**
* != comparison operator.
*
* Compares the components of two vectors.
*
* @param rhs Vec to compare to
* @return true if at least one component differs, else false
*/
template<typename T_OtherStorage>
constexpr bool operator!=(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
return !((*this) == rhs);
}
template<typename T_OtherStorage>
constexpr auto min(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
Vec result{};
for(uint32_t d = 0u; d < T_dim; d++)
result[d] = std::min((*this)[d], rhs[d]);
return result;
}
/** create string out of the vector
*
* @param separator string to separate components of the vector
* @param enclosings string with up to two characters used to enclose the vector
* length == 0 ? no enclosing symbols
* length == 1 ? the same symbol is used as begin and end
* length >= 2 ? enclosings[0] = begin enclosing symbol
* enclosings[1] = end enclosing symbol
*
* example:
* .toString(";","|") -> |x;...;z|
* .toString(",","[]") -> [x,...,z]
*/
std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
{
std::string locale_enclosing_begin;
std::string locale_enclosing_end;
size_t enclosing_dim = enclosings.size();
if(enclosing_dim > 0)
{
/* % avoids out-of-bounds access */
locale_enclosing_begin = enclosings[0 % enclosing_dim];
locale_enclosing_end = enclosings[1 % enclosing_dim];
}
std::stringstream stream;
stream << locale_enclosing_begin << (*this)[0];
for(uint32_t i = 1u; i < T_dim; ++i)
stream << separator << (*this)[i];
stream << locale_enclosing_end;
return stream.str();
}
/** swizzle operator */
template<typename T, T... T_values>
constexpr auto operator[](Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v) const
{
using InType = ALPAKA_TYPEOF(v);
return Vec<T_Type, InType::dim()>{(*this)[T_values]...};
}
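// A swizzle sketch (a compile-time index vector selects and reorders components; values follow from the code above):
//   Vec<int, 3> v{10, 20, 30};
//   auto xz = v[Vec<uint32_t, 2u, detail::CVec<uint32_t, 2u, 0u>>{}]; // Vec<int, 2>{30, 10}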
template<typename T, T... T_values>
constexpr auto ref(Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v)
{
using InType = ALPAKA_TYPEOF(v);
using ArrayType = std::array<ALPAKA_TYPEOF(std::ref((*this)[T{0}])), sizeof...(T_values)>;
auto array = ArrayType{std::ref((*this)[T_values])...};
return Vec<T_Type, InType::dim(), ALPAKA_TYPEOF(array)>{array};
};
template<typename T, T... T_values>
constexpr auto ref(Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v) const
{
using InType = ALPAKA_TYPEOF(v);
using ArrayType = std::array<ALPAKA_TYPEOF(std::ref((*this)[T{0}])), sizeof...(T_values)>;
auto array = ArrayType{std::ref((*this)[T_values])...};
return Vec<T_Type, InType::dim(), ALPAKA_TYPEOF(array)>{array};
};
/** reduce all elements to a single value
*
* For better numerical stability a tree reduce algorithm is used.
*
         * @param reduceFunc binary functor executed to reduce the range
         *                   The binary operation must be associative.
         * @return result of the reduction; its type depends on the binary functor
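         *
         * A minimal usage sketch:
         * @code{.cpp}
         * auto v = Vec{1, 2, 3, 4};
         * auto sum = v.reduce([](auto a, auto b) { return a + b; }); // -> 10
         * @endcode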
*/
[[nodiscard]] constexpr auto reduce(auto&& reduceFunc) const
-> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
{
return reduce_range(ALPAKA_FORWARD(reduceFunc));
}
private:
/** reduce over a range of elements
*
         * @param reduceFunc binary functor executed to reduce the range
* @tparam T_start start index
* @tparam T_end end index (excluded)
* @return the type of the result depends on the binary functor
*/
template<uint32_t T_start = 0u, uint32_t T_end = dim()>
[[nodiscard]] constexpr auto reduce_range(auto&& reduceFunc) const
-> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
{
// elements in the range
constexpr uint32_t size = T_end - T_start;
// single element termination
if constexpr(size == 1u)
{
return (*this)[T_start];
}
#if ALPAKA_LANG_SYCL
// SYCL can not call recursive functions
auto result = (*this)[T_start];
for(uint32_t i = T_start + 1u; i < T_end; ++i)
{
result = reduceFunc(result, (*this)[i]);
}
return result;
#else
// split range at midpoint
constexpr uint32_t mid = T_start + size / 2u;
// recursively reduce both halves and combine
return reduceFunc(
reduce_range<T_start, mid>(ALPAKA_FORWARD(reduceFunc)),
reduce_range<mid, T_end>(ALPAKA_FORWARD(reduceFunc)));
#endif
}
};
template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
constexpr auto get(Vec<T_Type, T_dim, T_Storage> const& v)
{
return v[I];
}
template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
constexpr decltype(auto) get(Vec<T_Type, T_dim, T_Storage>& v)
{
return v[I];
}
template<typename Type>
struct Vec<Type, 0>
{
using type = Type;
static constexpr uint32_t T_dim = 0;
template<typename OtherType>
constexpr operator Vec<OtherType, 0>() const
{
return Vec<OtherType, 0>();
}
/**
* == comparison operator.
*
* Returns always true
*/
constexpr bool operator==(Vec const& rhs) const
{
return true;
}
/**
* != comparison operator.
*
* Returns always false
*/
constexpr bool operator!=(Vec const& rhs) const
{
return false;
}
static constexpr Vec create(Type)
{
/* this method should never be actually called,
* it exists only for Visual Studio to handle alpaka::Size_t< 0 >
*/
static_assert(sizeof(Type) != 0 && false);
}
};
// type deduction guide
template<typename T_1, typename... T_Args>
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...)
-> Vec<T_1, uint32_t(sizeof...(T_Args) + 1u), ArrayStorage<T_1, uint32_t(sizeof...(T_Args) + 1u)>>;
template<typename Type, uint32_t T_dim, typename T_Storage>
std::ostream& operator<<(std::ostream& s, Vec<Type, T_dim, T_Storage> const& vec)
{
return s << vec.toString();
}
/** binary operators
* @{
*/
#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, resultScalarType, op) \
template<typenameOrConcept T_Type, uint32_t T_dim, typename T_Storage, typename T_OtherStorage> \
constexpr auto operator op( \
const Vec<T_Type, T_dim, T_Storage>& lhs, \
const Vec<T_Type, T_dim, T_OtherStorage>& rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Vec<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs[i] op rhs[i]; \
return result; \
} \
\
template< \
typenameOrConcept T_Type, \
concepts::LosslesslyConvertible<T_Type> T_ValueType, \
uint32_t T_dim, \
typename T_Storage> \
constexpr auto operator op(const Vec<T_Type, T_dim, T_Storage>& lhs, T_ValueType rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Vec<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs[i] op rhs; \
return result; \
} \
template< \
typenameOrConcept T_Type, \
concepts::LosslesslyConvertible<T_Type> T_ValueType, \
uint32_t T_dim, \
typename T_Storage> \
constexpr auto operator op(T_ValueType lhs, const Vec<T_Type, T_dim, T_Storage>& rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Vec<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs op rhs[i]; \
return result; \
}
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, +)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, -)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, *)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, /)
ALPAKA_VECTOR_BINARY_OP(typename, bool, >=)
ALPAKA_VECTOR_BINARY_OP(typename, bool, >)
ALPAKA_VECTOR_BINARY_OP(typename, bool, <=)
ALPAKA_VECTOR_BINARY_OP(typename, bool, <)
ALPAKA_VECTOR_BINARY_OP(typename, bool, &&)
ALPAKA_VECTOR_BINARY_OP(typename, bool, ||)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, %)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, <<)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, >>)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, &)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, |)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, ^)
#undef ALPAKA_VECTOR_BINARY_OP
/** @} */
/** Give the linear index of an N-dimensional index within an N-dimensional index space.
*
* @tparam T_IntegralType vector data type (must be an integral type)
* @tparam T_dim dimension of the vector, should be >= 2
     * @param dim N-dimensional extent of the index space (N can be one dimension less than that of idx)
     * @param idx N-dimensional index within the index space
     * @attention behaviour is undefined for negative indices
     * @attention if idx is outside of dim the result will be outside of the index domain too
* @return linear index within the index domain
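     *
     * A usage sketch (row-major layout):
     * @code{.cpp}
     * auto linear = linearize(Vec{4, 5}, Vec{2, 3}); // -> 2 * 5 + 3 = 13
     * @endcode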
*
* @{
*/
template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage, uint32_t T_dim>
constexpr T_IntegralType linearize(
Vec<T_IntegralType, T_dim - 1u, T_Storage> const& dim,
Vec<T_IntegralType, T_dim, T_OtherStorage> const& idx) requires(T_dim >= 2u)
{
T_IntegralType linearIdx{idx[0]};
for(uint32_t d = 1u; d < T_dim; ++d)
linearIdx = linearIdx * dim[d - 1u] + idx[d];
return linearIdx;
}
template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage, uint32_t T_dim>
constexpr T_IntegralType linearize(
Vec<T_IntegralType, T_dim, T_Storage> const& dim,
Vec<T_IntegralType, T_dim, T_OtherStorage> const& idx)
{
return linearize(dim.template rshrink<T_dim - 1u>(), idx);
}
template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage>
ALPAKA_FN_HOST_ACC T_IntegralType linearize(
Vec<T_IntegralType, 1u, T_Storage> const&,
Vec<T_IntegralType, 1u, T_OtherStorage> const& idx)
{
return idx.x();
}
/** @} */
/** Maps a linear index to an N-dimensional index
*
* @tparam T_IntegralType vector data type (must be an integral type)
* @param dim N-dimensional index space
* @param linearIdx Linear index within dim.
* @attention If linearIdx is an index outside of dim the result will be outside of the index domain
* too.
* @return N-dimensional index
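     *
     * A usage sketch (inverse of linearize()):
     * @code{.cpp}
     * auto idx = mapToND(Vec{4, 5}, 13); // -> {2, 3}
     * @endcode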
*
* @{
*/
template<std::integral T_IntegralType, typename T_Storage, uint32_t T_dim>
constexpr Vec<T_IntegralType, T_dim> mapToND(
Vec<T_IntegralType, T_dim, T_Storage> const& dim,
T_IntegralType linearIdx) requires(T_dim >= 2u)
{
constexpr uint32_t reducedDim = T_dim - 1u;
Vec<T_IntegralType, reducedDim> pitchExtents;
pitchExtents.back() = dim.back();
for(uint32_t d = 1u; d < T_dim - 1u; ++d)
pitchExtents[reducedDim - 1u - d] = dim[T_dim - 1u - d] * pitchExtents[reducedDim - d];
Vec<T_IntegralType, T_dim> result;
for(uint32_t d = 0u; d < T_dim - 1u; ++d)
{
result[d] = linearIdx / pitchExtents[d];
linearIdx -= pitchExtents[d] * result[d];
}
result[T_dim - 1u] = linearIdx;
return result;
}
template<std::integral T_IntegralType, typename T_Storage>
ALPAKA_FN_HOST_ACC Vec<T_IntegralType, 1u> mapToND(
Vec<T_IntegralType, 1u, T_Storage> const& dim,
T_IntegralType linearIdx)
{
return {linearIdx};
}
/** @} */
namespace trait
{
template<typename T_Type, uint32_t T_dim, typename T_Storage>
struct IsVector<Vec<T_Type, T_dim, T_Storage>> : std::true_type
{
};
template<typename T_Type, uint32_t T_dim, T_Type... T_values>
struct IsCVector<Vec<T_Type, T_dim, detail::CVec<T_Type, T_values...>>> : std::true_type
{
};
} // namespace trait
namespace trait
{
template<typename T_Type, uint32_t T_dim, typename T_Storage>
struct GetDim<alpaka::Vec<T_Type, T_dim, T_Storage>>
{
static constexpr uint32_t value = T_dim;
};
template<typename T>
struct GetVec;
template<std::integral T>
struct GetVec<T>
{
using type = Vec<T, 1u>;
};
template<typename T_Type, uint32_t T_dim, typename T_Storage>
struct GetVec<alpaka::Vec<T_Type, T_dim, T_Storage>>
{
using type = alpaka::Vec<T_Type, T_dim, T_Storage>;
};
template<typename T>
using getVec_t = typename GetVec<T>::type;
template<typename T_Type, uint32_t T_dim, typename T_Storage>
struct GetValueType<Vec<T_Type, T_dim, T_Storage>>
{
using type = T_Type;
};
} // namespace trait
template<typename T>
consteval auto getVec(T const& any)
{
return trait::getVec_t<T>{any};
}
namespace internal
{
template<typename T_To, typename T_Type, uint32_t T_dim, typename T_Storage>
struct PCast::Op<T_To, alpaka::Vec<T_Type, T_dim, T_Storage>>
{
constexpr decltype(auto) operator()(auto&& input) const
requires std::convertible_to<T_Type, T_To> && (!std::same_as<T_To, T_Type>)
{
return typename alpaka::Vec<T_To, T_dim, T_Storage>::UniVec([&](uint32_t idx) constexpr
{ return static_cast<T_To>(input[idx]); });
}
constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, T_Type>
{
return input;
}
};
} // namespace internal
/** @todo the function for integral values is defined in Utils.hpp
* move this to a better place, e.g. math and expose this for the user too
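     *
     * divCeil() computes the elementwise ceiling division, a usage sketch:
     * @code{.cpp}
     * auto numBlocks = alpaka::divCeil(alpaka::Vec{10, 7}, alpaka::Vec{4, 4}); // -> {3, 2}
     * @endcode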
*/
template<concepts::Vector T_Vector0, concepts::Vector T_Vector1>
requires(std::is_same_v<trait::GetValueType_t<T_Vector0>, trait::GetValueType_t<T_Vector1>>)
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr concepts::Vector auto divCeil(T_Vector0 a, T_Vector1 b)
{
return (a + b - T_Vector0::all(1)) / b;
}
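    /** elementwise division where every result component is clamped to be at least 1
     *
     * A usage sketch (assuming integral vectors):
     * @code{.cpp}
     * auto chunks = alpaka::divExZero(alpaka::Vec{3, 8}, alpaka::Vec{4, 4}); // -> {1, 2}
     * @endcode
     */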
template<concepts::Vector T_Vector0, concepts::Vector T_Vector1>
requires(std::is_same_v<trait::GetValueType_t<T_Vector0>, trait::GetValueType_t<T_Vector1>>)
[[nodiscard]] ALPAKA_FN_HOST_ACC constexpr concepts::Vector auto divExZero(T_Vector0 a, T_Vector1 b)
{
auto tmp = a / b;
using ValueType = alpaka::trait::GetValueType_t<T_Vector0>;
for(uint32_t d = 0u; d < a.dim(); ++d)
tmp[d] = std::max(tmp[d], ValueType{1u});
return tmp;
}
}; // namespace alpaka
namespace std
{
template<typename T_Type, uint32_t T_dim, typename T_Storage>
struct tuple_size<alpaka::Vec<T_Type, T_dim, T_Storage>>
{
static constexpr std::size_t value = T_dim;
};
template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
struct tuple_element<I, alpaka::Vec<T_Type, T_dim, T_Storage>>
{
using type = T_Type;
};
} // namespace std
// ==
// == ./include/alpaka/Vec.hpp ==
// ============================================================================
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <array> // amalgamate: file already included
// #include <concepts> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
#include <functional>
#include <type_traits>
// #include <utility> // amalgamate: file already included
namespace alpaka
{
/** @brief A vector with compile-time known values
*
* @details
* A CVec is guaranteed to be constexpr, because all of its values are stored in the type. A CVec instance
     * satisfies the alpaka::concepts::Vector concept. Some ways to create common types of vectors are fillCVec() and
* iotaCVec().
*
* @tparam T The type of the vector's stored values
* @tparam T_values List of values of type T that the vector stores; the length of the vector is inferred from the
* length of this list
*/
template<typename T, T... T_values>
using CVec = Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>>;
namespace detail
{
template<typename T, T... T_values>
[[nodiscard]] constexpr auto integerSequenceToCVec(std::integer_sequence<T, T_values...>)
{
return alpaka::CVec<T, T_values...>{};
}
template<typename T, T... T_values>
[[nodiscard]] constexpr auto toIntegerSequence(alpaka::CVec<T, T_values...>)
{
return std::integer_sequence<T, T_values...>{};
}
template<typename Int, Int... Is1, Int... Is2>
[[nodiscard]] constexpr auto combine(std::integer_sequence<Int, Is1...>, std::integer_sequence<Int, Is2...>)
{
return std::integer_sequence<Int, Is1..., Is2...>{};
}
template<typename Last>
[[nodiscard]] constexpr auto concatenate(Last last)
{
return last;
}
template<typename First, typename... Rest>
[[nodiscard]] constexpr auto concatenate(First first, Rest... rest)
{
return combine(first, concatenate(rest...));
}
template<bool pred, typename T, T T_v>
using selectValue = std::conditional_t<pred, std::integer_sequence<T>, std::integer_sequence<T, T_v>>;
        /** @brief Remove all values of an integer sequence for which a filter returns true
*
* @tparam T_UnaryOp The type of the function or functor to filter with. Must take one argument and return a
* boolean.
* @tparam T The type of the given values.
* @tparam T_values The values to filter.
* @param op The filter function/functor.
* @param _ An integer sequence of values to filter
         * @return The filtered integer sequence (the values for which the filter returned false)
*/
template<typename T_UnaryOp, typename T, T... T_values>
[[nodiscard]] constexpr auto filterValues(T_UnaryOp const op, std::integer_sequence<T, T_values...> _)
{
return concatenate(selectValue<op(T_values), T, T_values>{}...);
}
/** A functor that can check for any of the contained values
*
* @details
* The functor contains the given sequence of values and implements an `operator()(T value)`, which returns
* true if the `value` is part of the sequence.
*
* @tparam T_Seq The sequence to check against
*/
template<typename T_Seq>
struct Contains;
template<typename T, template<typename, T...> typename T_Seq, T... T_values>
struct Contains<T_Seq<T, T_values...>>
{
using argument_type = T;
constexpr bool operator()(T value) const
{
return ((value == T_values) || ...);
}
};
/* this specialization is required for clang20 but in principle the specialization above should cover it
* compile error: CVec.hpp:92:51: error: implicit instantiation of undefined template
* 'alpaka::detail::Contains<std::integer_sequence<unsigned int, 0>>' 92 | return
* integerSequenceToCVec(filterValues(Contains<ALPAKA_TYPEOF(rightSeq)>{}, toIntegerSequence(left)));
*/
template<typename T, T... T_values>
struct Contains<std::integer_sequence<T, T_values...>>
{
using argument_type = T;
constexpr bool operator()(T value) const
{
return ((value == T_values) || ...);
}
};
} // namespace detail
    /** Create and return a CVector of the given length with values 0, 1, 2, ...
*
* @details
* The function is defined consteval, so the result can and should always be constexpr.
*
* @tparam T Type of the stored values
* @tparam T_dim Length of the vector
*
* @return The vector containing the iota sequence
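     *
     * A usage sketch:
     * @code{.cpp}
     * constexpr auto seq = alpaka::iotaCVec<uint32_t, 3u>(); // CVec<uint32_t, 0u, 1u, 2u>
     * @endcode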
*/
template<typename T, uint32_t T_dim>
[[nodiscard]] consteval auto iotaCVec()
{
using IotaSeq = std::make_integer_sequence<T, T_dim>;
return detail::integerSequenceToCVec(IotaSeq{});
}
/** Create and return a CVector of some length, filled with the given value
*
* @details
* The function is defined consteval, so the result can and should always be constexpr.
*
* @tparam T Type of the stored values
* @tparam T_dim Length of the vector
     * @tparam T_val Value to fill the vector with
*
* @return The filled vector
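     *
     * A usage sketch:
     * @code{.cpp}
     * constexpr auto sevens = alpaka::fillCVec<int, 3u, 7>(); // CVec<int, 7, 7, 7>
     * @endcode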
*/
template<typename T, uint32_t T_dim, T T_val>
[[nodiscard]] consteval auto fillCVec()
{
auto concatCVec
= []<T... T_values>(CVec<T, T_values...> cvec) -> auto { return CVec<T, T_values..., T_val>{}; };
static_assert(T_dim > 0);
if constexpr(T_dim == 1)
return CVec<T, T_val>{};
else
return concatCVec(fillCVec<T, T_dim - 1, T_val>());
}
/** Filter the left vector with the right vector's values
*
* @return A CVec that contains all values of the left vector that don't exist in the right vector. Preserves
* original order.
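     *
     * A usage sketch:
     * @code{.cpp}
     * // {0, 1, 2} filtered by {1} -> {0, 2}
     * constexpr auto kept = alpaka::filter(CVec<uint32_t, 0u, 1u, 2u>{}, CVec<uint32_t, 1u>{});
     * @endcode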
*/
[[nodiscard]] constexpr auto filter(concepts::CVector auto left, concepts::CVector auto right)
{
using namespace detail;
constexpr auto rightSeq = toIntegerSequence(right);
return integerSequenceToCVec(
filterValues(detail::Contains<ALPAKA_TYPEOF(rightSeq)>{}, toIntegerSequence(left)));
}
} // namespace alpaka
// ==
// == ./include/alpaka/CVec.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/Simd.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
/** @file This file provides a basic implementation of a SIMD vector.
*
* The implementation is based on the class Vec:
 * - the storage policy should become the native SIMD implementation, e.g. std::simd
 * - load/store and SIMD specifics should be implemented in the storage policy
 * - the name of the storage policy should be changed
 *
 * The current operators rely on the compiler's auto-vectorization.
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/cast.hpp" // amalgamate: file already inlined
// #include "alpaka/core/util.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/Alignment.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <cstdint> // amalgamate: file already included
// #include <limits> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
/** @brief Strongly typed and constexpr representation of a byte-alignment of memory
*
* @details
* The number of bytes is stored at compile-time using a value template parameter. Therefore, alignments should
* always be declared `constexpr`. If no explicit alignment is given, a default will be set.
*
* To use the alignment, the Alignment::get() function can be called for a given type parameter, returning either
* the object's set alignment, or the given type's alignment, if the default was used.
*
     * @tparam T_byte The number of bytes, given as uint32_t.
*/
template<uint32_t T_byte = std::numeric_limits<uint32_t>::max()>
struct Alignment
{
/** Get the byte-alignment of a given type when using this alignment.
*
* @details
* Trying to use an alignment with a smaller value than the alignment of the given `T_Type` results in a failed
* `static_assert`.
*
* @tparam T_Type The type for which to get the alignment.
* @return If T_byte is not specifically set: alignment of T_Type, else: value of T_byte
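         *
         * A usage sketch:
         * @code{.cpp}
         * static_assert(alpaka::Alignment<>::get<float>() == alignof(float));
         * static_assert(alpaka::Alignment<16u>::get<float>() == 16u);
         * @endcode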
*/
template<typename T_Type>
static consteval uint32_t get()
{
// auto alignment
if constexpr(T_byte == std::numeric_limits<uint32_t>::max())
return static_cast<uint32_t>(alignof(T_Type));
else
{
static_assert(
value >= alignof(T_Type),
"tried to use alignment that is smaller than the alignment of the type it's for");
return value;
}
}
private:
static consteval uint32_t get()
{
return value;
}
static constexpr uint32_t value = T_byte;
};
using AutoAligned = Alignment<>;
namespace trait
{
template<typename T_Type>
struct IsAlignment : std::false_type
{
};
template<uint32_t T_byte>
struct IsAlignment<Alignment<T_byte>> : std::true_type
{
};
} // namespace trait
template<typename T_Type>
constexpr bool isAlignment_v = trait::IsAlignment<T_Type>::value;
namespace concepts
{
/** @brief Concept to check for an alignment object
*
* @details
* An alignment represents a byte alignment of memory. The class is used for strong typing.
* For more information, refer to the struct alpaka::Alignment or the general documentation.
*
* @todo link to alignment documentation in the general docs
*/
template<typename T>
concept Alignment = trait::IsAlignment<T>::value;
} // namespace concepts
} // namespace alpaka
// ==
// == ./include/alpaka/mem/Alignment.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <array> // amalgamate: file already included
// #include <bit> // amalgamate: file already included
// #include <concepts> // amalgamate: file already included
#include <cstddef>
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <iostream> // amalgamate: file already included
// #include <ranges> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
namespace detail
{
template<typename T_ValueType, uint32_t T_numElements, concepts::Alignment T_Alignment>
consteval uint32_t optimalAlignment()
{
constexpr uint32_t currentTypeAlignment = static_cast<uint32_t>(alignof(T_ValueType));
if constexpr(T_numElements % 2 != 0u)
return currentTypeAlignment;
constexpr uint32_t dataSizeInBytes = static_cast<uint32_t>(sizeof(T_ValueType) * T_numElements);
constexpr uint32_t alignment = std::min(T_Alignment::template get<T_ValueType>(), dataSizeInBytes);
if constexpr(std::has_single_bit(alignment))
return alignment;
return static_cast<uint32_t>(alignof(T_ValueType));
}
        /** SIMD array storage for vector data
         *
         * The storage is aligned for native SIMD usage.
*/
template<typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment>
struct alignas(alpaka::detail::optimalAlignment<T_Type, T_dim, T_Alignment>()) SimdArrayStorage
: protected std::array<T_Type, T_dim>
{
using type = T_Type;
using BaseType = std::array<T_Type, T_dim>;
using BaseType::operator[];
using AlignmentType = Alignment<optimalAlignment<T_Type, T_dim, T_Alignment>()>;
// constructor is required because exposing the array constructors does not work
template<typename... T_Args>
constexpr SimdArrayStorage(T_Args&&... args) : BaseType{std::forward<T_Args>(args)...}
{
}
constexpr SimdArrayStorage(std::array<T_Type, T_dim> const& data) : BaseType{data}
{
}
static constexpr AlignmentType getAlignment()
{
return AlignmentType{};
}
};
} // namespace detail
template<
typename T_Type,
uint32_t T_dim,
concepts::Alignment T_Alignment = Alignment<sizeof(T_Type) * T_dim>,
typename T_Storage = detail::SimdArrayStorage<T_Type, T_dim, T_Alignment>>
struct Simd;
template<typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
struct Simd : private T_Storage
{
using Storage = T_Storage;
using type = T_Type;
using ParamType = type;
using index_type = uint32_t;
using size_type = uint32_t;
using rank_type = uint32_t;
        // universal Simd used as a fallback if T_Storage holds its state in the template signature
using UniSimd = Simd<T_Type, T_dim>;
/*Simds without elements are not allowed*/
static_assert(T_dim > 0u);
constexpr Simd() = default;
/** Initialize via a generator expression
*
* The generator must return the value for the corresponding index of the component which is passed to the
* generator.
*/
template<
typename F,
std::enable_if_t<std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>, uint32_t> = 0u>
constexpr explicit Simd(F&& generator)
: Simd(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_dim>{})
{
}
private:
template<typename F, uint32_t... Is>
constexpr explicit Simd(F&& generator, std::integer_sequence<uint32_t, Is...>)
: Storage{generator(std::integral_constant<uint32_t, Is>{})...}
{
}
public:
/** Constructor for N-dimensional vector
*
* @attention This constructor allows implicit casts.
*
* @param args value of each dimension, x,y,z,...
*
* A constexpr vector should be initialized with {} instead of () because at least
* CUDA 11.6 has problems in cases where a compile time evaluation is required.
* @code{.cpp}
* constexpr auto vec1 = Simd{ 1 };
* constexpr auto vec2 = Simd{ 1, 2 };
* //or explicit
* constexpr auto vec3 = Simd<int, 3u>{ 1, 2, 3 };
* constexpr auto vec4 = Simd<int, 3u>{ {1, 2, 3} };
* @endcode
*/
template<typename... T_Args, typename = std::enable_if_t<(std::is_convertible_v<T_Args, T_Type> && ...)>>
constexpr Simd(T_Args const&... args) : Storage(static_cast<T_Type>(args)...)
{
}
constexpr Simd(Simd const& other) = default;
constexpr Simd(T_Storage const& other) : T_Storage{other}
{
}
/** constructor allows changing the storage policy
*/
template<concepts::Alignment T_OtherAlignment, typename T_OtherStorage>
constexpr Simd(Simd<T_Type, T_dim, T_OtherAlignment, T_OtherStorage> const& other)
: Simd([&](uint32_t const i) constexpr { return other[i]; })
{
}
/** Allow static_cast / explicit cast to member type for 1D vector */
template<uint32_t T_deferDim = T_dim, typename = typename std::enable_if<T_deferDim == 1u>::type>
constexpr explicit operator type()
{
return (*this)[0];
}
static consteval uint32_t dim()
{
return T_dim;
}
constexpr void copyFrom(T_Type const* data, concepts::Alignment auto alignment)
{
using MemoryAligndSimdType = Simd<T_Type, T_dim, ALPAKA_TYPEOF(alignment)>;
            /* We reinterpret the destination with the memory alignment of the source pointer; this should be safe
             * because the destination is assumed to live in registers. This forces the use of the default copy
             * assignment and therefore vector loads.
             *
             * @todo rework this part as soon as we have abstracted the storage policy to be the SIMD implementation
             */
*reinterpret_cast<MemoryAligndSimdType*>(this) = *reinterpret_cast<MemoryAligndSimdType const*>(data);
}
constexpr void copyTo(auto* data, concepts::Alignment auto alignment) const
{
using MemoryAligndSimdType = Simd<T_Type, T_dim, ALPAKA_TYPEOF(alignment)>;
            /* We reinterpret the source with the memory alignment of the destination pointer; this should be safe
             * because the source is assumed to live in registers. This forces the use of the default copy
             * assignment and therefore vector stores.
             *
             * @todo rework this part as soon as we have abstracted the storage policy to be the SIMD implementation
             */
*reinterpret_cast<MemoryAligndSimdType*>(data) = *reinterpret_cast<MemoryAligndSimdType const*>(this);
}
/**
* Creates a Simd where all dimensions are set to the same value
*
* @param value Value which is set for all dimensions
* @return new Simd<...>
*/
static constexpr auto all(concepts::Convertible<T_Type> auto const& value)
{
Simd result([=](uint32_t const) { return static_cast<T_Type>(value); });
return result;
}
constexpr Simd toRT() const
{
return *this;
}
constexpr Simd revert() const
{
Simd invertedSimd{};
for(uint32_t i = 0u; i < T_dim; i++)
invertedSimd[T_dim - 1 - i] = (*this)[i];
return invertedSimd;
}
constexpr Simd& operator=(Simd const&) = default;
constexpr Simd& operator=(Simd&&) = default;
constexpr Simd operator-() const
{
return Simd([this](uint32_t const i) constexpr { return -(*this)[i]; });
}
/** assign operator
* @{
*/
#define ALPAKA_VECTOR_ASSIGN_OP(op) \
template<typename T_OtherStorage> \
constexpr Simd& operator op(Simd<T_Type, T_dim, T_OtherStorage> const& rhs) \
{ \
for(uint32_t i = 0u; i < T_dim; i++) \
{ \
if constexpr(requires { unWrapp((*this)[i]) op rhs[i]; }) \
{ \
unWrapp((*this)[i]) op rhs[i]; \
} \
else \
{ \
(*this)[i] op rhs[i]; \
} \
} \
return *this; \
} \
constexpr Simd& operator op(concepts::LosslesslyConvertible<T_Type> auto const value) \
{ \
for(uint32_t i = 0u; i < T_dim; i++) \
{ \
if constexpr(requires { unWrapp((*this)[i]) op value; }) \
{ \
unWrapp((*this)[i]) op value; \
} \
else \
{ \
(*this)[i] op value; \
} \
} \
return *this; \
}
ALPAKA_VECTOR_ASSIGN_OP(+=)
ALPAKA_VECTOR_ASSIGN_OP(-=)
ALPAKA_VECTOR_ASSIGN_OP(/=)
ALPAKA_VECTOR_ASSIGN_OP(*=)
ALPAKA_VECTOR_ASSIGN_OP(=)
#undef ALPAKA_VECTOR_ASSIGN_OP
/** @} */
constexpr decltype(auto) operator[](std::integral auto const idx)
{
return Storage::operator[](idx);
}
constexpr decltype(auto) operator[](std::integral auto const idx) const
{
return Storage::operator[](idx);
}
/** named member access
*
         * @attention The mapping from the names x,y,z,w to memory indices differs from the mapping of an alpaka
         * vector @c Vec
*
* index -> name [0->x,1->y,2->z,3->w]
* [0->r,1->g,2->b,3->a]
* [0->s0,1->s1,2->s2,...,10->sA,...,15->sF]
* @{
*/
#define ALPAKA_NAMED_ARRAY_ACCESS(functionName, dimValue) \
constexpr decltype(auto) functionName() requires(T_dim >= dimValue + 1) \
{ \
return (*this)[T_dim - 1u - dimValue]; \
} \
constexpr decltype(auto) functionName() const requires(T_dim >= dimValue + 1) \
{ \
return (*this)[T_dim - 1u - dimValue]; \
}
ALPAKA_NAMED_ARRAY_ACCESS(x, 0u)
ALPAKA_NAMED_ARRAY_ACCESS(y, 1u)
ALPAKA_NAMED_ARRAY_ACCESS(z, 2u)
ALPAKA_NAMED_ARRAY_ACCESS(w, 3u)
ALPAKA_NAMED_ARRAY_ACCESS(r, 0u)
ALPAKA_NAMED_ARRAY_ACCESS(g, 1u)
ALPAKA_NAMED_ARRAY_ACCESS(b, 2u)
ALPAKA_NAMED_ARRAY_ACCESS(a, 3u)
ALPAKA_NAMED_ARRAY_ACCESS(s0, 0u)
ALPAKA_NAMED_ARRAY_ACCESS(s1, 1u)
ALPAKA_NAMED_ARRAY_ACCESS(s2, 2u)
ALPAKA_NAMED_ARRAY_ACCESS(s3, 3u)
ALPAKA_NAMED_ARRAY_ACCESS(s4, 4u)
ALPAKA_NAMED_ARRAY_ACCESS(s5, 5u)
ALPAKA_NAMED_ARRAY_ACCESS(s6, 6u)
ALPAKA_NAMED_ARRAY_ACCESS(s7, 7u)
ALPAKA_NAMED_ARRAY_ACCESS(s8, 8u)
ALPAKA_NAMED_ARRAY_ACCESS(s9, 9u)
ALPAKA_NAMED_ARRAY_ACCESS(sA, 10u)
ALPAKA_NAMED_ARRAY_ACCESS(sB, 11u)
ALPAKA_NAMED_ARRAY_ACCESS(sC, 12u)
ALPAKA_NAMED_ARRAY_ACCESS(sD, 13u)
ALPAKA_NAMED_ARRAY_ACCESS(sE, 14u)
ALPAKA_NAMED_ARRAY_ACCESS(sF, 15u)
#undef ALPAKA_NAMED_ARRAY_ACCESS
/** @} */
constexpr decltype(auto) back()
{
return (*this)[T_dim - 1u];
}
constexpr decltype(auto) back() const
{
return (*this)[T_dim - 1u];
}
/** Shrink the number of elements of a vector.
*
         * The highest indices are kept.
         *
         * @tparam T_numElements New dimension of the SIMD pack.
         * @return The last T_numElements elements of the original vector
*/
template<uint32_t T_numElements>
constexpr Simd<T_Type, T_numElements> rshrink() const
{
static_assert(T_numElements <= T_dim);
Simd<T_Type, T_numElements> result{};
for(uint32_t i = 0u; i < T_numElements; i++)
result[T_numElements - 1u - i] = (*this)[T_dim - 1u - i];
return result;
}
/** Shrink the SIMD pack
*
* Removes the last value.
*/
constexpr Simd<T_Type, T_dim - 1u> eraseBack() const requires(T_dim > 1u)
{
constexpr auto reducedDim = T_dim - 1u;
Simd<T_Type, reducedDim> result{};
for(uint32_t i = 0u; i < reducedDim; i++)
result[i] = (*this)[i];
return result;
}
/** Shrink the number of elements of a vector.
*
* @tparam T_numElements New dimension of the SIMD pack.
         * @param startIdx Index within the original vector whose element becomes the last element of the result.
         * @return T_numElements elements of the original vector, ending with the element at index startIdx.
         *         Indexing wraps around when the beginning of the original vector is reached.
*/
template<uint32_t T_numElements>
constexpr Simd<type, T_numElements> rshrink(std::integral auto const startIdx) const
{
static_assert(T_numElements <= T_dim);
Simd<type, T_numElements> result;
for(uint32_t i = 0u; i < T_numElements; i++)
result[T_numElements - 1u - i] = (*this)[(T_dim + startIdx - i) % T_dim];
return result;
}
/** Removes a component
*
         * This method must not be called on a vector of dimensionality one.
*
* @tparam dimToRemove index which shall be removed; range: [ 0; T_dim - 1 ]
* @return vector with `T_dim - 1` elements
*/
template<std::integral auto dimToRemove>
constexpr Simd<type, T_dim - 1u> remove() const requires(T_dim >= 2u)
{
Simd<type, T_dim - 1u> result{};
for(int i = 0u; i < static_cast<int>(T_dim - 1u); ++i)
{
// skip component which must be deleted
int const sourceIdx = i >= static_cast<int>(dimToRemove) ? i + 1 : i;
result[i] = (*this)[sourceIdx];
}
return result;
}
/** Returns product of all components.
*
* @return product of components
*/
[[nodiscard]] constexpr type product() const
{
return reduce(std::multiplies{});
}
/** Returns sum of all components.
*
* @return sum of components
*/
[[nodiscard]] constexpr type sum() const
{
return reduce(std::plus{});
}
/** reduce all elements to a single value
*
* For better numerical stability a tree reduce algorithm is used.
*
         * @param reduceFunc binary functor executed to reduce the range
         *                   The binary operation must be associative.
         * @return result of the reduction; its type depends on the binary functor
*/
[[nodiscard]] constexpr auto reduce(auto&& reduceFunc) const
-> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
{
return reduce_range(ALPAKA_FORWARD(reduceFunc));
}
/**
* == comparison operator.
*
         * Compares the components of two SIMD packs.
         *
         * @param rhs Simd to compare to
         * @return true if all components of both SIMD packs are equal, else false
*/
template<typename T_OtherStorage>
constexpr bool operator==(Simd<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
bool result = true;
for(uint32_t i = 0u; i < T_dim; i++)
result = result && ((*this)[i] == rhs[i]);
return result;
}
/**
* != comparison operator.
*
         * Compares the components of two SIMD packs.
         *
         * @param rhs Simd to compare to
         * @return true if at least one component of the two SIMD packs differs, else false
*/
template<typename T_OtherStorage>
constexpr bool operator!=(Simd<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
return !((*this) == rhs);
}
template<typename T_OtherStorage>
constexpr auto min(Simd<T_Type, T_dim, T_OtherStorage> const& rhs) const
{
Simd result{};
for(uint32_t d = 0u; d < T_dim; d++)
result[d] = std::min((*this)[d], rhs[d]);
return result;
}
/** create string out of the SIMD pack
*
         * @param separator string used to separate the components of the SIMD pack
         * @param enclosings string of length 2 used to enclose the SIMD pack
         *        length == 0: no enclosing symbols
         *        length == 1: the same symbol is used to open and close the pack
         *        length >= 2: enclosings[0] = opening symbol
         *                     enclosings[1] = closing symbol
*
* example:
* .toString(";","|") -> |x;...;z|
* .toString(",","[]") -> [x,...,z]
*/
std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
{
std::string locale_enclosing_begin;
std::string locale_enclosing_end;
size_t enclosing_dim = enclosings.size();
if(enclosing_dim > 0)
{
                /* the modulo avoids an out-of-bounds access */
locale_enclosing_begin = enclosings[0 % enclosing_dim];
locale_enclosing_end = enclosings[1 % enclosing_dim];
}
std::stringstream stream;
stream << locale_enclosing_begin << (*this)[0];
for(uint32_t i = 1u; i < T_dim; ++i)
stream << separator << (*this)[i];
stream << locale_enclosing_end;
return stream.str();
}
private:
/** reduce over a range of elements
*
         * @param reduceFunc binary functor executed to reduce the range
* @tparam T_start start index
* @tparam T_end end index (excluded)
* @return the type of the result depends on the binary functor
*/
template<uint32_t T_start = 0u, uint32_t T_end = dim()>
[[nodiscard]] constexpr auto reduce_range(auto&& reduceFunc) const
-> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
{
// elements in the range
constexpr uint32_t size = T_end - T_start;
// single element termination
if constexpr(size == 1u)
{
return (*this)[T_start];
}
#if ALPAKA_LANG_SYCL
// SYCL can not call recursive functions
auto result = (*this)[T_start];
for(uint32_t i = T_start + 1u; i < T_end; ++i)
{
result = reduceFunc(result, (*this)[i]);
}
return result;
#else
// split range at midpoint
constexpr uint32_t mid = T_start + size / 2u;
// recursively reduce both halves and combine
return reduceFunc(
reduce_range<T_start, mid>(ALPAKA_FORWARD(reduceFunc)),
reduce_range<mid, T_end>(ALPAKA_FORWARD(reduceFunc)));
#endif
}
};
template<std::size_t I, typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
constexpr auto get(Simd<T_Type, T_dim, T_Alignment, T_Storage> const& v)
{
return v[I];
}
template<std::size_t I, typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
constexpr auto& get(Simd<T_Type, T_dim, T_Alignment, T_Storage>& v)
{
return v[I];
}
template<typename Type>
struct Simd<Type, 0>
{
using type = Type;
static constexpr uint32_t T_dim = 0;
template<typename OtherType>
constexpr operator Simd<OtherType, 0>() const
{
return Simd<OtherType, 0>();
}
/**
* == comparison operator.
*
* Returns always true
*/
constexpr bool operator==(Simd const& rhs) const
{
return true;
}
/**
* != comparison operator.
*
* Returns always false
*/
constexpr bool operator!=(Simd const& rhs) const
{
return false;
}
static constexpr Simd create(Type)
{
/* this method should never be actually called,
* it exists only for Visual Studio to handle alpaka::Size_t< 0 >
*/
static_assert(sizeof(Type) != 0 && false);
}
};
template<typename Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
std::ostream& operator<<(std::ostream& s, Simd<Type, T_dim, T_Alignment, T_Storage> const& vec)
{
return s << vec.toString();
}
// type deduction guide
template<typename T_1, typename... T_Args>
ALPAKA_FN_HOST_ACC Simd(T_1, T_Args...) -> Simd<
T_1,
uint32_t(sizeof...(T_Args) + 1u),
Alignment<sizeof(T_1) * uint32_t(sizeof...(T_Args) + 1u)>,
detail::SimdArrayStorage<
T_1,
uint32_t(sizeof...(T_Args) + 1u),
Alignment<sizeof(T_1) * uint32_t(sizeof...(T_Args) + 1u)>>>;
/** binary operators
* @{
*/
#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, resultScalarType, op) \
template< \
typenameOrConcept T_Type, \
uint32_t T_dim, \
concepts::Alignment T_Alignment, \
typename T_Storage, \
concepts::Alignment T_OtherAlignment, \
typename T_OtherStorage> \
constexpr auto operator op( \
const Simd<T_Type, T_dim, T_Alignment, T_Storage>& lhs, \
const Simd<T_Type, T_dim, T_OtherAlignment, T_OtherStorage>& rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Simd<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs[i] op rhs[i]; \
return result; \
} \
\
template< \
typenameOrConcept T_Type, \
concepts::LosslesslyConvertible<T_Type> T_ValueType, \
uint32_t T_dim, \
concepts::Alignment T_Alignment, \
typename T_Storage> \
constexpr auto operator op(const Simd<T_Type, T_dim, T_Alignment, T_Storage>& lhs, T_ValueType rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Simd<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs[i] op rhs; \
return result; \
} \
template< \
typenameOrConcept T_Type, \
concepts::LosslesslyConvertible<T_Type> T_ValueType, \
uint32_t T_dim, \
concepts::Alignment T_Alignment, \
typename T_Storage> \
constexpr auto operator op(T_ValueType lhs, const Simd<T_Type, T_dim, T_Alignment, T_Storage>& rhs) \
{ \
/* to avoid allocation side effects the result is always a vector \
* with default policies \
*/ \
Simd<resultScalarType, T_dim> result{}; \
for(uint32_t i = 0u; i < T_dim; i++) \
result[i] = lhs op rhs[i]; \
return result; \
}
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, +)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, -)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, *)
ALPAKA_VECTOR_BINARY_OP(typename, T_Type, /)
ALPAKA_VECTOR_BINARY_OP(typename, bool, >=)
ALPAKA_VECTOR_BINARY_OP(typename, bool, >)
ALPAKA_VECTOR_BINARY_OP(typename, bool, <=)
ALPAKA_VECTOR_BINARY_OP(typename, bool, <)
ALPAKA_VECTOR_BINARY_OP(typename, bool, &&)
ALPAKA_VECTOR_BINARY_OP(typename, bool, ||)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, %)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, <<)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, >>)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, &)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, |)
ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, ^)
#undef ALPAKA_VECTOR_BINARY_OP
/** @} */
template<typename T>
struct IsSimd : std::false_type
{
};
template<typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
struct IsSimd<Simd<T_Type, T_dim, T_Alignment, T_Storage>> : std::true_type
{
};
template<typename T>
constexpr bool isSimd_v = IsSimd<T>::value;
namespace concepts
{
template<typename T>
concept Simd = isSimd_v<T>;
template<typename T>
concept SimdOrScalar = (isSimd_v<T> || std::integral<T>);
template<typename T, typename T_RequiredComponent>
concept TypeOrSimd = (isSimd_v<T> || std::is_same_v<T, T_RequiredComponent>);
template<typename T, typename T_RequiredComponent>
concept SimdOrConvertibleType = (isSimd_v<T> || std::is_convertible_v<T, T_RequiredComponent>);
} // namespace concepts
namespace trait
{
template<typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
struct GetDim<alpaka::Simd<T_Type, T_dim, T_Alignment, T_Storage>>
{
static constexpr uint32_t value = T_dim;
};
template<typename T_Type, uint32_t T_dim, concepts::Alignment T_Alignment, typename T_Storage>
struct GetValueType<alpaka::Simd<T_Type, T_dim, T_Alignment, T_Storage>>
{
using type = T_Type;
};
} // namespace trait
namespace internal
{
template<
typename T_To,
typename T_Type,
uint32_t T_dim,
alpaka::concepts::Alignment T_Alignment,
typename T_Storage>
struct PCast::Op<T_To, alpaka::Simd<T_Type, T_dim, T_Alignment, T_Storage>>
{
constexpr decltype(auto) operator()(auto&& input) const
requires std::convertible_to<T_Type, T_To> && (!std::same_as<T_To, T_Type>)
{
return typename alpaka::Simd<T_To, T_dim, T_Alignment, T_Storage>::UniSimd(
[&](uint32_t idx) constexpr { return static_cast<T_To>(input[idx]); });
}
constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, T_Type>
{
return input;
}
};
} // namespace internal
}; // namespace alpaka
namespace std
{
template<typename T_Type, uint32_t T_dim, alpaka::concepts::Alignment T_Alignment, typename T_Storage>
struct tuple_size<alpaka::Simd<T_Type, T_dim, T_Alignment, T_Storage>>
{
static constexpr std::size_t value = T_dim;
};
template<
std::size_t I,
typename T_Type,
uint32_t T_dim,
alpaka::concepts::Alignment T_Alignment,
typename T_Storage>
struct tuple_element<I, alpaka::Simd<T_Type, T_dim, T_Alignment, T_Storage>>
{
using type = T_Type;
};
} // namespace std
// ==
// == ./include/alpaka/Simd.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/UniqueId.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <cstdint> // amalgamate: file already included
#include <source_location>
#include <string_view>
namespace alpaka
{
class UniqueId
{
public:
static constexpr size_t getId(std::source_location const location = std::source_location::current())
{
return generate(location);
}
private:
static constexpr size_t generate(std::source_location const& location)
{
size_t hash = 0xc6a4'a793'5bd1'e995;
hashCombine(hash, location.file_name());
hashCombine(hash, location.function_name());
hashCombine(hash, location.line());
hashCombine(hash, static_cast<size_t>(location.column()) << 32u);
return hash;
}
static constexpr void hashCombine(size_t& seed, std::string_view value)
{
for(char c : value)
{
seed ^= static_cast<size_t>(c) + 0x9e37'79b9 + (seed << 6) + (seed >> 2);
}
}
static constexpr void hashCombine(size_t& seed, size_t value)
{
seed ^= value + 0x9e37'79b9 + (seed << 6) + (seed >> 2);
}
};
    /** creates a unique id at each call site
     *
     * If a class stores the compile-time id and the file of the class is included in two translation units, the
     * id will be equal in both translation units.
     * The id is derived from the file name, function name, line, and column from where this method is called.
     * If this call is used to set the default value of a class template parameter, the unique number is generated
     * only once and not each time the class is used.
*
* @param location The location is the base for the unique id. For the same location the same id is generated.
* @return unique id
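     *
     * A usage sketch (the ids differ because the call sites differ):
     * @code{.cpp}
     * constexpr auto idA = alpaka::uniqueId();
     * constexpr auto idB = alpaka::uniqueId();
     * static_assert(idA != idB);
     * @endcode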
*/
inline consteval size_t uniqueId(std::source_location const location = std::source_location::current())
{
return UniqueId::getId(location);
}
} // namespace alpaka
// ==
// == ./include/alpaka/UniqueId.hpp ==
// ============================================================================
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/cuda/Api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/unifiedCudaHip/trait.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/concepts/api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/concepts/hasName.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/internal/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/KernelBundle.hpp ==
// ==
/* Copyright 2023 René Widera, Mehmet Yusufoglu
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/apply.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <utility> // amalgamate: file already included
namespace alpaka
{
namespace detail
{
template<typename T_Func, typename T_TupleLike, std::size_t... T_idx>
ALPAKA_FN_INLINE constexpr decltype(auto) applyImpl(
T_Func&& func,
T_TupleLike&& tuple,
std::index_sequence<T_idx...>)
{
using std::get;
return func(get<T_idx>(std::forward<T_TupleLike>(tuple))...);
}
} // namespace detail
/** Applies a function to the elements of a tuple-like object.
*
* This function forwards the function and the tuple-like object, and uses an index sequence to unpack the tuple.
*
* @param func The function to apply.
* @param tuple The tuple-like object containing the arguments for the function.
* @return The result of applying the function to the elements of the tuple-like object.
*/
template<typename T_Func, typename T_TupleLike>
ALPAKA_FN_INLINE constexpr decltype(auto) apply(T_Func&& func, T_TupleLike&& tuple)
{
/** @attention Do not use std::tuple_size_v here because it results in compile issues with gcc11.4 */
return detail::applyImpl(
std::forward<T_Func>(func),
std::forward<T_TupleLike>(tuple),
std::make_index_sequence<std::tuple_size<std::decay_t<T_TupleLike>>::value>{});
}
} // namespace alpaka
// ==
// == ./include/alpaka/apply.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/Dict.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/Tuple.hpp ==
// ==
/* Copyright 2025 Tapish Narwal, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <tuple> // amalgamate: file already included
#include <type_traits>
// #include <utility> // amalgamate: file already included
namespace alpaka
{
template<typename... T_Args>
struct Tuple;
namespace detail
{
template<std::size_t I, typename T>
struct TupleLeaf
{
using type = T;
T value;
};
template<typename IndexSequence, typename... T_Args>
struct TupleImpl;
template<std::size_t... Is, typename... T_Args>
struct TupleImpl<std::index_sequence<Is...>, T_Args...> : TupleLeaf<Is, T_Args>...
{
template<typename... T_CArgs>
constexpr TupleImpl(T_CArgs&&... us) noexcept((std::is_nothrow_constructible_v<T_Args, T_CArgs&&> && ...))
: TupleLeaf<Is, T_Args>{std::forward<T_CArgs>(us)}...
{
}
};
} // namespace detail
/** basic tuple implementation
*
     * This class is trivially copyable if all of its members are trivially copyable too and can therefore be used as
     * a collection to pass arguments into kernels. Use @see alpaka::apply to apply an operation to the tuple.
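     *
     * A usage sketch:
     * @code{.cpp}
     * auto t = alpaka::makeTuple(1, 2.0f);
     * auto sum = alpaka::apply([](auto a, auto b) { return a + b; }, t); // -> 3.0f
     * @endcode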
*/
template<typename... T_Args>
struct Tuple : detail::TupleImpl<std::make_index_sequence<sizeof...(T_Args)>, T_Args...>
{
using StdTuple = std::tuple<T_Args...>;
using Base = detail::TupleImpl<std::make_index_sequence<sizeof...(T_Args)>, T_Args...>;
template<typename... T_CArgs>
requires(
sizeof...(T_Args) == sizeof...(T_CArgs) && sizeof...(T_Args) > 0
&& (!std::is_same_v<std::remove_cvref_t<std::tuple_element_t<0, std::tuple<T_CArgs...>>>, Tuple>)
&& (std::is_constructible_v<T_Args, T_CArgs &&> && ...))
constexpr Tuple(T_CArgs&&... us) noexcept((std::is_nothrow_constructible_v<T_Args, T_CArgs&&> && ...))
: Base(std::forward<T_CArgs>(us)...)
{
}
/** get element by index
*
* @tparam I index which should not be larger than the number of elements -1
* @{
*/
template<size_t I>
constexpr auto const& get() const
{
static_assert(I < sizeof...(T_Args), "Index is outside of the allowed range.");
return static_cast<detail::TupleLeaf<I, std::tuple_element_t<I, StdTuple>> const&>(*this).value;
}
template<size_t I>
constexpr auto const& get()
{
static_assert(I < sizeof...(T_Args), "Index is outside of the allowed range.");
return static_cast<detail::TupleLeaf<I, std::tuple_element_t<I, StdTuple>>&>(*this).value;
}
/** @} */
};
template<typename... T_Args>
Tuple(T_Args&&...) -> Tuple<T_Args...>;
template<size_t T_idx>
constexpr decltype(auto) get(auto&& t) noexcept requires(alpaka::isSpecializationOf_v<ALPAKA_TYPEOF(t), Tuple>)
{
return ALPAKA_FORWARD(t).template get<T_idx>();
}
constexpr auto makeTuple(auto&&... args)
{
return Tuple{ALPAKA_FORWARD(args)...};
}
} // namespace alpaka
namespace std
{
// Specialization of tuple_size for our custom Tuple
template<typename... T_Args>
struct tuple_size<alpaka::Tuple<T_Args...>> : std::integral_constant<std::size_t, sizeof...(T_Args)>
{
};
template<std::size_t I, typename... T_Args>
struct tuple_element<I, alpaka::Tuple<T_Args...>>
{
using type = typename std::tuple_element_t<I, typename alpaka::Tuple<T_Args...>::StdTuple>;
};
} // namespace std
// ==
// == ./include/alpaka/Tuple.hpp ==
// ============================================================================
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/core/util.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdio> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
// #include <utility> // amalgamate: file already included
namespace alpaka
{
// https://stackoverflow.com/a/64606884
template<typename X, typename T_Tuple>
struct Idx
{
static_assert(sizeof(T_Tuple) && false);
};
template<typename X, template<typename...> typename T_Tuple, typename... T>
struct Idx<X, T_Tuple<T...>>
{
template<std::size_t... idx>
static constexpr ssize_t find_idx(std::index_sequence<idx...>)
{
ssize_t found_idx = -1;
// notUsed is required to avoid warning that the expression is not used
[[maybe_unused]] bool notUsed
= ((std::is_same_v<X, typename T::KeyType> && (found_idx = idx, true)) || ...);
return found_idx;
}
public:
static constexpr ssize_t value = find_idx(std::index_sequence_for<T...>{});
};
template<typename X, template<typename...> typename T_Tuple>
class Idx<X, T_Tuple<>>
{
static constexpr ssize_t find_idx(std::index_sequence<>)
{
return -1;
}
public:
static constexpr ssize_t value = find_idx(std::index_sequence_for<>{});
};
template<typename T_Key, typename T_Tuple>
inline consteval ssize_t idx(T_Tuple&& t, T_Key const& key = T_Key{})
{
constexpr auto idx = Idx<T_Key, std::decay_t<T_Tuple>>::value;
return idx;
}
template<typename T_Key, typename T_Tuple>
consteval bool hasTag(T_Tuple&& t, T_Key const& key = T_Key{})
{
constexpr auto idx = Idx<T_Key, std::decay_t<T_Tuple>>::value;
return idx != -1;
}
template<typename T_Key, typename T_Tuple>
inline constexpr decltype(auto) getTag(T_Tuple&& t, T_Key const& key = T_Key{})
{
constexpr auto idx = Idx<T_Key, std::decay_t<T_Tuple>>::value;
static_assert(idx != -1, "Member in dict missing!");
static_assert(idx < std::tuple_size_v<std::decay_t<T_Tuple>>, "index out of range!");
return unWrapp(get<idx>(std::forward<T_Tuple>(t)).value);
}
template<typename T_Key, typename T_Value>
struct DictEntry
{
using KeyType = T_Key;
using ValueType = T_Value;
constexpr DictEntry(T_Key const, T_Value const& v) : value{v}
{
}
constexpr DictEntry() = default;
T_Value value;
};
namespace trait
{
template<typename T_Object, typename T_Sfinae = void>
struct ToDictEntry
{
template<typename T>
static constexpr auto get(T&& data)
{
return std::forward<T>(data);
}
};
} // namespace trait
template<typename... T_DictEntry>
struct Dict
{
static_assert(sizeof...(T_DictEntry) && false);
};
template<typename... T_Keys, typename... T_Values>
struct Dict<DictEntry<T_Keys, T_Values>...> : Tuple<DictEntry<T_Keys, T_Values>...>
{
using TupleType = Tuple<DictEntry<T_Keys, T_Values>...>;
constexpr Dict(Tuple<DictEntry<T_Keys, T_Values>...> const& data) : Tuple<DictEntry<T_Keys, T_Values>...>{data}
{
}
constexpr Dict(DictEntry<T_Keys, T_Values> const&... dictEntries)
: Tuple<DictEntry<T_Keys, T_Values>...>{dictEntries...}
{
}
constexpr Dict(Dict const&) = default;
constexpr Dict(Dict&&) = default;
static constexpr auto makeDict() requires(std::default_initializable<T_Values>, ...)
{
return Dict{alpaka::makeTuple(DictEntry<T_Keys, T_Values>{}...)};
}
ALPAKA_NO_HOST_ACC_WARNING
constexpr decltype(auto) operator[](auto const tag) const
{
return getTag(*this, tag);
}
ALPAKA_NO_HOST_ACC_WARNING
constexpr decltype(auto) operator[](auto const tag)
{
return getTag(*this, tag);
}
};
template<size_t T_idx>
constexpr decltype(auto) get(auto& t) noexcept requires(alpaka::isSpecializationOf_v<ALPAKA_TYPEOF(t), Dict>)
{
return t.template get<T_idx>();
}
template<size_t T_idx>
constexpr decltype(auto) get(auto const& t) noexcept requires(alpaka::isSpecializationOf_v<ALPAKA_TYPEOF(t), Dict>)
{
return t.template get<T_idx>();
}
// type deduction guide
template<typename... T_Keys, typename... T_Values>
ALPAKA_FN_HOST_ACC Dict(Tuple<DictEntry<T_Keys, T_Values>...> const&) -> Dict<DictEntry<T_Keys, T_Values>...>;
template<typename... T_Keys, typename... T_Values>
ALPAKA_FN_HOST_ACC Dict(DictEntry<T_Keys, T_Values> const&...) -> Dict<DictEntry<T_Keys, T_Values>...>;
} // namespace alpaka
namespace std
{
template<typename... T_Keys, typename... T_Values>
struct tuple_size<alpaka::Dict<alpaka::DictEntry<T_Keys, T_Values>...>>
{
static constexpr std::size_t value = sizeof...(T_Keys);
};
template<std::size_t I, typename... T_Keys, typename... T_Values>
struct tuple_element<I, alpaka::Dict<alpaka::DictEntry<T_Keys, T_Values>...>>
{
using type = decltype(alpaka::get<I>(std::declval<alpaka::Tuple<alpaka::DictEntry<T_Keys, T_Values>...>>()));
};
} // namespace std
namespace alpaka
{
template<std::size_t... idx0, std::size_t... idx1, typename T_Dict0, typename T_Dict1>
constexpr auto joinDictHelper(
std::index_sequence<idx0...>,
std::index_sequence<idx1...>,
T_Dict0 dict0,
T_Dict1 dict1)
{
return Dict{get<idx0>(dict0)..., get<idx1>(dict1)...};
}
template<typename... T_Entries0, typename... T_Entries1>
constexpr auto joinDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
{
return joinDictHelper(
std::index_sequence_for<T_Entries0...>{},
std::index_sequence_for<T_Entries1...>{},
dict0,
dict1);
}
template<bool condition, typename... T_Entries0, typename... T_Entries1>
requires(condition == true)
constexpr auto conditionalAppendDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
{
return joinDictHelper(
std::index_sequence_for<T_Entries0...>{},
std::index_sequence_for<T_Entries1...>{},
dict0,
dict1);
}
template<bool condition, typename... T_Entries0, typename... T_Entries1>
requires(condition == false)
constexpr auto conditionalAppendDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
{
return dict0;
}
} // namespace alpaka
// ==
// == ./include/alpaka/core/Dict.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/RemoveRestrict.hpp ==
// ==
/* Copyright 2021 Rene Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
namespace alpaka
{
//! Removes __restrict__ from a type
template<typename T>
struct remove_restrict
{
using type = T;
};
#if ALPAKA_COMP_MSVC
template<typename T>
struct remove_restrict<T* __restrict>
{
using type = T*;
};
#else
template<typename T>
struct remove_restrict<T* __restrict__>
{
using type = T*;
};
#endif
//! Helper to remove __restrict__ from a type
template<typename T>
using remove_restrict_t = typename remove_restrict<T>::type;
} // namespace alpaka
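// Minimal illustration of the trait above (sketch, not part of the original header):
//
//   static_assert(std::is_same_v<alpaka::remove_restrict_t<int*>, int*>);
//   static_assert(std::is_same_v<alpaka::remove_restrict_t<int* __restrict__>, int*>); // non-MSVC spelling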
// ==
// == ./include/alpaka/core/RemoveRestrict.hpp ==
// ============================================================================
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/concepts.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
#include <type_traits>
namespace alpaka::concepts
{
/** @todo Replace usage with alpaka::concepts::IMdSpan
*/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept MdSpan = alpaka::isMdSpan_v<T>
&& (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
|| std::same_as<T_ValueType, alpaka::NotRequired>);
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/mem/concepts.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onHost/demangledName.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#include <regex>
#include <source_location>
// #include <string> // amalgamate: file already included
#include <string_view>
/** This type is required to be in the global namespace to avoid invalid offsets during demangling */
struct AlpakaDemangleReferenceType
{
};
namespace alpaka::onHost
{
/// \file
/// use source_location to derive the demangled type name
/// based on:
/// https://www.reddit.com/r/cpp/comments/lfi6jt/finally_a_possibly_portable_way_to_convert_types/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
template<typename T>
constexpr auto EmbedTypeIntoSignature()
{
return std::string_view{std::source_location::current().function_name()};
}
template<typename T>
struct Demangled
{
static constexpr auto name()
{
constexpr size_t testSignatureLength = sizeof("AlpakaDemangleReferenceType") - 1;
auto const DummySignature = EmbedTypeIntoSignature<AlpakaDemangleReferenceType>();
            // count chars until the type name starts
auto const startPosition = DummySignature.find("AlpakaDemangleReferenceType");
            // count chars after the type name by removing the type name length and the leading part
auto const tailLength = DummySignature.size() - startPosition - testSignatureLength;
auto const EmbeddingSignature = EmbedTypeIntoSignature<T>();
auto const typeLength = EmbeddingSignature.size() - startPosition - tailLength;
return EmbeddingSignature.substr(startPosition, typeLength);
}
};
template<typename T>
constexpr auto demangledName()
{
return std::string(Demangled<T>::name());
}
template<typename T>
constexpr auto demangledName(T const&)
{
return std::string(Demangled<T>::name());
}
/** Simplify the C++ signature of a function
*
* Template parameters will be left out and the alpaka namespace will be removed.
*/
inline std::string simplifyFunctionSignature(std::string const& deName)
{
// 1. Remove the type assignments in template parameters (e.g., T_DeviceKind = ...)
std::string simplified = std::regex_replace(deName, std::regex("<[^>]*=\\s*[^>]*>"), "<...>");
// 2. Remove redundant occurrences of "alpaka::" within the template arguments (keep it once)
simplified = std::regex_replace(simplified, std::regex("alpaka::(?![A-Za-z0-9_]+<)"), "");
// 3. Simplify nested templates by removing template arguments, e.g., <...>
simplified = std::regex_replace(simplified, std::regex("<[^>]*>"), "<...>");
// 4. Optionally remove remaining `alpaka::` namespaces if desired
simplified = std::regex_replace(simplified, std::regex("^(alpaka::)+"), "");
return simplified;
}
/** Get a simplified demangled name of an object
*
* Template parameters will be left out and the alpaka namespace will be removed.
*/
    inline std::string demangledNameShort(auto const& any)
    {
        return simplifyFunctionSignature(demangledName(any));
    }
} // namespace alpaka::onHost
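// Illustrative sketch (not part of the original header); the exact strings are compiler dependent.
//
//   auto full = alpaka::onHost::demangledName<std::vector<int>>();
//   // e.g. "std::vector<int, std::allocator<int>>"
//   auto simplified = alpaka::onHost::simplifyFunctionSignature(full);
//   // template arguments collapse to "<...>" and leading "alpaka::" prefixes are stripped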
// ==
// == ./include/alpaka/onHost/demangledName.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <tuple> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
namespace onHost
{
/** Provides an instance of an object which can be used within the compute kernel*/
struct MakeAccessibleOnAcc
{
template<typename T_Any>
struct Op
{
/** @return @attention returns a reference to the original data */
auto const& operator()(auto const& any) const
{
return any;
}
auto& operator()(auto& any) const
{
return any;
}
};
};
/** Provides an instance of an object which can be used within the compute kernel
*
* @return compute kernel compatible object if MakeAccessibleOnAcc is specialized else the identity
*/
inline decltype(auto) makeAccessibleOnAcc(auto&& any)
{
return MakeAccessibleOnAcc::Op<ALPAKA_TYPEOF(any)>{}(ALPAKA_FORWARD(any));
}
} // namespace onHost
    //! \brief Binds a kernel function object and its arguments together. Once an instance of this class is
    //! created, the arguments no longer need to be passed separately to functions that require the kernel function
    //! together with its arguments.
    //! \tparam TKernelFn The kernel function object type.
    //! \tparam TArgs Kernel function object invocation argument types as a parameter pack.
template<typename TKernelFn, typename... TArgs>
class KernelBundle
{
public:
//! The function object type
using KernelFn = std::decay_t<TKernelFn>;
//! Tuple type to encapsulate kernel function argument types and argument values
using ArgTuple = std::conditional_t<
sizeof...(TArgs) == 0,
std::tuple<>,
alpaka::Tuple<remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>>;
// Constructor
constexpr KernelBundle(KernelFn const& kernelFn) : m_kernelFn{kernelFn}, m_args(std::tuple<>{})
{
static_assert(
alpaka::concepts::KernelFn<KernelFn>,
"Kernel functor must be trivially copyable or specialize trait::IsKernelTriviallyCopyable<>!");
}
// Constructor
constexpr KernelBundle(KernelFn const& kernelFn, auto&&... args)
: m_kernelFn{kernelFn}
, m_args(onHost::makeAccessibleOnAcc(ALPAKA_FORWARD(args))...)
{
static_assert(
alpaka::concepts::KernelFn<KernelFn>,
"Kernel functor must be trivially copyable or specialize trait::IsKernelTriviallyCopyable<>!");
static_assert(
(alpaka::concepts::KernelArg<
remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>>
&& ...),
"All kernel arguments must be trivially copyable or specialize "
"trait::IsKernelArgumentTriviallyCopyable<>!");
}
constexpr KernelBundle(KernelBundle const& b) = default;
constexpr KernelBundle& operator=(KernelBundle const&) = default;
        /** allow move assignment and construction
         *
         * @attention if the functor or the arguments contain non-movable types the move operations can be
         * inaccessible.
*
* @{
*/
constexpr KernelBundle(KernelBundle&& b) = default;
constexpr KernelBundle& operator=(KernelBundle&&) = default;
/** @} */
template<typename TAcc>
requires(
alpaka::concepts::KernelFn<KernelFn>
&& std::is_invocable_v<
std::remove_const_t<KernelFn>,
TAcc,
remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>)
constexpr auto operator()(TAcc const& acc) const
{
static_assert(
std::is_invocable_v<
std::add_const_t<KernelFn>,
TAcc,
remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>,
"the operator() function of a kernel must be marked const");
static_assert(
std::same_as<
void,
std::invoke_result_t<
std::add_const_t<KernelFn>,
TAcc,
remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>>,
"the return type of the operator() function of a kernel must be void");
alpaka::apply(
                /* It is required to take the arguments as const references.
                 * The reason is that these arguments are shared between the threads of a block. If the user wants
                 * to mutate them, a non-const copy should be used in the kernel function signature. This is also
                 * why const correctness for buffers and views cannot be kept within their copy constructors.
                 */
[&](alpaka::concepts::KernelArg auto const&... args) constexpr { m_kernelFn(acc, args...); },
m_args);
}
KernelFn m_kernelFn;
// Store the argument types without const and reference
ArgTuple m_args;
};
//! \brief User defined deduction guide with trailing return type. For CTAD during the construction.
//! \tparam TKernelFn The kernel function object type.
//! \tparam TArgs Kernel function object argument types as a parameter pack.
//! \param kernelFn The kernel object
//! \param args The kernel invocation arguments.
//! \return Kernel function bundle. An instance of KernelBundle which consists the kernel function object and its
//! arguments.
template<typename TKernelFn, typename... TArgs>
ALPAKA_FN_HOST KernelBundle(TKernelFn const&, TArgs&&...) -> KernelBundle<TKernelFn, TArgs...>;
namespace trait
{
template<typename T>
struct IsKernelBundle : std::integral_constant<bool, isSpecializationOf_v<T, KernelBundle>>
{
};
} // namespace trait
template<typename T>
constexpr bool isKernelBundle_v = trait::IsKernelBundle<T>::value;
} // namespace alpaka
namespace alpaka::concepts
{
/** Concept to check if a type is a KernelBundle
*
* @tparam T Type to check
*/
template<typename T>
concept KernelBundle = isKernelBundle_v<T>;
} // namespace alpaka::concepts
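// Illustrative sketch (not part of the original header): bundling a kernel functor with its arguments.
// `outSpan` and `inSpan` are hypothetical md-span-like kernel arguments.
//
//   struct ScaleKernel
//   {
//       // the operator() of a kernel must be const and return void
//       void operator()(auto const& acc, auto out, auto in, float factor) const
//       {
//           /* ... */
//       }
//   };
//
//   auto bundle = alpaka::KernelBundle{ScaleKernel{}, outSpan, inSpan, 2.0f};
//   static_assert(alpaka::concepts::KernelBundle<ALPAKA_TYPEOF(bundle)>);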
// ==
// == ./include/alpaka/KernelBundle.hpp ==
// ============================================================================
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/Handle.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <iostream> // amalgamate: file already included
#include <memory>
#include <type_traits>
namespace alpaka::onHost
{
template<typename T_Object, typename... T_Args>
inline auto make_sharedSingleton(T_Args&&... args)
{
static std::mutex mutex;
static std::weak_ptr<T_Object> platform;
std::lock_guard<std::mutex> lk(mutex);
if(auto sharedPtr = platform.lock())
{
return sharedPtr;
}
auto new_platform = std::make_shared<T_Object>(std::forward<T_Args>(args)...);
platform = new_platform;
return new_platform;
}
template<typename T>
using Handle = std::shared_ptr<T>;
} // namespace alpaka::onHost
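// Illustrative sketch (not part of the original header): make_sharedSingleton() returns the same instance
// for as long as at least one handle is alive; after all handles are released a new instance is created.
//
//   struct MyPlatform { /* hypothetical */ };
//   alpaka::onHost::Handle<MyPlatform> a = alpaka::onHost::make_sharedSingleton<MyPlatform>();
//   alpaka::onHost::Handle<MyPlatform> b = alpaka::onHost::make_sharedSingleton<MyPlatform>();
//   // a.get() == b.get() while both handles exist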
// ==
// == ./include/alpaka/onHost/Handle.hpp ==
// ============================================================================
// #include "alpaka/onHost/demangledName.hpp" // amalgamate: file already inlined
namespace alpaka
{
/** alpaka internal implementations.
*
     * @attention do not use any functions from this namespace in user applications.
* The interface can change at any time without further notice and is for internal use only.
*/
namespace internal
{
struct GetStaticName
{
template<typename T_Any>
struct Op
{
auto operator()([[maybe_unused]] T_Any const& any) const
{
if constexpr(requires { T_Any::getName(); })
return T_Any::getName();
else
return onHost::demangledName(any);
}
};
};
struct GetName
{
template<typename T_Any>
struct Op
{
auto operator()(T_Any const& any) const
{
return any.getName();
}
};
};
struct GetApi
{
template<typename T_Any>
struct Op
{
inline constexpr auto operator()(auto&& any) const
{
return any.getApi();
}
};
};
inline constexpr auto getApi(auto&& any)
{
return GetApi::Op<std::decay_t<decltype(any)>>{}(any);
}
template<typename T_Any>
inline constexpr auto getApi(onHost::Handle<T_Any>&& anyHandle)
{
return GetApi::Op<ALPAKA_TYPEOF(*anyHandle.get())>{}(*anyHandle.get());
}
struct GetDeviceType
{
template<typename T_Any>
struct Op
{
inline constexpr auto operator()(auto&& any) const
{
return any.getDeviceKind();
}
};
};
inline constexpr auto getDeviceKind(auto&& any)
{
return GetDeviceType::Op<std::decay_t<decltype(any)>>{}(any);
}
struct GetAlignment
{
template<typename T_Any>
struct Op
{
constexpr auto operator()(auto&& any) const requires requires { any.getAlignment(); }
{
return any.getAlignment();
}
constexpr auto operator()(auto&& any) const
{
return Alignment<>{};
}
};
};
constexpr auto getAlignment(auto&& any)
{
return GetAlignment::Op<std::decay_t<decltype(any)>>{}(any);
}
/** Load data from a data source as SIMD vector
*
         * A data source is not required to have physically stored data, it can also be a generator, therefore only
         * the data source knows how to create a SIMD vector.
*/
struct LoadAsSimd
{
template<typename T_AnyDataSource, alpaka::concepts::Alignment T_Alignment, alpaka::concepts::Vector T_Idx>
struct Op
{
/** Get data as SIMD vector
*
* @see loadAsSimd for more details.
*/
template<uint32_t T_simdWidth>
constexpr auto load(auto&& anyDataSource, T_Alignment dataAlignment, T_Idx const& index) const;
};
};
/** Get data as SIMD vector
*
         * Load T_simdWidth contiguous elements starting from index. The data is contiguous in the fast moving dimension of
* index.
*
* @tparam T_simdWidth number of elements in the SIMD vector
* @param anyDataSource data source to load data from
         * @param dataAlignment Alignment of the resulting SIMD vector. This can be smaller than or equal to the
         * alignment of the data source due to offsets that may have been applied before.
* @param index Offset index relative to the first element of data source.
* @return SIMD vector with data loaded from the data source, aligned to dataAlignment
*/
template<uint32_t T_simdWidth>
constexpr auto loadAsSimd(auto&& anyDataSource, auto dataAlignment, auto const& index)
{
return LoadAsSimd::Op<ALPAKA_TYPEOF(anyDataSource), ALPAKA_TYPEOF(dataAlignment), ALPAKA_TYPEOF(index)>{}
.template load<T_simdWidth>(ALPAKA_FORWARD(anyDataSource), dataAlignment, index);
}
} // namespace internal
} // namespace alpaka
// ==
// == ./include/alpaka/internal/interface.hpp ==
// ============================================================================
// #include <concepts> // amalgamate: file already included
#include <type_traits>
namespace alpaka::concepts
{
    /** Concept to check whether a static name can be derived for a type.
     *
     * The name is taken from a static getName() member if available, otherwise the demangled type name is used.
     */
template<typename T>
concept HasStaticName = requires(T t) {
{ internal::GetStaticName::Op<std::decay_t<T>>{}(t) } -> std::convertible_to<std::string>;
};
template<typename T>
concept HasName = requires(T t) {
{ internal::GetName::Op<T>{}(t) } -> std::convertible_to<std::string>;
};
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/concepts/hasName.hpp ==
// ============================================================================
// #include <concepts> // amalgamate: file already included
namespace alpaka
{
namespace detail
{
struct ApiBase
{
};
} // namespace detail
namespace trait
{
template<typename T_Type>
struct IsApi : std::is_base_of<detail::ApiBase, T_Type>
{
};
} // namespace trait
template<typename T_Type>
constexpr bool isApi_v = trait::IsApi<T_Type>::value;
namespace concepts
{
/** @brief Concept to check for APIs
*
* @details
* This concept requires that the template is an API. An API in alpaka is the representation of a software
* library that can target one or multiple accelerators. Examples of APIs are alpaka::api::Cuda and
* alpaka::api::Host. An Api together with an alpaka::concepts::DeviceKind can make up an
* alpaka::onHost::Device.
*/
template<typename T>
concept Api = isApi_v<T> && requires(T t) { requires HasStaticName<T>; };
} // namespace concepts
} // namespace alpaka
// ==
// == ./include/alpaka/api/concepts/api.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/trait.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/concepts/api.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/math/internal/math.hpp ==
// ==
/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
* Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/math/internal/stlMath.hpp ==
// ==
/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
* Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
#include <cmath>
#include <complex>
namespace alpaka::math::internal
{
struct StlMath
{
};
constexpr auto stlMath = StlMath{};
} // namespace alpaka::math::internal
// ==
// == ./include/alpaka/math/internal/stlMath.hpp ==
// ============================================================================
// #include <cmath> // amalgamate: file already included
// #include <complex> // amalgamate: file already included
#include <type_traits>
namespace alpaka::math::internal
{
#define ALPAKA_MATH_UNARY_FUNCTOR(FUNC_NAME, OP_NAME) \
struct FUNC_NAME \
{ \
template<typename T_MathImpl, typename T_Arg> \
struct Op \
{ \
constexpr auto operator()(T_MathImpl, T_Arg const& argument) const \
{ \
if constexpr(std::same_as<T_MathImpl, StlMath>) \
{ \
/* use for ADL lookup namespace std only if StlMath is used */ \
using std::OP_NAME; \
return OP_NAME(argument); \
} \
else \
return OP_NAME(argument); \
} \
}; \
}
ALPAKA_MATH_UNARY_FUNCTOR(Abs, abs);
ALPAKA_MATH_UNARY_FUNCTOR(Cos, cos);
ALPAKA_MATH_UNARY_FUNCTOR(Acos, acos);
ALPAKA_MATH_UNARY_FUNCTOR(Acosh, acosh);
ALPAKA_MATH_UNARY_FUNCTOR(Cosh, cosh);
ALPAKA_MATH_UNARY_FUNCTOR(Sin, sin);
ALPAKA_MATH_UNARY_FUNCTOR(Asin, asin);
ALPAKA_MATH_UNARY_FUNCTOR(Asinh, asinh);
ALPAKA_MATH_UNARY_FUNCTOR(Sinh, sinh);
ALPAKA_MATH_UNARY_FUNCTOR(Tan, tan);
ALPAKA_MATH_UNARY_FUNCTOR(Atan, atan);
ALPAKA_MATH_UNARY_FUNCTOR(Atanh, atanh);
ALPAKA_MATH_UNARY_FUNCTOR(Tanh, tanh);
ALPAKA_MATH_UNARY_FUNCTOR(Cbrt, cbrt);
ALPAKA_MATH_UNARY_FUNCTOR(Ceil, ceil);
ALPAKA_MATH_UNARY_FUNCTOR(Round, round);
ALPAKA_MATH_UNARY_FUNCTOR(Lround, lround);
ALPAKA_MATH_UNARY_FUNCTOR(Llround, llround);
ALPAKA_MATH_UNARY_FUNCTOR(Trunc, trunc);
ALPAKA_MATH_UNARY_FUNCTOR(Floor, floor);
ALPAKA_MATH_UNARY_FUNCTOR(Log, log);
ALPAKA_MATH_UNARY_FUNCTOR(Log2, log2);
ALPAKA_MATH_UNARY_FUNCTOR(Log10, log10);
ALPAKA_MATH_UNARY_FUNCTOR(Exp, exp);
ALPAKA_MATH_UNARY_FUNCTOR(Sqrt, sqrt);
ALPAKA_MATH_UNARY_FUNCTOR(Arg, arg);
ALPAKA_MATH_UNARY_FUNCTOR(Erf, erf);
ALPAKA_MATH_UNARY_FUNCTOR(Isnan, isnan);
ALPAKA_MATH_UNARY_FUNCTOR(Isinf, isinf);
ALPAKA_MATH_UNARY_FUNCTOR(Isfinite, isfinite);
ALPAKA_MATH_UNARY_FUNCTOR(Conj, conj);
#undef ALPAKA_MATH_UNARY_FUNCTOR
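    // Illustrative sketch (not part of the original header): each generated functor is a customization point.
    // For the StlMath backend the call falls back to the std:: function found via ADL, e.g.:
    //
    //   float s = alpaka::math::internal::Sin::Op<alpaka::math::internal::StlMath, float>{}(
    //       alpaka::math::internal::stlMath,
    //       0.5f);
    //
    // Other backends (e.g. CUDA/HIP) specialize Op<> or rely on ADL finding their own overloads.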
namespace detail
{
//! Fallback implementation when no better ADL match was found
template<typename T_Arg>
ALPAKA_FN_INLINE constexpr auto rsqrt(T_Arg const& arg)
{
// Still use ADL to try find sqrt(arg)
using std::sqrt;
return static_cast<T_Arg>(1) / sqrt(arg);
}
} // namespace detail
struct Rsqrt
{
template<typename T_MathImpl, typename T_Arg>
struct Op
{
constexpr auto operator()(T_MathImpl, T_Arg const& arg) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using detail::rsqrt;
return rsqrt(arg);
}
else
return rsqrt(arg);
}
};
};
struct Atan2
{
template<typename T_MathImpl, typename T_Y, typename T_X>
struct Op
{
constexpr auto operator()(T_MathImpl, T_Y const& y, T_X const& x) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::atan2;
return atan2(y, x);
}
else
return atan2(y, x);
}
};
};
namespace detail
{
//! Fallback implementation when no better ADL match was found
template<typename T_Arg>
constexpr auto sincos(T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos)
{
// Still use ADL to try find sin(arg) and cos(arg)
using std::sin;
result_sin = sin(arg);
using std::cos;
result_cos = cos(arg);
}
} // namespace detail
// Sincos function
struct SinCos
{
template<typename T_MathImpl, typename T_Arg>
struct Op
{
constexpr auto operator()(T_MathImpl, T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using detail::sincos;
return sincos(arg, result_sin, result_cos);
}
else
return sincos(arg, result_sin, result_cos);
}
};
};
struct Copysign
{
template<typename T_MathImpl, typename T_Mag, typename T_Sgn>
struct Op
{
constexpr auto operator()(T_MathImpl, T_Mag const& mag, T_Sgn const& sgn) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::copysign;
return copysign(mag, sgn);
}
else
return copysign(mag, sgn);
}
};
};
struct Min
{
template<typename T_MathImpl, typename T_A, typename T_B>
struct Op
{
constexpr auto operator()(T_MathImpl, T_A const& a, T_B const& b) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::min;
return min(a, b);
}
else
return min(a, b);
}
};
};
struct Max
{
template<typename T_MathImpl, typename T_A, typename T_B>
struct Op
{
constexpr auto operator()(T_MathImpl, T_A const& a, T_B const& b) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::max;
return max(a, b);
}
else
return max(a, b);
}
};
};
struct Pow
{
template<typename T_MathImpl, typename T_Base, typename T_Exp>
struct Op
{
constexpr auto operator()(T_MathImpl, T_Base const& base, T_Exp const& exp) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::pow;
return pow(base, exp);
}
else
return pow(base, exp);
}
};
};
struct Fmod
{
template<typename T_MathImpl, typename T_X, typename T_Y>
struct Op
{
constexpr auto operator()(T_MathImpl, T_X const& x, T_Y const& y) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::fmod;
return fmod(x, y);
}
else
return fmod(x, y);
}
};
};
struct Remainder
{
template<typename T_MathImpl, typename T_X, typename T_Y>
struct Op
{
constexpr auto operator()(T_MathImpl, T_X const& x, T_Y const& y) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::remainder;
return remainder(x, y);
}
else
return remainder(x, y);
}
};
};
struct Fma
{
template<typename T_MathImpl, typename T_X, typename T_Y, typename T_Z>
struct Op
{
constexpr auto operator()(T_MathImpl, T_X const& x, T_Y const& y, T_Z const& z) const
{
if constexpr(std::same_as<T_MathImpl, StlMath>)
{
// use for ADL lookup namespace std only if StlMath is used
using std::fma;
return fma(x, y, z);
}
else
return fma(x, y, z);
}
};
};
} // namespace alpaka::math::internal
// ============================================================================
// == ./include/alpaka/math/internal/stlMathImpl.hpp ==
// ==
/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
* Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
/** @file This file contains specializations of methods where we do not want to fall back to `std::*` functions.
*/
// ============================================================================
// == ./include/alpaka/core/Unreachable.hpp ==
// ==
/* Copyright 2022 Jan Stephan, Jeffrey Kelling
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
//! Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches. It will issue
//! a false warning about a missing return statement unless it is told that the following code section is unreachable.
//!
//! \param x A dummy value for the expected return type of the calling function.
#if (ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX)
# if ALPAKA_LANG_CUDA >= ALPAKA_VERSION_NUMBER(11, 3, 0)
# define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
# else
# define ALPAKA_UNREACHABLE(...) return __VA_ARGS__
# endif
#elif ALPAKA_COMP_MSVC
# define ALPAKA_UNREACHABLE(...) __assume(false)
#elif ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
# define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
#else
# define ALPAKA_UNREACHABLE(...)
#endif
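// Illustrative usage sketch (not part of the original header), mirroring how the macro is used below:
//
//   template<typename T>
//   constexpr int category()
//   {
//       if constexpr(std::is_integral_v<T>)
//           return 0;
//       else if constexpr(std::is_floating_point_v<T>)
//           return 1;
//       else
//           static_assert(!sizeof(T), "unsupported type");
//       ALPAKA_UNREACHABLE(0); // silences bogus "missing return" warnings on affected compilers
//   }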
// ==
// == ./include/alpaka/core/Unreachable.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/decay.hpp ==
// ==
/* Copyright 2023 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
#include <type_traits>
namespace alpaka
{
//! Provides a decaying wrapper around std::is_same. Example: is_decayed_v<volatile float, float> returns true.
template<typename T, typename U>
inline constexpr auto is_decayed_v = std::is_same_v<std::decay_t<T>, std::decay_t<U>>;
} // namespace alpaka
// ==
// == ./include/alpaka/core/decay.hpp ==
// ============================================================================
// #include "alpaka/math/internal/math.hpp" // amalgamate: file already inlined
// #include "alpaka/math/internal/stlMath.hpp" // amalgamate: file already inlined
// #include <bit> // amalgamate: file already included
// #include <cmath> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
#include <type_traits>
namespace alpaka::math::internal
{
template<typename T_A, typename T_B>
requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
struct Min::Op<StlMath, T_A, T_B>
{
constexpr auto operator()(StlMath, T_A const& a, T_B const& b) const
{
if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
{
using std::min;
return min(a, b);
}
else if constexpr(
is_decayed_v<T_A, float> || is_decayed_v<T_B, float> || is_decayed_v<T_A, double>
|| is_decayed_v<T_B, double>)
{
using std::fmin;
return fmin(a, b);
}
else
static_assert(!sizeof(T_A), "Unsupported data type");
ALPAKA_UNREACHABLE(std::common_type_t<T_A, T_B>{});
}
};
template<typename T_A, typename T_B>
requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
struct Max::Op<StlMath, T_A, T_B>
{
constexpr auto operator()(StlMath, T_A const& a, T_B const& b) const
{
if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
{
using std::max;
return max(a, b);
}
else if constexpr(
is_decayed_v<T_A, float> || is_decayed_v<T_B, float> || is_decayed_v<T_A, double>
|| is_decayed_v<T_B, double>)
{
using std::fmax;
return fmax(a, b);
}
else
static_assert(!sizeof(T_A), "Unsupported data type");
ALPAKA_UNREACHABLE(std::common_type_t<T_A, T_B>{});
}
};
    //! Custom IEEE 754 bitwise implementation of isnan.
    //! The std counterpart does not work correctly with `-ffast-math` on CPU.
template<std::floating_point T_Arg>
struct Isnan::Op<StlMath, T_Arg>
{
constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
{
if constexpr(std::is_same_v<T_Arg, float>)
{
constexpr uint32_t expMask = 0x7F80'0000;
constexpr uint32_t fracMask = 0x007F'FFFF;
uint32_t bits = std::bit_cast<uint32_t>(arg);
bool result = ((bits & expMask) == expMask) && (bits & fracMask);
return result;
}
else if constexpr(std::is_same_v<T_Arg, double>)
{
constexpr uint64_t expMask = 0x7FF0'0000'0000'0000ULL;
constexpr uint64_t fracMask = 0x000F'FFFF'FFFF'FFFFULL;
uint64_t bits = std::bit_cast<uint64_t>(arg);
bool result = ((bits & expMask) == expMask) && (bits & fracMask);
return result;
}
else
{
static_assert(!sizeof(T_Arg), "Unsupported floating-point type");
ALPAKA_UNREACHABLE(T_Arg{});
}
}
};
    //! Custom IEEE 754 bitwise implementation of isinf.
    //! The std counterpart does not work correctly with `-ffast-math` on CPU.
template<std::floating_point T_Arg>
struct Isinf::Op<StlMath, T_Arg>
{
constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
{
if constexpr(std::is_same_v<T_Arg, float>)
{
constexpr uint32_t expMask = 0x7F80'0000;
constexpr uint32_t fracMask = 0x007F'FFFF;
uint32_t bits = std::bit_cast<uint32_t>(arg);
bool result = ((bits & expMask) == expMask) && !(bits & fracMask);
return result;
}
else if constexpr(std::is_same_v<T_Arg, double>)
{
constexpr uint64_t expMask = 0x7FF0'0000'0000'0000ULL;
constexpr uint64_t fracMask = 0x000F'FFFF'FFFF'FFFFULL;
uint64_t bits = std::bit_cast<uint64_t>(arg);
bool result = ((bits & expMask) == expMask) && !(bits & fracMask);
return result;
}
else
{
static_assert(!sizeof(T_Arg), "Unsupported floating-point type");
ALPAKA_UNREACHABLE(T_Arg{});
}
}
};
    //! Custom IEEE 754 bitwise implementation of isfinite.
    //! The std counterpart does not work correctly with `-ffast-math` on CPU.
template<std::floating_point T_Arg>
struct Isfinite::Op<StlMath, T_Arg>
{
constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
{
if constexpr(std::is_same_v<T_Arg, float>)
{
constexpr uint32_t expMask = 0x7F80'0000;
uint32_t bits = std::bit_cast<uint32_t>(arg);
bool result = (bits & expMask) != expMask;
return result;
}
else if constexpr(std::is_same_v<T_Arg, double>)
{
constexpr uint64_t expMask = 0x7FF0'0000'0000'0000ULL;
uint64_t bits = std::bit_cast<uint64_t>(arg);
bool result = (bits & expMask) != expMask;
return result;
}
else
{
static_assert(!sizeof(T_Arg), "Unsupported floating-point type");
ALPAKA_UNREACHABLE(T_Arg{});
}
}
};
} // namespace alpaka::math::internal
// ==
// == ./include/alpaka/math/internal/stlMathImpl.hpp ==
// ============================================================================
// ==
// == ./include/alpaka/math/internal/math.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/tag.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/core/Tag.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
#include <type_traits>
namespace alpaka
{
template<typename T_Id = decltype([]() -> void {})>
struct Tag
{
};
#define ALPAKA_TAG(name) \
constexpr Tag<std::integral_constant<size_t, __COUNTER__>> name \
{ \
}
namespace trait
{
template<typename T_Object, typename T_Sfinae = void>
struct IsTag : std::false_type
{
};
template<typename T_Id>
struct IsTag<Tag<T_Id>> : std::true_type
{
};
template<typename T_Id>
constexpr bool isTag_v = IsTag<T_Id>::value;
} // namespace trait
} // namespace alpaka
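// Illustrative sketch (not part of the original header): every ALPAKA_TAG invocation yields a constexpr
// object of a unique Tag<> type, so tags can be distinguished purely at the type level.
//
//   ALPAKA_TAG(alpha);
//   ALPAKA_TAG(beta);
//   static_assert(alpaka::trait::isTag_v<ALPAKA_TYPEOF(alpha)>);
//   static_assert(!std::is_same_v<ALPAKA_TYPEOF(alpha), ALPAKA_TYPEOF(beta)>);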
// ==
// == ./include/alpaka/core/Tag.hpp ==
// ============================================================================
// #include "alpaka/core/util.hpp" // amalgamate: file already inlined
#include <cassert>
// #include <string> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
namespace alpaka
{
namespace object
{
struct Api
{
};
constexpr Api api;
struct DeviceKind
{
};
constexpr DeviceKind deviceKind;
ALPAKA_TAG(exec);
ALPAKA_TAG(deviceSpec);
ALPAKA_TAG(dynSharedMemBytes);
} // namespace object
namespace queueKind
{
namespace detail
{
struct QueueKindBase
{
};
} // namespace detail
namespace trait
{
template<typename T_QueueKind>
struct IsQueueKind : std::is_base_of<detail::QueueKindBase, T_QueueKind>
{
};
} // namespace trait
template<typename T_QueueKind>
constexpr bool isQueueKind_v = trait::IsQueueKind<T_QueueKind>::value;
} // namespace queueKind
namespace concepts
{
/** Concept to check if a type is a queue kind
*
* @details
* Example queue kinds are alpaka::queueKind::Blocking or alpaka::queueKind::NonBlocking.
*/
template<typename T_QueueKind>
concept QueueKind = queueKind::isQueueKind_v<T_QueueKind>;
} // namespace concepts
namespace queueKind
{
constexpr bool operator==(alpaka::concepts::QueueKind auto lhs, alpaka::concepts::QueueKind auto rhs)
{
return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
}
constexpr bool operator!=(alpaka::concepts::QueueKind auto lhs, alpaka::concepts::QueueKind auto rhs)
{
return !(lhs == rhs);
}
/** Queue should block during the task execution
*/
struct Blocking : detail::QueueKindBase
{
static std::string getName()
{
return "Blocking";
}
};
constexpr auto blocking = Blocking{};
/** Queue should process task asynchronously
*/
struct NonBlocking : detail::QueueKindBase
{
static std::string getName()
{
return "NonBlocking";
}
};
constexpr auto nonBlocking = NonBlocking{};
} // namespace queueKind
namespace deviceKind
{
namespace detail
{
struct DeviceKindBase
{
};
} // namespace detail
namespace trait
{
template<typename T_DeviceKind>
struct IsDeviceKind : std::is_base_of<detail::DeviceKindBase, T_DeviceKind>
{
};
} // namespace trait
template<typename T_DeviceKind>
constexpr bool isDeviceKind_v = trait::IsDeviceKind<T_DeviceKind>::value;
} // namespace deviceKind
namespace concepts
{
/** @brief Concept to check if something is a device kind
*
* @details
* A device kind in alpaka is a type of acceleration device, such as a GPU vendor. Examples are
* alpaka::deviceKind::amdGpu or alpaka::deviceKind::cpu. Together with an alpaka::onHost::Api, it can make
* up an alpaka::onHost::Device.
*/
template<typename T_DeviceKind>
concept DeviceKind = deviceKind::isDeviceKind_v<T_DeviceKind>;
} // namespace concepts
namespace deviceKind
{
constexpr bool operator==(concepts::DeviceKind auto lhs, concepts::DeviceKind auto rhs)
{
return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
}
constexpr bool operator!=(concepts::DeviceKind auto lhs, concepts::DeviceKind auto rhs)
{
return !(lhs == rhs);
}
struct Cpu : detail::DeviceKindBase
{
static std::string getName()
{
return "Cpu";
}
};
constexpr auto cpu = Cpu{};
struct AmdGpu : detail::DeviceKindBase
{
static std::string getName()
{
return "AmdGpu";
}
};
constexpr auto amdGpu = AmdGpu{};
struct NvidiaGpu : detail::DeviceKindBase
{
static std::string getName()
{
return "NvidiaGpu";
}
};
constexpr auto nvidiaGpu = NvidiaGpu{};
struct IntelGpu : detail::DeviceKindBase
{
static std::string getName()
{
return "IntelGpu";
}
};
constexpr auto intelGpu = IntelGpu{};
constexpr auto allDevices = std::make_tuple(cpu, amdGpu, nvidiaGpu, intelGpu);
} // namespace deviceKind
namespace layer
{
namespace detail
{
struct LayerBase
{
};
} // namespace detail
namespace trait
{
template<typename T_Layer>
struct IsLayer : std::is_base_of<detail::LayerBase, T_Layer>
{
};
} // namespace trait
template<typename T_Layer>
constexpr bool isLayer_v = trait::IsLayer<T_Layer>::value;
} // namespace layer
namespace concepts
{
/** @brief Concept to check for a compute layer of an accelerator
*
* @details
* A layer is one specific part of the compute hierarchy of accelerators, for example alpaka::layer::Thread or
* alpaka::layer::Block.
*/
template<typename T_Layer>
concept Layer = layer::isLayer_v<T_Layer>;
} // namespace concepts
namespace layer
{
struct Thread : detail::LayerBase
{
};
constexpr auto thread = Thread{};
struct Block : detail::LayerBase
{
};
constexpr auto block = Block{};
ALPAKA_TAG(shared);
ALPAKA_TAG(dynShared);
} // namespace layer
namespace frame
{
ALPAKA_TAG(count);
ALPAKA_TAG(extent);
} // namespace frame
namespace action
{
ALPAKA_TAG(threadBlockSync);
} // namespace action
struct Empty
{
};
namespace exec
{
namespace trait
{
template<typename T_Executor>
struct IsSeqExecutor : std::false_type
{
};
} // namespace trait
template<typename T_Exec>
constexpr bool isSeqExecutor_v = trait::IsSeqExecutor<T_Exec>::value;
} // namespace exec
    /** check if an executor can only be used with a single thread per block
*
* @return true if a block can only have a single thread, else false
*/
template<typename T_Exec>
consteval bool isSeqExecutor(T_Exec exec)
{
return exec::isSeqExecutor_v<T_Exec>;
}
} // namespace alpaka
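// Illustrative sketch (not part of the original header): device and queue kinds compare by type, so the
// comparisons below are usable in constant expressions.
//
//   static_assert(alpaka::deviceKind::cpu == alpaka::deviceKind::cpu);
//   static_assert(alpaka::deviceKind::cpu != alpaka::deviceKind::amdGpu);
//   static_assert(alpaka::queueKind::blocking != alpaka::queueKind::nonBlocking);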
// ==
// == ./include/alpaka/tag.hpp ==
// ============================================================================
// #include <algorithm> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
namespace alpaka
{
namespace trait
{
/** Map's all API's by default to stl math functions. */
struct GetMathImpl
{
template<alpaka::concepts::Api T_Api>
struct Op
{
constexpr decltype(auto) operator()(T_Api const) const
{
return alpaka::math::internal::stlMath;
}
};
};
template<alpaka::concepts::Api T_Api>
constexpr decltype(auto) getMathImpl(T_Api const api)
{
return GetMathImpl::Op<T_Api>{}(api);
}
struct GetArchSimdWidth
{
template<typename T_Type, alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Op
{
consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
{
static_assert(sizeof(T_Api) && false, "Missing definition of GetArchSimdWidth for API.");
return 1u;
}
};
};
/** Number of commands a CPU can issue at the same time. */
struct GetNumPipelines
{
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Op
{
/** @return the return value must be >= 1 */
consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
{
static_assert(sizeof(T_Api) && false, "Missing definition of GetNumPipelines for API.");
return 1u;
}
};
};
struct GetCachelineSize
{
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Op
{
consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
{
static_assert(sizeof(T_Api) && false, "GetCachelineSize for the current used API is not defined.");
return 42u;
}
};
};
// true for alpaka MdSpan implementations
template<typename T>
struct IsExecutor : std::false_type
{
};
} // namespace trait
template<typename T>
constexpr bool isExecutor = trait::IsExecutor<T>::value;
namespace concepts
{
/** @brief Concept to check for an executor
*
* @details
* An executor in alpaka is a specific way of executing on an alpaka::onHost::Device. Examples of executors are
* alpaka::exec::GpuCuda or alpaka::onHost::cpu::OmpBlocks.
*/
template<typename T>
concept Executor = alpaka::isExecutor<T>;
} // namespace concepts
constexpr bool operator==(concepts::Executor auto lhs, concepts::Executor auto rhs)
{
return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
}
constexpr bool operator!=(concepts::Executor auto lhs, concepts::Executor auto rhs)
{
return !(lhs == rhs);
}
    /** Get the SIMD width, in elements of T_Type, for an API and device kind combination.
*
* @tparam T_Type data type
* @return number of elements that can be processed in parallel in a vector register
*/
template<typename T_Type>
consteval uint32_t getArchSimdWidth(
concepts::Api auto const api,
alpaka::concepts::DeviceKind auto const deviceType)
{
return trait::GetArchSimdWidth::Op<T_Type, ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
}
/** Get the number of instructions that can be issued in parallel
*/
consteval uint32_t getNumPipelines(
concepts::Api auto const api,
alpaka::concepts::DeviceKind auto const deviceType)
{
return trait::GetNumPipelines::Op<ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
}
/** Get the number of elements to compute per thread
*
* This function considers the SIMD width for the corresponding data type and the potential for instruction
* parallelism.
*
* @tparam T_Type The data type used to determine the SIMD width.
* @return The minimum number of elements a thread should compute to achieve optimal utilization.
*/
template<typename T_Type>
consteval uint32_t getNumElemPerThread(
concepts::Api auto const api,
alpaka::concepts::DeviceKind auto const deviceType)
{
return getArchSimdWidth<T_Type>(api, deviceType) * getNumPipelines(api, deviceType);
}
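    /* Worked example (hypothetical numbers, not part of the original header):
     * a CPU backend reporting an arch SIMD width of 8 floats per register and 2 independent pipelines
     * yields getNumElemPerThread<float>(api, deviceKind) == 8 * 2 == 16 elements per thread.
     */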
/** get the cacheline size in bytes
*
     * The cache line size is the distance between two memory addresses that is guaranteed to be free of false sharing.
*
* @return cacheline size in bytes
*/
consteval uint32_t getCachelineSize(
concepts::Api auto const api,
alpaka::concepts::DeviceKind auto const deviceType)
{
return trait::GetCachelineSize::Op<ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
}
namespace onAcc::trait
{
        /** Defines the implementation used for atomic operations together with the used executor */
struct GetAtomicImpl
{
template<alpaka::concepts::Executor T_Executor>
struct Op
{
constexpr decltype(auto) operator()(T_Executor const) const
{
static_assert(
sizeof(T_Executor) && false,
"Atomic implementation for the current used executor is not defined.");
return 0;
}
};
};
template<alpaka::concepts::Executor T_Executor>
constexpr decltype(auto) getAtomicImpl(T_Executor const executor)
{
return GetAtomicImpl::Op<T_Executor>{}(executor);
}
} // namespace onAcc::trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/trait.hpp ==
// ============================================================================
#include <type_traits>
namespace alpaka::unifiedCudaHip::trait
{
template<alpaka::concepts::Executor T_Executor>
struct IsUnifiedExecutor : std::false_type
{
};
template<alpaka::concepts::Api T_Api>
struct IsUnifiedApi : std::false_type
{
};
} // namespace alpaka::unifiedCudaHip::trait
// ==
// == ./include/alpaka/api/unifiedCudaHip/trait.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/concepts.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/concepts/api.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts/hasName.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/concepts/ExpectedValueType.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
namespace alpaka::concepts
{
    /** Check whether the specified data type matches the expected type. If the expected type is
     * `alpaka::NotRequired`, all data types pass the concept.
     **/
template<typename T_Data, typename T_Expected>
concept ExpectedValueType = std::same_as<T_Expected, T_Data> || std::same_as<T_Expected, alpaka::NotRequired>;
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/mem/concepts/ExpectedValueType.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/concepts/IBuffer.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/mem/concepts/ExpectedValueType.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/concepts/IView.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/concepts/api.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/ExpectedValueType.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/concepts/IMdSpan.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/ExpectedValueType.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
namespace alpaka::concepts
{
namespace impl
{
/** @brief Interface concept for objects describing multidimensional memory access.
*
* @details
*
* An object of type `alpaka::mdspan` does not store any information about the storage location, e.g., whether
* the memory is located on a CPU or a GPU. The interface corresponds to that of a standard library container
         * with contiguous memory, but has some differences to support multidimensional memory. For example, instead of
         * the member function `size()`, which returns the 1D size, `alpaka::mdspan`-like objects provide the function
*`getExtents()`, which returns the size of each dimension.
*
* @param t Object of type `alpaka::mdspan`. May or may not have a const modifier.
* @param mut_t Mutable object of type `alpaka::mdspan`. Does not have a const modifier.
* @param const_t Constant object of type `alpaka::mdspan`. Does have a const modifier.
* @param vec Vector with the same number of elements as the dimension of the `alpaka::mdspan` like object.
* Used to call the access operator.
*
* @section components Components
*
* An `alpaka::mdspan` like object contains 4 components:
* - A pointer to the actual memory.
* - An extents object that describes the number of dimensions and their respective sizes.
* - A pitch object that specifies how many bytes are required to jump to the next element in each dimension.
* - An alignment object that describes how the elements are aligned in memory, see:
* alpaka::concepts::Alignment
*
* @section membertypes Member types
* - <b>T::value_type</b>: The element type. May or may not be const.
* - <b>T::reference</b>: The element reference type is either const or non-const, depending on
*`T::value_type`.
* - <b>T::const_reference</b>: The constant reference type for an element. Always const.
* - <b>T::pointer</b>: The element pointer type is either const or non-const, depending on
*`T::value_type`.
* - <b>T::const_pointer</b>: The constant pointer type for an element. Always pointer-to-const.
* - <b>T::index_type</b>: The index type of the pitch.
*
* @note The access operator [] with an integral as an argument is only available if the dimension is one.
**/
template<typename T, typename T_Mut, typename T_Const>
concept IMdSpan
= requires(T t, T_Mut mut_t, T_Const const_t, alpaka::Vec<typename T::index_type, T::dim()> vec) {
typename T::value_type;
typename T::reference;
typename T::const_reference;
typename T::pointer;
typename T::const_pointer;
typename T::index_type;
requires std::movable<T_Mut>;
/// The bool operator returns true if access to the memory is valid. For example, memory access may
/// be invalid after moving the DataSource.
static_cast<bool>(t);
{ T::dim() } -> std::same_as<uint32_t>;
{ *mut_t } -> std::same_as<typename T::reference>;
{ *const_t } -> std::same_as<typename T::const_reference>;
{ mut_t.data() } -> std::same_as<typename T::pointer>;
{ const_t.data() } -> std::same_as<typename T::const_pointer>;
/// @todo check for a MDIterator concept
t.begin();
t.end();
t.cbegin();
t.cend();
{ mut_t[vec] } -> std::same_as<typename T::reference>;
{ const_t[vec] } -> std::same_as<typename T::const_reference>;
// only if MdSpan like object is 1D, the access operator with an integral is available
requires(T::dim() > 1) || requires {
{ mut_t[typename T::index_type{0}] } -> std::same_as<typename T::reference>;
};
requires(T::dim() > 1) || requires {
{ const_t[typename T::index_type{0}] } -> std::same_as<typename T::const_reference>;
};
/// @todo add getSlice, getConstSlice and getView, getConstView functions
{ t.getAlignment() } -> alpaka::concepts::Alignment;
/// @todo implement concept alpaka::concepts::Extents and use it as return value
t.getExtents();
/// @todo implement concept alpaka::concepts::Pitches and use it as return value
t.getPitches();
};
} // namespace impl
/** @brief Interface concept for objects describing multidimensional memory access.
*
* @details
* An object of type `alpaka::mdspan` does not store any information about the storage location, e.g., whether
* the memory is located on a CPU or a GPU.
*
* @attention Use `alpaka::IMdSpan` to restrict types in your code. The actual interface is described in
* alpaka::concepts::impl::IMdSpan.
**/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept IMdSpan = requires {
requires impl::IMdSpan<
std::remove_reference_t<T>,
std::remove_const_t<std::remove_reference_t<T>>,
std::add_const_t<std::remove_reference_t<T>>>;
requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
};
} // namespace alpaka::concepts
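// Illustrative sketch (not part of the original header): constraining a free function to md-span-like
// arguments that hold floats; begin()/end() are part of the required interface.
//
//   void fill(alpaka::concepts::IMdSpan<float> auto& data, float value)
//   {
//       for(auto& elem : data)
//           elem = value;
//   }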
// ==
// == ./include/alpaka/mem/concepts/IMdSpan.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka::concepts
{
namespace impl
{
/** @brief Interface concept for objects describing api-related multidimensional memory access.
*
* @details
* An `alpaka::view`-like object contains information about the device(s) to which it is connected. The
* `alpaka::view`-like object has no memory ownership, and therefore, it does not manage the memory lifetime.
* The represented memory can have any dimensionality.
*
* Any object fitting the `IView` concept is also an `IMdSpan`.
**/
template<typename T, typename T_Mut, typename T_Const>
concept IView = requires(T t) {
requires IMdSpan<T, T_Mut, T_Const>;
{ t.getApi() } -> alpaka::concepts::Api;
};
} // namespace impl
/** @brief Interface concept for objects describing api-related multidimensional memory access.
*
* @details
* An `alpaka::view`-like object contains information about the device(s) to which it is connected. The
* `alpaka::view`-like object has no memory ownership, and, therefore, it does not manage the memory lifetime.
* The represented memory can have any dimensionality.
*
* @attention Use `alpaka::IView` to restrict types in your code. The actual interface is described in
* alpaka::concepts::impl::IView.
**/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept IView = requires(T t) {
requires impl::IView<
std::remove_reference_t<T>,
std::remove_const_t<std::remove_reference_t<T>>,
std::add_const_t<std::remove_reference_t<T>>>;
requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
};
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/mem/concepts/IView.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka::concepts
{
/** Dummy function for concepts.
*
     * Represents a callable that takes no arguments and returns void. Required because nvcc cannot handle empty
     * lambdas in concepts.
*/
inline void empty_callable()
{
}
namespace impl
{
/** @brief Interface concept for objects describing multidimensional owned memory.
*
* @details
* An `alpaka::buffer`-like object contains information about the device(s) to which it is connected. The
* `alpaka::buffer`-like object has memory ownership and therefore manages memory lifetime according to the
* RAII principle. The represented memory can have any dimensionality.
*
* Any object that fulfills the `IBuffer` concept is also an `IView` and `IMdSpan`.
*
* @section memberfunction member functions
*
* - <b>t.addDestructorAction</b>: Adds a destructor action to the shared buffer.
* @code{.unparsed}
* The action will be executed when the buffer is destroyed.
* This can be used to add additional cleanup actions e.g. waiting on a specific queue.
* Actions are executed in FIFO order.
* @endcode
* - <b>t.destructorWaitFor</b>: Add an action to be executed when the shared_ptr is destroyed.
**/
template<typename T, typename T_Mut, typename T_Const>
concept IBuffer = requires(T t) {
requires IView<T, T_Mut, T_Const>;
t.addDestructorAction(alpaka::concepts::empty_callable);
t.destructorWaitFor(alpaka::concepts::empty_callable);
};
} // namespace impl
/** @brief Interface concept for objects describing multidimensional owned memory.
*
* @details
* An `alpaka::buffer`-like object contains information about the device(s) to which it is connected. The
* `alpaka::buffer`-like object has memory ownership and therefore manages memory lifetime according to the RAII
* principle.
*
* @attention Use `alpaka::IBuffer` to restrict types in your code. The actual interface is described in
* alpaka::concepts::impl::IBuffer.
**/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept IBuffer = requires(T t) {
requires impl::IBuffer<
std::remove_reference_t<T>,
std::remove_const_t<std::remove_reference_t<T>>,
std::add_const_t<std::remove_reference_t<T>>>;
requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
};
} // namespace alpaka::concepts
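// Illustrative sketch (not part of the original header): `buf` is a hypothetical object satisfying IBuffer.
// Both addDestructorAction() and destructorWaitFor() take callables, executed in FIFO order on destruction.
//
//   buf.addDestructorAction([] { /* extra cleanup, e.g. logging or synchronisation */ });
//   buf.destructorWaitFor([] { /* action executed when the underlying shared state is destroyed */ });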
// ==
// == ./include/alpaka/mem/concepts/IBuffer.hpp ==
// ============================================================================
// #include "alpaka/mem/concepts/IMdSpan.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/IView.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
namespace alpaka
{
namespace concepts
{
/** @brief Concept to check if the given type has a `get()` function.
*/
template<typename T>
concept HasGet = requires(T t) { t.get(); };
/** @brief Concept to check if the given type has a static `dim()` function
*/
template<typename T>
concept HasStaticDim = requires(T t) { T::dim(); };
/** @brief Concept to check if the given type is of the given dimensionality
*
* @details
* The checked type must also fulfill HasStaticDim.
*
* @tparam T The type to check
* @tparam T_dim The dimension the checked type should have
*/
template<typename T, unsigned int T_dim>
        concept Dim = HasStaticDim<T> && (T::dim() == T_dim);
/** @brief Concept to check if the given type is a GPU DeviceKind
*/
template<typename T>
concept GpuType
= alpaka::concepts::DeviceKind<T>
&& (T{} == deviceKind::nvidiaGpu || T{} == deviceKind::amdGpu || T{} == deviceKind::intelGpu);
/** @brief Concept to check if the given type is a pointer, using std::is_pointer
*/
template<typename T>
concept Pointer = std::is_pointer_v<T>;
/** @todo Replace usage with alpaka::concepts::IView
*/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept View = MdSpan<T, T_ValueType> && requires(T t) {
{ getApi(t) } -> alpaka::concepts::Api;
};
} // namespace concepts
namespace internal
{
template<alpaka::concepts::Api T_Api>
struct GetApi::Op<T_Api>
{
inline constexpr auto operator()(auto&& api) const
{
return api;
}
};
} // namespace internal
} // namespace alpaka
// ==
// == ./include/alpaka/concepts.hpp ==
// ============================================================================
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/trait.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/KernelBundle.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/executor.hpp ==
// ==
/* Copyright 2024 René Widera, Mehmet Yusufoglu
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/cuda/executor.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/unifiedCudaHip/tag.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
namespace alpaka
{
namespace onAcc::internal
{
struct CudaHipAtomic
{
};
constexpr auto cudaHipAtomic = CudaHipAtomic{};
} // namespace onAcc::internal
namespace math::internal
{
struct CudaHipMath
{
};
constexpr auto cudaHipMath = CudaHipMath{};
} // namespace math::internal
} // namespace alpaka
// ==
// == ./include/alpaka/api/unifiedCudaHip/tag.hpp ==
// ============================================================================
// #include "alpaka/api/unifiedCudaHip/trait.hpp" // amalgamate: file already inlined
// #include <string> // amalgamate: file already included
namespace alpaka
{
namespace exec
{
struct GpuCuda
{
static std::string getName()
{
return "GpuCuda";
}
};
constexpr GpuCuda gpuCuda;
} // namespace exec
namespace trait
{
template<>
struct IsExecutor<exec::GpuCuda> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
namespace alpaka::onAcc::trait
{
template<>
struct GetAtomicImpl::Op<alpaka::exec::GpuCuda>
{
constexpr decltype(auto) operator()(alpaka::exec::GpuCuda const) const
{
return internal::cudaHipAtomic;
}
};
} // namespace alpaka::onAcc::trait
namespace alpaka::unifiedCudaHip::trait
{
template<>
struct IsUnifiedExecutor<alpaka::exec::GpuCuda> : std::true_type
{
};
} // namespace alpaka::unifiedCudaHip::trait
// ==
// == ./include/alpaka/api/cuda/executor.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/hip/executor.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/api/unifiedCudaHip/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/api/unifiedCudaHip/trait.hpp" // amalgamate: file already inlined
// #include <string> // amalgamate: file already included
namespace alpaka
{
namespace exec
{
struct GpuHip
{
static std::string getName()
{
return "GpuHip";
}
};
constexpr GpuHip gpuHip;
} // namespace exec
namespace trait
{
template<>
struct IsExecutor<exec::GpuHip> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
namespace alpaka::onAcc::trait
{
template<>
struct GetAtomicImpl::Op<alpaka::exec::GpuHip>
{
constexpr decltype(auto) operator()(alpaka::exec::GpuHip const) const
{
return internal::cudaHipAtomic;
}
};
} // namespace alpaka::onAcc::trait
namespace alpaka::unifiedCudaHip::trait
{
template<>
struct IsUnifiedExecutor<alpaka::exec::GpuHip> : std::true_type
{
};
} // namespace alpaka::unifiedCudaHip::trait
// ==
// == ./include/alpaka/api/hip/executor.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/executor.hpp ==
// ==
/* Copyright 2024 René Widera, Mehmet Yusufoglu
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/host/tag.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
namespace alpaka::onAcc
{
namespace internal
{
struct StlAtomic
{
};
constexpr auto stlAtomic = StlAtomic{};
} // namespace internal
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/api/host/tag.hpp ==
// ============================================================================
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <string> // amalgamate: file already included
namespace alpaka
{
namespace exec
{
struct CpuSerial
{
static std::string getName()
{
return "CpuSerial";
}
};
constexpr CpuSerial cpuSerial;
struct CpuOmpBlocks
{
static std::string getName()
{
return "CpuOmpBlocks";
}
};
constexpr CpuOmpBlocks cpuOmpBlocks;
struct CpuTbbBlocks
{
static std::string getName()
{
return "CpuTbbBlocks";
}
};
constexpr CpuTbbBlocks cpuTbbBlocks;
namespace trait
{
template<>
struct IsSeqExecutor<CpuSerial> : std::true_type
{
};
template<>
struct IsSeqExecutor<CpuOmpBlocks> : std::true_type
{
};
template<>
struct IsSeqExecutor<CpuTbbBlocks> : std::true_type
{
};
} // namespace trait
} // namespace exec
namespace trait
{
template<>
struct IsExecutor<exec::CpuSerial> : std::true_type
{
};
template<>
struct IsExecutor<exec::CpuOmpBlocks> : std::true_type
{
};
template<>
struct IsExecutor<exec::CpuTbbBlocks> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
namespace alpaka::onAcc::trait
{
template<>
struct GetAtomicImpl::Op<alpaka::exec::CpuSerial>
{
constexpr decltype(auto) operator()(alpaka::exec::CpuSerial const) const
{
return alpaka::onAcc::internal::stlAtomic;
}
};
template<>
struct GetAtomicImpl::Op<alpaka::exec::CpuOmpBlocks>
{
constexpr decltype(auto) operator()(alpaka::exec::CpuOmpBlocks const) const
{
return alpaka::onAcc::internal::stlAtomic;
}
};
template<>
struct GetAtomicImpl::Op<alpaka::exec::CpuTbbBlocks>
{
constexpr decltype(auto) operator()(alpaka::exec::CpuTbbBlocks const) const
{
return alpaka::onAcc::internal::stlAtomic;
}
};
} // namespace alpaka::onAcc::trait
// ==
// == ./include/alpaka/api/host/executor.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/oneApi/executor.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/syclGeneric/tag.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
namespace alpaka
{
namespace onAcc::internal
{
struct SyclAtomic
{
};
constexpr auto syclAtomic = SyclAtomic{};
} // namespace onAcc::internal
namespace math::internal
{
struct SyclMath
{
};
constexpr auto syclMath = SyclMath{};
} // namespace math::internal
} // namespace alpaka
// ==
// == ./include/alpaka/api/syclGeneric/tag.hpp ==
// ============================================================================
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
namespace alpaka
{
namespace exec
{
struct OneApi
{
static std::string getName()
{
return "OneApi";
}
};
constexpr OneApi oneApi{};
} // namespace exec
namespace trait
{
template<>
struct IsExecutor<exec::OneApi> : std::true_type
{
};
} // namespace trait
namespace onAcc::trait
{
template<>
struct GetAtomicImpl::Op<alpaka::exec::OneApi>
{
constexpr decltype(auto) operator()(alpaka::exec::OneApi const) const
{
return alpaka::onAcc::internal::syclAtomic;
}
};
} // namespace onAcc::trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/oneApi/executor.hpp ==
// ============================================================================
namespace alpaka::exec
{
/** list of all executors supported by alpaka
*
     * The order goes from high to low parallelism for executors that fall into the same category.
     * This list is used in places where a function can be called without an explicit executor; in that case the
     * first available executor is used.
*/
constexpr auto allExecutors = std::make_tuple(gpuCuda, gpuHip, oneApi, cpuOmpBlocks, cpuTbbBlocks, cpuSerial);
} // namespace alpaka::exec
// ==
// == ./include/alpaka/executor.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/meta/filter.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #include <functional> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
// #include <utility> // amalgamate: file already included
// #pragma once
namespace alpaka::meta
{
constexpr auto filter(auto const unaryConditionFn, auto const list)
{
return std::apply(
[=](auto... ts) constexpr
{
return std::tuple_cat(
std::conditional_t<unaryConditionFn(ts), std::tuple<decltype(ts)>, std::tuple<>>{}...);
},
list);
}
} // namespace alpaka::meta
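/* Usage sketch for alpaka::meta::filter: the predicate is evaluated at compile time for every tuple element
 * (only the element's type may be inspected, as in the executor filtering done by onHost::supportedExecutors),
 * and elements for which it returns false are dropped. A self-contained example with standard types:
 *
 *   constexpr auto evens = alpaka::meta::filter(
 *       [](auto ic) constexpr { return decltype(ic)::value % 2 == 0; },
 *       std::make_tuple(std::integral_constant<int, 1>{}, std::integral_constant<int, 2>{}));
 *   // evens holds a single element: std::integral_constant<int, 2>
 */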
// ==
// == ./include/alpaka/meta/filter.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onHost/concepts.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/internal/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/KernelBundle.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/DeviceProperties.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <cstdint> // amalgamate: file already included
#include <ostream>
// #include <string> // amalgamate: file already included
namespace alpaka::onHost
{
struct DeviceProperties
{
auto getName() const
{
return m_name;
}
std::string m_name;
uint32_t m_multiProcessorCount;
uint32_t m_warpSize;
uint32_t m_maxThreadsPerBlock;
};
inline std::ostream& operator<<(std::ostream& s, DeviceProperties const& p)
{
s << "name: " << p.m_name << "\n";
s << "multiProcessorCount: " << p.m_multiProcessorCount << "\n";
s << "warpSize: " << p.m_warpSize << "\n";
s << "maxThreadsPerBlock: " << p.m_maxThreadsPerBlock << "\n";
return s;
};
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/DeviceProperties.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onHost/FrameSpec.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/ThreadSpec.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <ostream> // amalgamate: file already included
namespace alpaka::onHost
{
template<
alpaka::concepts::Vector T_NumBlocks,
alpaka::concepts::Vector<typename T_NumBlocks::type, T_NumBlocks::dim()> T_NumThreads>
struct ThreadSpec
{
using type = typename T_NumBlocks::type;
using NumBlocksVecType = typename T_NumBlocks::UniVec;
using NumThreadsVecType = T_NumThreads;
static consteval uint32_t dim()
{
return T_NumThreads::dim();
}
NumBlocksVecType m_numBlocks;
NumThreadsVecType m_numThreads;
constexpr ThreadSpec(T_NumBlocks const& numBlocks, T_NumThreads const& numThreadsPerBlock)
: m_numBlocks(numBlocks)
, m_numThreads(numThreadsPerBlock)
{
}
};
template<alpaka::concepts::VectorOrScalar T_NumBlocks, alpaka::concepts::VectorOrScalar T_NumThreads>
ThreadSpec(T_NumBlocks const&, T_NumThreads const&)
-> ThreadSpec<alpaka::trait::getVec_t<T_NumBlocks>, alpaka::trait::getVec_t<T_NumThreads>>;
namespace trait
{
template<typename T>
struct IsThreadSpec : std::false_type
{
};
template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
struct IsThreadSpec<onHost::ThreadSpec<T_NumBlocks, T_NumThreads>> : std::true_type
{
};
} // namespace trait
template<typename T>
constexpr bool isThreadSpec_v = trait::IsThreadSpec<T>::value;
namespace concepts
{
/** Concept to check if a type is a ThreadSpec
*
* @tparam T Type to check
         * @tparam T_IndexType enforces an index type for the thread specification; if not provided, the type is
         * not checked
         * @tparam T_dim enforces a dimensionality for the thread specification; if not provided, the value is
         * not checked
*/
template<typename T, typename T_IndexType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
concept ThreadSpec
= isThreadSpec_v<T>
&& (std::same_as<T_IndexType, alpaka::NotRequired> || std::same_as<typename T::type, T_IndexType>)
&& ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));
} // namespace concepts
std::ostream& operator<<(std::ostream& s, concepts::ThreadSpec auto const& t)
{
return s << "ThreadSpec{ blocks=" << t.m_numBlocks << ", threads=" << t.m_numThreads << " }";
}
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/ThreadSpec.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
// #include <ostream> // amalgamate: file already included
namespace alpaka::onHost
{
/** @brief Device/Api-agnostic description of an execution pattern for a kernel.
*
* @details
* A frame specification describes how a multidimensional index range [0; K) is divided into fixed-size chunks,
* called frames (NF), each with a frame extent (FE), where `K = NF * FE`.
* K does not need to match the problem size (P), e.g., the number of elements in a buffer you want to process in a
* kernel. How NF and FE are mapped to physical worker threads and thread blocks within the kernel depends entirely
* on the kernel implementation. Often, the best performance of a kernel can be achieved if `K <= P`, and if the
* kernel uses SIMD operations, `K <= P/(SIMD width)`. A kernel enqueued with a frame specification should always
* be written to be executable with any `FrameSpec` and should not depend on hard-coded thread numbers, to ensure
* portability between devices.
*
* The specification contains three parameters:
* - `numFrames`: The n-dimensional number of frames.
* - `frameExtents`: The n-dimensional size of one execution unit.
* - `threadSpec` (optional): Backend-specific specification of the actual execution resources,
* consisting of the number of blocks and threads. By default, this is automatically chosen
* by alpaka when starting a kernel to fit the `alpaka::onHost::Device` and `alpaka::exec`, ensuring
* compatibility. User-provided specifications might reduce the (performance-)portability.
*/
template<
alpaka::concepts::Vector T_NumFrames,
alpaka::concepts::Vector<typename T_NumFrames::type, T_NumFrames::dim()> T_FrameExtents,
alpaka::concepts::Vector<typename T_NumFrames::type, T_NumFrames::dim()> T_ThreadExtents>
struct FrameSpec
{
using type = typename T_NumFrames::type;
using NumFramesVecType = T_NumFrames;
using FrameExtentsVecType = T_FrameExtents;
using ThreadExtentsVecType = T_ThreadExtents;
using ThreadSpecType = ThreadSpec<T_NumFrames, T_ThreadExtents>;
static consteval uint32_t dim()
{
return T_FrameExtents::dim();
}
T_NumFrames m_numFrames;
T_FrameExtents m_frameExtent;
ThreadSpecType m_threadSpec;
FrameSpec(T_NumFrames const& numFrames, T_FrameExtents const& frameExtent)
: m_numFrames(numFrames)
, m_frameExtent(frameExtent)
, m_threadSpec(numFrames, frameExtent)
{
}
FrameSpec(T_NumFrames const& numFrames, T_FrameExtents const& frameExtent, T_ThreadExtents const& numThreads)
: m_numFrames(numFrames)
, m_frameExtent(frameExtent)
, m_threadSpec(numFrames, numThreads)
{
}
FrameSpec(
T_NumFrames const& numFrames,
T_FrameExtents const& frameExtent,
T_NumFrames numBlocks,
T_FrameExtents const& numThreads)
: m_numFrames(numFrames)
, m_frameExtent(frameExtent)
, m_threadSpec(numBlocks, numThreads)
{
}
auto getThreadSpec() const
{
return m_threadSpec;
}
};
template<alpaka::concepts::VectorOrScalar T_NumFrames, alpaka::concepts::VectorOrScalar T_FrameExtents>
FrameSpec(T_NumFrames const&, T_FrameExtents const&) -> FrameSpec<
alpaka::trait::getVec_t<T_NumFrames>,
alpaka::trait::getVec_t<T_FrameExtents>,
alpaka::trait::getVec_t<T_FrameExtents>>;
template<
alpaka::concepts::VectorOrScalar T_NumFrames,
alpaka::concepts::VectorOrScalar T_FrameExtents,
alpaka::concepts::VectorOrScalar T_ThreadExtents>
FrameSpec(T_NumFrames const&, T_FrameExtents const&, T_ThreadExtents const&) -> FrameSpec<
alpaka::trait::getVec_t<T_NumFrames>,
alpaka::trait::getVec_t<T_FrameExtents>,
alpaka::trait::getVec_t<T_ThreadExtents>>;
template<alpaka::concepts::VectorOrScalar T_NumFrames, alpaka::concepts::VectorOrScalar T_FrameExtents>
FrameSpec(T_NumFrames const&, T_FrameExtents const&, T_NumFrames const&, T_FrameExtents const&) -> FrameSpec<
alpaka::trait::getVec_t<T_NumFrames>,
alpaka::trait::getVec_t<T_FrameExtents>,
alpaka::trait::getVec_t<T_FrameExtents>>;
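    /* Construction sketch: a 1-dimensional frame specification covering at least `problemSize` elements. The
     * values are placeholders; scalars are accepted thanks to the VectorOrScalar deduction guides above. The
     * thread extents are omitted, so alpaka derives a matching thread specification when the kernel is enqueued,
     * which keeps the kernel portable across devices and executors.
     *
     *   uint32_t const problemSize = 1024u * 1024u;
     *   uint32_t const frameExtent = 256u;
     *   uint32_t const numFrames = (problemSize + frameExtent - 1u) / frameExtent;
     *   auto spec = alpaka::onHost::FrameSpec{numFrames, frameExtent};
     */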
namespace trait
{
template<typename T>
struct IsFrameSpec : std::false_type
{
};
template<
alpaka::concepts::Vector T_NumFrames,
alpaka::concepts::Vector T_FrameExtents,
alpaka::concepts::Vector T_ThreadExtents>
struct IsFrameSpec<onHost::FrameSpec<T_NumFrames, T_FrameExtents, T_ThreadExtents>> : std::true_type
{
};
} // namespace trait
template<typename T>
constexpr bool isFrameSpec_v = trait::IsFrameSpec<T>::value;
namespace concepts
{
/** Concept to check if a type is a FrameSpec
*
* @tparam T Type to check
         * @tparam T_IndexType enforces an index type for the frame specification; if not provided, the type is not
         * checked
         * @tparam T_dim enforces a dimensionality for the frame specification; if not provided, the value is not
         * checked
*/
template<typename T, typename T_IndexType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
concept FrameSpec
= isFrameSpec_v<T>
&& (std::same_as<T_IndexType, alpaka::NotRequired> || std::same_as<typename T::type, T_IndexType>)
&& ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));
/** Concept to check if a type is a ThreadSpec or a FrameSpec
*
* @tparam T Type to check
*/
template<typename T>
concept ThreadOrFrameSpec = isFrameSpec_v<T> || isThreadSpec_v<T>;
} // namespace concepts
std::ostream& operator<<(std::ostream& s, concepts::FrameSpec auto const& d)
{
return s << "FrameSpec{ frames=" << d.m_numFrames << ", frameExtent=" << d.m_frameExtent << ", "
<< d.getThreadSpec() << " }";
}
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/FrameSpec.hpp ==
// ============================================================================
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/ThreadSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
namespace alpaka::onHost
{
namespace internal
{
struct MakePlatform
{
template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Op
{
auto operator()(T_Api api, T_DeviceKind deviceType) const;
};
};
static auto makePlatform(auto api, alpaka::concepts::DeviceKind auto deviceType)
{
return MakePlatform::Op<ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
}
struct GetDeviceCount
{
template<typename T_Platform>
struct Op
{
uint32_t operator()(T_Platform& platform) const
{
return platform.getDeviceCount();
}
};
};
struct MakeDevice
{
template<typename T_Platform>
struct Op
{
auto operator()(auto& platform, uint32_t idx) const
{
return platform.makeDevice(idx);
}
};
};
struct GetDevice
{
template<typename T_Any>
struct Op
{
auto operator()(T_Any const& any) const
{
return any.getDevice();
}
};
};
inline constexpr auto getDevice(auto&& any)
{
return GetDevice::Op<std::decay_t<decltype(any)>>{}(any);
}
struct GetNativeHandle
{
template<typename T_Any>
struct Op
{
auto operator()(T_Any const& any) const
{
return any.getNativeHandle();
}
};
};
inline auto getNativeHandle(auto&& any)
{
return GetNativeHandle::Op<std::decay_t<decltype(any)>>{}(any);
}
struct MakeQueue
{
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
struct Op
{
auto operator()(T_Device& device, T_QueueKind) const
{
return device.makeQueue(T_QueueKind{});
}
};
};
struct MakeEvent
{
template<typename T_Device>
struct Op
{
auto operator()(T_Device& device) const
{
return device.makeEvent();
}
};
};
struct Wait
{
template<typename T_Any>
struct Op
{
void operator()(T_Any& any)
{
any.wait();
}
};
};
inline void wait(auto&& any)
{
Wait::Op<std::decay_t<decltype(any)>>{}(any);
}
struct WaitFor
{
template<typename T_Queue, typename T_Event>
struct Op
{
void operator()(T_Queue& queue, T_Event& event)
{
queue.waitFor(event);
}
};
};
inline void waitFor(auto& queue, auto& event)
{
WaitFor::Op<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(event)>{}(queue, event);
}
struct IsEventComplete
{
template<typename T_Any>
struct Op
{
bool operator()(T_Any& any)
{
return any.isEventComplete();
}
};
};
inline bool isEventComplete(auto&& any)
{
return IsEventComplete::Op<ALPAKA_TYPEOF(any)>{}(any);
}
struct Enqueue
{
template<
typename T_Queue,
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadOrFrameSpec T_BlockCfg,
alpaka::concepts::KernelBundle T_KernelBundle>
struct Kernel
{
void operator()(
T_Queue& queue,
T_Executor const executor,
T_BlockCfg const& blockCfg,
T_KernelBundle const& kernelBundle) const
{
queue.enqueue(executor, blockCfg, kernelBundle);
}
};
template<typename T_Queue, typename T_Task>
struct Task
{
void operator()(T_Queue& queue, T_Task const& task) const
{
queue.enqueue(task);
}
};
template<typename T_Queue, typename T_Event>
struct Event
{
void operator()(T_Queue& queue, T_Event& event) const
{
queue.enqueue(event);
}
};
};
inline void enqueue(auto& queue, auto const& task)
{
Enqueue::Task<std::decay_t<decltype(queue)>, std::decay_t<decltype(task)>>{}(queue, task);
}
template<typename TKernelFn, typename... TArgs>
inline void enqueue(
auto& queue,
auto const executor,
onHost::concepts::ThreadOrFrameSpec auto const& blockCfg,
KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
{
Enqueue::Kernel<
std::decay_t<decltype(queue)>,
std::decay_t<decltype(executor)>,
std::decay_t<decltype(blockCfg)>,
KernelBundle<TKernelFn, TArgs...>>{}(queue, executor, blockCfg, kernelBundle);
}
struct AdjustThreadSpec
{
template<
typename T_Device,
alpaka::concepts::Executor T_Executor,
onHost::concepts::FrameSpec T_FrameSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct Op
{
auto operator()(
T_Device const&,
T_Executor const& executor,
T_FrameSpec const& frameSpec,
T_KernelBundle const& kernelBundle) const
{
return frameSpec.getThreadSpec();
}
};
};
template<typename TKernelFn, typename... TArgs>
static auto adjustThreadSpec(
auto const& device,
auto const& executor,
onHost::concepts::FrameSpec auto const& dataBlocking,
KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
{
return AdjustThreadSpec::Op<
ALPAKA_TYPEOF(device),
ALPAKA_TYPEOF(executor),
ALPAKA_TYPEOF(dataBlocking),
KernelBundle<TKernelFn, TArgs...>>{}(device, executor, dataBlocking, kernelBundle);
}
struct Data
{
template<typename T_Any>
struct Op
{
decltype(auto) operator()(auto&& any) const
{
return std::data(any);
}
};
static decltype(auto) data(auto&& any)
{
return Op<std::decay_t<decltype(any)>>{}(any);
}
template<typename T_Any>
static decltype(auto) data(Handle<T_Any>&& anyHandle)
{
return Op<std::decay_t<decltype(*anyHandle.get())>>{}(*anyHandle.get());
}
};
struct Alloc
{
template<typename T_Type, typename T_Any, typename T_Extents>
struct Op
{
void operator()(T_Any& any, T_Extents const&) const;
};
};
struct AllocDeferred
{
template<typename T_Type, typename T_Any, typename T_Extents>
struct Op
{
void operator()(T_Any& any, T_Extents const&) const;
};
};
struct AllocUnified
{
template<typename T_Type, typename T_Any, typename T_Extents>
struct Op
{
void operator()(T_Any& any, T_Extents const&) const;
};
};
struct AllocMapped
{
template<typename T_Type, typename T_Any, typename T_Extents>
struct Op
{
void operator()(T_Any& any, T_Extents const&) const;
};
};
/** checks if a view can be accessed from the given device
*
* There are two paths to check if a view is accessible:
* - first: Try to validate the view in the scope of the device.
* - second: Try to validate based on soft criteria in the scope of the view's API.
* This path is required because the host API does not know about view data locations.
         * The second path is optional and will always return false if it is not specialized.
*/
struct IsDataAccessible
{
template<typename T_Device, typename T_Any>
struct FirstPath
{
bool operator()(T_Device& device, T_Any const& any) const;
};
template<typename T_DataApi, alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
struct SecondPath
{
bool operator()(T_DataApi, T_DeviceKind, T_Any const& any) const
{
return false;
}
};
};
struct Memcpy
{
template<typename T_Queue, typename T_Dest, typename T_Source, typename T_Extents>
struct Op
{
void operator()(T_Queue& queue, auto&&, T_Source const&, T_Extents const&) const;
};
};
struct Memset
{
template<typename T_Queue, typename T_Dest, typename T_Extents>
struct Op
{
void operator()(T_Queue& queue, auto&&, uint8_t, T_Extents const&) const;
};
};
struct Fill
{
template<typename T_Queue, typename T_Dest, typename T_Value, typename T_Extents>
struct Op
{
void operator()(T_Queue& queue, auto&&, T_Value, T_Extents const&) const;
};
};
struct GetDeviceProperties
{
template<typename T_Any>
struct Op
{
DeviceProperties operator()(auto const& platform, uint32_t idx) const;
DeviceProperties operator()(auto const& device) const;
};
};
inline DeviceProperties getDeviceProperties(auto const& platform, uint32_t idx)
{
return GetDeviceProperties::Op<ALPAKA_TYPEOF(platform)>{}(platform, idx);
}
struct GetExtents
{
template<typename T_Any>
struct Op
{
decltype(auto) operator()(auto&& any) const
{
return any.getExtents();
}
};
};
inline auto getExtents(auto&& any)
{
return GetExtents::Op<std::decay_t<decltype(any)>>{}(any);
}
template<typename T_Any>
inline auto getExtents(Handle<T_Any>&& any)
{
return GetExtents::Op<ALPAKA_TYPEOF(*any.get())>{}(*any.get());
}
struct GetPitches
{
template<typename T_Any>
struct Op
{
decltype(auto) operator()(auto&& any) const
{
return any.getPitches();
}
};
};
inline auto getPitches(auto&& any)
{
return GetPitches::Op<std::decay_t<decltype(any)>>{}(any);
}
template<typename T_Any>
inline auto getPitches(Handle<T_Any>&& any)
{
return GetPitches::Op<ALPAKA_TYPEOF(*any.get())>{}(*any.get());
}
} // namespace internal
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/internal/interface.hpp ==
// ============================================================================
// #include <concepts> // amalgamate: file already included
// #include <string> // amalgamate: file already included
namespace alpaka::onHost
{
namespace internal::concepts
{
template<typename T>
concept Device = requires(T device) {
{ alpaka::internal::GetName::Op<T>{}(device) } -> std::convertible_to<std::string>;
{ internal::MakeEvent::Op<T>{}(device) };
{ internal::GetNativeHandle::Op<T>{}(device) };
{ internal::GetDeviceProperties::Op<T>{}(device) };
};
template<typename T>
concept Platform = requires(T platform) {
{ alpaka::internal::GetName::Op<T>{}(platform) };
};
template<typename T>
concept Queue = requires(T device) {
{ alpaka::internal::GetName::Op<T>{}(device) } -> std::convertible_to<std::string>;
{ internal::GetNativeHandle::Op<T>{}(device) };
};
template<typename T>
concept QueueHandle = requires(T t) {
typename T::element_type;
requires Queue<typename T::element_type>;
};
template<typename T>
concept PlatformHandle = requires(T t) {
typename T::element_type;
requires Platform<typename T::element_type>;
};
template<typename T>
concept DeviceHandle = requires(T t) {
typename T::element_type;
requires Device<typename T::element_type>;
};
} // namespace internal::concepts
namespace concepts
{
template<typename T>
concept NameHandle = requires(T t) {
typename T::element_type;
requires alpaka::concepts::HasName<typename T::element_type>;
};
template<typename T>
concept StaticNameHandle = requires(T t) {
typename T::element_type;
requires alpaka::concepts::HasStaticName<typename T::element_type>;
};
} // namespace concepts
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/concepts.hpp ==
// ============================================================================
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka::onHost
{
namespace trait
{
struct IsPlatformAvailable
{
template<alpaka::concepts::Api T_Api>
struct Op : std::false_type
{
};
};
struct IsExecutorSupportedBy
{
template<alpaka::concepts::Executor T_Executor, typename T_Device>
struct Op : std::false_type
{
};
};
template<alpaka::concepts::Executor T_Executor, internal::concepts::DeviceHandle T_DeviceHandle>
struct IsExecutorSupportedBy::Op<T_Executor, T_DeviceHandle>
: IsExecutorSupportedBy::Op<T_Executor, typename T_DeviceHandle::element_type>
{
};
struct IsDeviceSupportedBy
{
template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Api>
struct Op : std::false_type
{
};
};
template<typename T_Kernel, typename T_Spec>
struct BlockDynSharedMemBytes
{
BlockDynSharedMemBytes(T_Kernel kernel, T_Spec spec)
{
}
            // requires(false) disables this function; if you specialize this trait, remove the requires statement.
            // Disabling is needed so that the trait is only evaluated when the user actually defines it.
uint32_t operator()(auto const executor, auto const&... args) const requires(false)
{
return 0;
}
};
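        /* Specialization sketch for a hypothetical user kernel `MyKernel` that needs a fixed amount of dynamic
         * shared memory per block. The constructor receives the kernel instance and the thread specification;
         * operator() additionally receives the executor and the kernel arguments and returns the number of bytes.
         * Unlike the primary template above, a real specialization must not carry the `requires(false)` clause.
         *
         *   template<typename T_Spec>
         *   struct alpaka::onHost::trait::BlockDynSharedMemBytes<MyKernel, T_Spec>
         *   {
         *       BlockDynSharedMemBytes(MyKernel, T_Spec)
         *       {
         *       }
         *
         *       uint32_t operator()(auto const executor, auto const&... kernelArgs) const
         *       {
         *           return 256u * sizeof(float); // e.g. 256 floats of shared storage per block
         *       }
         *   };
         */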
template<
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct GetDynSharedMemBytes
{
static constexpr bool zeroSharedMemory = true;
uint32_t operator()(
T_Executor const executor,
T_ThreadSpec const spec,
[[maybe_unused]] T_KernelBundle const& kernelBundle) const
{
return 0u;
}
};
template<alpaka::concepts::Executor T_Executor, typename T_Spec, typename T_KernelFn, typename... T_Args>
requires requires() { std::declval<T_KernelFn>().dynSharedMemBytes; } || requires() {
BlockDynSharedMemBytes<T_KernelFn, T_Spec>{std::declval<T_KernelFn>(), std::declval<T_Spec>()}(
std::declval<T_Executor>(),
std::declval<remove_restrict_t<std::decay_t<T_Args>>>()...);
}
struct GetDynSharedMemBytes<T_Executor, T_Spec, KernelBundle<T_KernelFn, T_Args...>>
{
uint32_t operator()(
T_Executor const executor,
T_Spec const spec,
[[maybe_unused]] KernelBundle<T_KernelFn, T_Args...> const& kernelBundle) const
{
if constexpr(requires {
BlockDynSharedMemBytes<T_KernelFn, T_Spec>{kernelBundle.m_kernelFn, spec}(
executor,
std::declval<remove_restrict_t<std::decay_t<T_Args>>>()...);
})
{
return alpaka::apply(
[&](auto const&... args)
{
return BlockDynSharedMemBytes<T_KernelFn, T_Spec>{kernelBundle.m_kernelFn, spec}(
executor,
args...);
},
kernelBundle.m_args);
}
else
{
return kernelBundle.m_kernelFn.dynSharedMemBytes;
}
}
};
template<
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct HasUserDefinedDynSharedMemBytes : std::true_type
{
};
template<
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
requires(trait::GetDynSharedMemBytes<T_Executor, T_ThreadSpec, T_KernelBundle>::zeroSharedMemory == true)
struct HasUserDefinedDynSharedMemBytes<T_Executor, T_ThreadSpec, T_KernelBundle> : std::false_type
{
};
} // namespace trait
consteval bool isPlatformAvaiable(alpaka::concepts::Api auto api)
{
return trait::IsPlatformAvailable::Op<std::decay_t<decltype(api)>>::value;
}
consteval bool isExecutorSupportedBy(auto executor, internal::concepts::DeviceHandle auto const& deviceHandle)
{
return trait::IsExecutorSupportedBy::Op<ALPAKA_TYPEOF(executor), ALPAKA_TYPEOF(deviceHandle)>::value;
}
constexpr auto supportedExecutors(internal::concepts::DeviceHandle auto deviceHandle, auto const listOfExecutors)
{
return meta::filter(
            // we cannot use isExecutorSupportedBy() because gcc14 is stricter in detecting which functions can
            // be evaluated at compile time
[&](auto executor) constexpr
{ return trait::IsExecutorSupportedBy::Op<ALPAKA_TYPEOF(executor), ALPAKA_TYPEOF(deviceHandle)>::value; },
listOfExecutors);
}
constexpr auto supportedDevices(auto const api)
{
return meta::filter(
            // we cannot use isExecutorSupportedBy() because gcc14 is stricter in detecting which functions can
            // be evaluated at compile time
[&](auto devTag) constexpr
{ return trait::IsDeviceSupportedBy::Op<ALPAKA_TYPEOF(devTag), ALPAKA_TYPEOF(api)>::value; },
deviceKind::allDevices);
}
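    /* Usage sketch (assuming `deviceHandle` is a device handle obtained from a platform elsewhere): reduce the
     * global executor list to the executors supported by this device and pick the first, i.e. most parallel, one.
     *
     *   auto execs = alpaka::onHost::supportedExecutors(deviceHandle, alpaka::exec::allExecutors);
     *   auto executor = std::get<0>(execs); // preferred executor for this device
     */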
template<
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
constexpr uint32_t getDynSharedMemBytes(
T_Executor const executor,
T_ThreadSpec spec,
T_KernelBundle const& kernelBundle)
{
return trait::GetDynSharedMemBytes<T_Executor, T_ThreadSpec, T_KernelBundle>{}(executor, spec, kernelBundle);
}
template<
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
consteval bool hasUserDefinedDynSharedMemBytes(
T_Executor const executor,
T_ThreadSpec spec,
T_KernelBundle const& kernelBundle)
{
return trait::HasUserDefinedDynSharedMemBytes<T_Executor, T_ThreadSpec, T_KernelBundle>::value;
}
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/trait.hpp ==
// ============================================================================
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka
{
namespace api
{
struct Cuda : detail::ApiBase
{
using element_type = Cuda;
auto get() const
{
return this;
}
void _()
{
static_assert(concepts::Api<Cuda>);
}
static std::string getName()
{
return "Cuda";
}
};
constexpr auto cuda = Cuda{};
} // namespace api
namespace onHost::trait
{
#if ALPAKA_LANG_CUDA
template<>
struct IsPlatformAvailable::Op<api::Cuda> : std::true_type
{
};
template<>
struct IsDeviceSupportedBy::Op<deviceKind::NvidiaGpu, api::Cuda> : std::true_type
{
};
#endif
} // namespace onHost::trait
namespace unifiedCudaHip::trait
{
template<>
struct IsUnifiedApi<api::Cuda> : std::true_type
{
};
} // namespace unifiedCudaHip::trait
namespace trait
{
template<typename T_Type>
struct GetArchSimdWidth::Op<T_Type, api::Cuda, deviceKind::NvidiaGpu>
{
constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
{
/** vector load and store width in bytes */
constexpr size_t simdWidthInByte = 16u;
return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
}
};
template<>
struct GetNumPipelines::Op<api::Cuda, deviceKind::NvidiaGpu>
{
constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
{
                /* NVIDIA GPUs have two warp schedulers, which we interpret as pipelines. */
constexpr uint32_t numPipes = 2u;
return numPipes;
}
};
template<>
struct GetCachelineSize::Op<api::Cuda, deviceKind::NvidiaGpu>
{
constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
{
                // loading 16 bytes per thread results in optimal memory bandwidth
return 16u;
}
};
} // namespace trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/cuda/Api.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/hip/Api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/unifiedCudaHip/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka
{
namespace api
{
struct Hip : detail::ApiBase
{
using element_type = Hip;
auto get() const
{
return this;
}
void _()
{
static_assert(concepts::Api<Hip>);
}
static std::string getName()
{
return "Hip";
}
};
constexpr auto hip = Hip{};
} // namespace api
namespace onHost::trait
{
#if ALPAKA_LANG_HIP
template<>
struct IsPlatformAvailable::Op<api::Hip> : std::true_type
{
};
template<>
struct IsDeviceSupportedBy::Op<deviceKind::AmdGpu, api::Hip> : std::true_type
{
};
#endif
} // namespace onHost::trait
namespace unifiedCudaHip::trait
{
template<>
struct IsUnifiedApi<api::Hip> : std::true_type
{
};
} // namespace unifiedCudaHip::trait
namespace trait
{
template<typename T_Type>
struct GetArchSimdWidth::Op<T_Type, api::Hip, deviceKind::AmdGpu>
{
constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
{
/** vector load/store width in bytes */
constexpr size_t simdWidthInByte = 16u;
return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
}
};
template<>
struct GetNumPipelines::Op<api::Hip, deviceKind::AmdGpu>
{
constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
{
                /* The SIMD units of AMD GPUs are interpreted as pipelines. */
constexpr uint32_t numPipes = 4u;
return numPipes;
}
};
template<>
struct GetCachelineSize::Op<api::Hip, deviceKind::AmdGpu>
{
constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
{
                // loading 16 bytes per thread results in optimal memory bandwidth
return 16u;
}
};
} // namespace trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/hip/Api.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/Api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/host/cpuArchSize.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka::onHost::internal
{
template<typename T_Type>
constexpr uint32_t getCPUSimdWidth()
{
constexpr size_t simdWidthInByte =
#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX512DQ__) || defined(__AVX512VL__)
64u;
#elif defined(__riscv_vector)
64u;
#elif defined(__riscv)
// do not use vectors if the vector extension is not set
sizeof(T_Type);
// ARM e.g. nvidia grace hopper
#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SVE2_AES) || defined(__ARM_FEATURE_DOTPROD)
64u;
#elif defined(__AVX2__)
32u;
#elif defined(__SSE__) || defined(__SSE2__) || defined(__SSE4_1__) || defined(__SSE4_2__)
16u;
#elif defined(__ARM_NEON__)
16u;
#elif defined(__ALTIVEC__)
16u;
#else
sizeof(T_Type);
#endif
return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
}
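    /* Worked example: on an AVX2 host (__AVX2__ defined) simdWidthInByte is 32, so getCPUSimdWidth<float>()
     * returns 32 / 4 = 8 and getCPUSimdWidth<double>() returns 32 / 8 = 4, i.e. the number of elements that fit
     * into one vector register.
     */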
constexpr uint32_t getCPUNumPipelines()
{
        /* Intel CPUs can issue 4 instructions per cycle and AMD typically 2; since we cannot distinguish between
         * them, we use the higher value.
         * ARM SVE can typically issue 4 instructions as well.
         *
         * Therefore we currently use 4 as the default.
*/
constexpr uint32_t numPipes = 4u;
return numPipes;
}
constexpr uint32_t getCPUCachelineSize()
{
        constexpr uint32_t cachelineBytes =
#ifdef __cpp_lib_hardware_interference_size
            std::hardware_constructive_interference_size;
#else
            // fallback value, typically 64 bytes
            64;
#endif
        return cachelineBytes;
}
} // namespace alpaka::onHost::internal
// ==
// == ./include/alpaka/api/host/cpuArchSize.hpp ==
// ============================================================================
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/trait.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/layout.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
namespace alpaka::onAcc
{
namespace layout
{
/** Generates indices scattered based on the number of worker threads for each dimension.*/
struct Strided
{
};
constexpr auto strided = Strided{};
/** Indices will be contiguous within each dimension for each worker thread. */
struct Contiguous
{
};
constexpr auto contiguous = Contiguous{};
/** The index layout will automatically selected based on the executor. */
struct Optimized
{
};
constexpr auto optimized = Optimized{};
} // namespace layout
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/onAcc/layout.hpp ==
// ============================================================================
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka
{
namespace onAcc::internal
{
namespace trait
{
struct AutoIndexMapping
{
template<typename T_Acc, typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Op
{
constexpr auto operator()(T_Acc const&, T_Api, T_DeviceKind) const
{
return layout::Strided{};
}
};
};
} // namespace trait
constexpr auto adjustMapping(auto const& acc)
{
return trait::AutoIndexMapping::
Op<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(acc.getApi()), ALPAKA_TYPEOF(acc.getDeviceKind())>{}(
acc,
acc.getApi(),
acc.getDeviceKind());
}
} // namespace onAcc::internal
namespace internal
{
/** Specialize the trait for DataSource class if the object is copyable.
*
* @tparam TDataSource The DataSource class.
*
* @details
*
* The trait is used in the alpaka::internal::concepts::CopyConstructableDataSource concept to check whether
* the copy constructor respects the const correctness of the data type.
*
* Example specialization:
*
* @code
* template<typename T_Type>
         *  struct CopyConstructableDataSource<Storage<T_Type>> : std::true_type {
* using InnerMutable = Storage<std::remove_const_t<T_Type>>;
* using InnerConst = Storage<std::add_const_t<T_Type>>;
* };
* @endcode
*/
template<typename TDataSource>
struct CopyConstructableDataSource : std::false_type
{
};
}; // namespace internal
} // namespace alpaka
// ==
// == ./include/alpaka/mem/trait.hpp ==
// ============================================================================
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include <sstream> // amalgamate: file already included
namespace alpaka
{
namespace api
{
struct Host : detail::ApiBase
{
using element_type = Host;
auto get() const
{
return this;
}
void _()
{
static_assert(concepts::Api<Host>);
}
static std::string getName()
{
return "Host";
}
};
constexpr auto host = Host{};
} // namespace api
namespace onHost::trait
{
template<>
struct IsPlatformAvailable::Op<api::Host> : std::true_type
{
};
template<>
struct IsDeviceSupportedBy::Op<deviceKind::Cpu, api::Host> : std::true_type
{
};
} // namespace onHost::trait
namespace trait
{
template<typename T_Type>
struct GetArchSimdWidth::Op<T_Type, api::Host, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
{
return alpaka::onHost::internal::getCPUSimdWidth<T_Type>();
}
};
template<>
struct GetNumPipelines::Op<api::Host, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
{
return alpaka::onHost::internal::getCPUNumPipelines();
}
};
template<>
struct GetCachelineSize::Op<api::Host, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
{
return alpaka::onHost::internal::getCPUCachelineSize();
}
};
} // namespace trait
namespace onAcc::internal::trait
{
template<typename T_Acc>
struct AutoIndexMapping::Op<T_Acc, api::Host, deviceKind::Cpu>
{
constexpr auto operator()(T_Acc const&, api::Host, deviceKind::Cpu) const
{
return layout::Contiguous{};
}
};
} // namespace onAcc::internal::trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/host/Api.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/oneApi/Api.hpp ==
// ==
/* Copyright 2024 René Widera, Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/cpuArchSize.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/syclGeneric/Api.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
// #include <string> // amalgamate: file already included
namespace alpaka
{
namespace api
{
template<typename TApiInterface>
struct GenericSycl : detail::ApiBase
{
using element_type = TApiInterface;
auto get() const
{
return static_cast<TApiInterface const*>(this);
}
void _()
{
static_assert(concepts::Api<GenericSycl<TApiInterface>>);
}
static std::string getName()
{
return "GenericSycl";
}
};
} // namespace api
} // namespace alpaka
// ==
// == ./include/alpaka/api/syclGeneric/Api.hpp ==
// ============================================================================
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <string> // amalgamate: file already included
namespace alpaka
{
namespace api
{
struct OneApi : public GenericSycl<OneApi>
{
static std::string getName()
{
return "OneApi";
}
};
constexpr auto oneApi = OneApi{};
} // namespace api
#if ALPAKA_LANG_ONEAPI
namespace onHost::trait
{
template<>
struct IsPlatformAvailable::Op<api::OneApi> : std::true_type
{
};
# ifndef ALPAKA_DISABLE_OneApi_IntelGpu
template<>
struct IsDeviceSupportedBy::Op<deviceKind::IntelGpu, api::OneApi> : std::true_type
{
};
# endif
# ifndef ALPAKA_DISABLE_OneApi_NvidiaGpu
template<>
struct IsDeviceSupportedBy::Op<deviceKind::NvidiaGpu, api::OneApi> : std::true_type
{
};
# endif
# ifndef ALPAKA_DISABLE_OneApi_AmdGpu
template<>
struct IsDeviceSupportedBy::Op<deviceKind::AmdGpu, api::OneApi> : std::true_type
{
};
# endif
# ifndef ALPAKA_DISABLE_OneApi_Cpu
template<>
struct IsDeviceSupportedBy::Op<deviceKind::Cpu, api::OneApi> : std::true_type
{
};
# endif
} // namespace onHost::trait
#endif
namespace trait
{
template<typename T_Type>
struct GetArchSimdWidth::Op<T_Type, api::OneApi, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
{
return onHost::internal::getCPUSimdWidth<T_Type>();
}
};
template<>
struct GetNumPipelines::Op<api::OneApi, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
{
return onHost::internal::getCPUNumPipelines();
}
};
template<>
struct GetCachelineSize::Op<api::OneApi, deviceKind::Cpu>
{
constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
{
return onHost::internal::getCPUCachelineSize();
}
};
// for GPU
template<typename T_Type, concepts::GpuType T_DeviceKind>
struct GetArchSimdWidth::Op<T_Type, api::OneApi, T_DeviceKind>
{
constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
{
/** vector load and store width in bytes */
                // copied from CUDA/HIP -> not verified whether this is the optimal value
constexpr std::size_t simdWidthInByte = 16u;
return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
}
};
template<concepts::GpuType T_DeviceKind>
struct GetNumPipelines::Op<api::OneApi, T_DeviceKind>
{
constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
{
                /* The SIMD units of AMD GPUs are interpreted as pipelines; CUDA GPUs use 2 pipelines (2 warp
                 * schedulers).
                 * @TODO check Intel GPUs
*/
constexpr uint32_t numPipes = 4u;
return numPipes;
}
};
template<concepts::GpuType T_DeviceKind>
struct GetCachelineSize::Op<api::OneApi, T_DeviceKind>
{
constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
{
                // loading 16 bytes per thread results in optimal memory bandwidth
                // copied from CUDA/HIP -> not verified whether this is the optimal value
return 16u;
}
};
} // namespace trait
namespace onAcc::internal::trait
{
template<typename T_Acc>
struct AutoIndexMapping::Op<T_Acc, api::OneApi, deviceKind::Cpu>
{
constexpr auto operator()(T_Acc const&, api::OneApi, deviceKind::Cpu) const
{
return layout::Contiguous{};
}
};
} // namespace onAcc::internal::trait
} // namespace alpaka
// ==
// == ./include/alpaka/api/oneApi/Api.hpp ==
// ============================================================================
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/meta/filter.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include <algorithm> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
/** provides the API used during the execution of the current code path
*
     * @attention if api::host is returned it can also mean that this method was called within the host-controlling
     * workflow and not within a kernel running on a CPU device.
*/
constexpr auto thisApi()
{
#if ALPAKA_LANG_SYCL && ALPAKA_LANG_ONEAPI && __SYCL_DEVICE_ONLY__
return api::oneApi;
#elif ALPAKA_LANG_CUDA && (ALPAKA_COMP_CLANG_CUDA || ALPAKA_COMP_NVCC) && __CUDA_ARCH__
return api::cuda;
#elif ALPAKA_LANG_HIP && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1
return api::hip;
#else
return api::host;
#endif
}
namespace onHost
{
constexpr auto apis = std::make_tuple(api::host, api::cuda, api::hip, api::oneApi);
constexpr auto enabledApis = meta::filter([](auto api) constexpr { return isPlatformAvaiable(api); }, apis);
} // namespace onHost
namespace api
{
constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
{
return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
}
constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
{
return !(lhs == rhs);
}
} // namespace api
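    /* Usage sketch: thisApi() is constexpr, so together with the equality operators above it can select code
     * paths at compile time, e.g. inside a kernel body:
     *
     *   if constexpr(alpaka::thisApi() == alpaka::api::host)
     *   {
     *       // host / CPU-device code path (see the @attention note on thisApi())
     *   }
     *   else
     *   {
     *       // device code path (CUDA, HIP or SYCL)
     *   }
     */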
} // namespace alpaka
// ==
// == ./include/alpaka/api/api.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/cpu.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/Device.hpp ==
// ==
/* Copyright 2024 René Widera, Mehmet Yusufoglu
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/Event.hpp ==
// ==
/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
namespace alpaka
{
/** Get the API an object depends on
*
* @param any can be a platform, device, queue, view
* @return API tag
*
* @{
*/
inline constexpr decltype(auto) getApi(auto&& any)
{
return alpaka::internal::getApi(ALPAKA_FORWARD(any));
}
inline constexpr decltype(auto) getApi(alpaka::concepts::HasGet auto&& any)
{
return alpaka::internal::getApi(*any.get());
}
namespace concepts
{
/** Concept to check if the given type implements the `getApi(T x)` function returning an alpaka::concepts::Api
*/
template<typename T_Any>
concept HasApi = requires(T_Any&& any) {
{ getApi(any) } -> alpaka::concepts::Api;
};
} // namespace concepts
/** @} */
/** Get the device type of an object
*
* @param any can be a platform, device, queue, view
* @return type from alpaka::deviceKind
*
* @{
*/
inline constexpr decltype(auto) getDeviceKind(auto&& any)
{
return alpaka::internal::getDeviceKind(ALPAKA_FORWARD(any));
}
inline constexpr decltype(auto) getDeviceKind(alpaka::concepts::HasGet auto&& any)
{
return alpaka::internal::getDeviceKind(*any.get());
}
/** @} */
/** Get the number of elements to compute per thread.
*
* This function considers the SIMD width for the corresponding data type and the potential for instruction
* parallelism.
*
* @tparam T_Type The data type used to determine the SIMD width.
* @return The minimum number of elements a thread should compute to achieve optimal utilization.
*/
template<typename T_Type>
constexpr uint32_t getNumElemPerThread(auto&& any)
{
return alpaka::getNumElemPerThread<T_Type>(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
}
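    /* Usage sketch (assuming `view` is a buffer or view of float elements created elsewhere): query how many
     * elements a single thread should process to make good use of the SIMD units and instruction pipelines of
     * the device the view belongs to.
     *
     *   uint32_t const elemPerThread = alpaka::getNumElemPerThread<float>(view);
     */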
    /** get the SIMD width for the given data type
*
* @tparam T_Type data type
* @return number of elements that can be processed in parallel in a vector register
*/
template<typename T_Type>
constexpr uint32_t getArchSimdWidth(auto&& any)
{
return alpaka::getArchSimdWidth<T_Type>(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
}
    /** get the number of instructions that can be issued in parallel */
constexpr uint32_t getNumPipelines(auto&& any)
{
return alpaka::getNumPipelines(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
}
/** Get the value type alignment of an object
*
* @param any type derive the alignment from
* @return alignment in bytes, if not defined the alignment of the value_type will be returned
*/
constexpr auto getAlignment(auto&& any)
{
return internal::getAlignment(ALPAKA_FORWARD(any));
}
/** Utility to mark variables as unused to avoid compiler warnings
*
     * Marking arguments with `[[maybe_unused]]` makes function interfaces long, and often only the argument's type
     * is used within the function rather than the instance itself.
     * This utility can be used instead to keep function interfaces clean and readable.
*/
inline constexpr void unused([[maybe_unused]] auto&&... values)
{
}
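    /* Usage sketch: silence an "unused variable" warning without decorating the function signature.
     *
     *   void compute(auto const& device, auto& data)
     *   {
     *       alpaka::unused(device); // only the type of `device` is needed in the body below
     *       // ...
     *   }
     */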
} // namespace alpaka
// ==
// == ./include/alpaka/interface.hpp ==
// ============================================================================
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/logger/logger.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/onHost/internal/logger.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/onHost/demangledName.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/logger/lvl.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <string> // amalgamate: file already included
namespace alpaka::onHost::logger
{
namespace detail
{
struct LogLvlBase
{
};
template<typename T_Logger0, typename T_Logger1>
struct AggregatedLogger : LogLvlBase
{
static std::string getName()
{
return T_Logger0::getName();
}
static constexpr size_t mask()
{
return T_Logger0::mask() + T_Logger1::mask();
}
};
} // namespace detail
namespace trait
{
template<typename T_DeviceKind>
struct IsLogLvl : std::is_base_of<detail::LogLvlBase, T_DeviceKind>
{
};
} // namespace trait
template<typename T_LogLvl>
constexpr bool isLogLvl_v = trait::IsLogLvl<T_LogLvl>::value;
namespace concepts
{
/** Concept for log level types
*/
template<typename T_DeviceKind>
concept Level = isLogLvl_v<T_DeviceKind>;
} // namespace concepts
constexpr bool operator==(concepts::Level auto lhs, concepts::Level auto rhs)
{
return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
}
constexpr bool operator!=(concepts::Level auto lhs, concepts::Level auto rhs)
{
return !(lhs == rhs);
}
constexpr auto operator+(concepts::Level auto lhs, concepts::Level auto rhs)
{
return detail::AggregatedLogger<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>{};
}
struct Device : detail::LogLvlBase
{
static std::string getName()
{
return "Device";
}
static constexpr size_t mask()
{
return 1;
}
};
constexpr auto device = Device{};
struct Event : detail::LogLvlBase
{
static std::string getName()
{
return "Event";
}
static constexpr size_t mask()
{
return 2;
}
};
constexpr auto event = Event{};
struct Memory : detail::LogLvlBase
{
static std::string getName()
{
return "Memory";
}
static constexpr size_t mask()
{
return 4;
}
};
constexpr auto memory = Memory{};
struct Queue : detail::LogLvlBase
{
static std::string getName()
{
return "Queue";
}
static constexpr size_t mask()
{
return 8;
}
};
constexpr auto queue = Queue{};
struct Kernel : detail::LogLvlBase
{
static std::string getName()
{
return "Kernel";
}
static constexpr size_t mask()
{
return 16;
}
};
constexpr auto kernel = Kernel{};
} // namespace alpaka::onHost::logger
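// Illustrative sketch (not compiled): log levels behave like bit masks and can be combined with
// operator+. The mask values are defined above (device=1, event=2, memory=4, queue=8, kernel=16).
//
//   using namespace alpaka::onHost;
//   constexpr auto lvl = logger::device + logger::memory; // aggregated level
//   static_assert(lvl.mask() == 5u);                       // 1 + 4
//   static_assert(lvl != logger::queue);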
// ==
// == ./include/alpaka/onHost/logger/lvl.hpp ==
// ============================================================================
#include <atomic>
#include <chrono>
// #include <functional> // amalgamate: file already included
// #include <iostream> // amalgamate: file already included
// #include <ostream> // amalgamate: file already included
#include <source_location>
// #include <string> // amalgamate: file already included
#include <string_view>
namespace alpaka::onHost::logger::internal
{
/** Write all output to std::cerr
*
     * The output is not buffered and is written immediately. It is **NOT** thread-safe.
     *
     * @todo separate the indentation level from the writer
     * @todo add additional loggers, e.g. std::cout and a thread-safe logger
*/
struct StdErr
{
static StdErr& get()
{
static StdErr inst = StdErr{};
return inst;
}
std::ostream& operator<<(auto const& input) const
{
return std::cerr << input;
}
        /** increase the indentation level
         *
         * @return the indentation level for the current message
*/
int enter()
{
return indentLvl++;
}
        /** decrease the indentation level
         *
         * @return the indentation level for the current message
*/
int leave()
{
return --indentLvl;
}
        /** current indentation level
         *
         * @return the indentation level for the current message
*/
int current()
{
return indentLvl.load();
}
private:
std::atomic<int> indentLvl = 1;
};
/** Indent the message if needed and forward it to the output writer
*
     * Whether the input is indented depends on the preprocessor define ALPAKA_LOG_INDENT
*/
inline void indent(auto& writer, [[maybe_unused]] int indentLvl)
{
#if defined(ALPAKA_LOG_INDENT)
for(int i = 0; i < indentLvl; ++i)
i == 0 ? (writer << "|-") : (writer << "--");
if(indentLvl)
#endif
writer << " ";
}
/** Adjust the length of a string to a minimum length
*
* @param str input string
     * @param n minimum number of characters; if the string is shorter than this, it is padded with the
     * padding character
* @return new string with a minimum number of characters
*/
inline std::string adjStringLength(std::string str, size_t n, char const paddingCharacter = ' ')
{
if(str.length() >= n)
{
return str;
}
str.resize(n, paddingCharacter);
return str;
}
    /** Shorten the function signature to make it human-readable
     *
     * Whether the name is simplified depends on the preprocessor define ALPAKA_LOG_DETAIL_SHORT
*/
inline std::string adjDetails(std::string const& str)
{
#if defined(ALPAKA_LOG_DETAIL_SHORT)
return onHost::simplifyFunctionSignature(str);
#else
return str;
#endif
}
/** Log the entry and exit of a scope */
template<logger::concepts::Level T_LogLvl, typename T_Writer = StdErr>
struct Scoped
{
public:
Scoped(T_LogLvl logLvl, std::source_location const& location)
: m_functionName{adjDetails(location.function_name())}
, m_prefix{std::string("[") + adjStringLength(logLvl.getName(), 6) + "]"}
, m_startTime{std::chrono::high_resolution_clock::now()}
, m_writer{T_Writer::get()}
{
m_writer << m_prefix << "[+]";
indent(m_writer, m_writer.enter());
m_writer << m_functionName << std::endl;
}
Scoped(T_LogLvl logLvl) : m_writer{T_Writer::get()}, m_enableOutput{false}
{
}
Scoped(Scoped const&) = delete;
Scoped(Scoped&&) = delete;
Scoped& operator=(Scoped const&) = delete;
Scoped& operator=(Scoped&&) = delete;
~Scoped()
{
if(m_enableOutput)
{
auto const endTime = std::chrono::high_resolution_clock::now();
double durationInSeconds = std::chrono::duration<double, std::milli>(endTime - m_startTime).count();
m_writer << m_prefix << "[-]";
indent(m_writer, m_writer.leave());
m_writer << m_functionName << " " << durationInSeconds << " ms" << std::endl;
}
}
private:
std::string m_functionName;
std::string m_prefix;
decltype(std::chrono::high_resolution_clock::now()) m_startTime;
T_Writer& m_writer;
bool m_enableOutput = true;
};
    /** Write a metadata message to the output
     *
     * @tparam T_Callable callable without arguments that provides the string to be written to the output
*/
template<logger::concepts::Level T_LogLvl, typename T_Callable, typename T_Writer = StdErr>
requires(std::is_invocable_r_v<std::string, T_Callable>)
struct Info
{
public:
Info(T_LogLvl logLvl, T_Callable const& callable, std::source_location const& location)
{
auto fullPrefix = std::string("[") + adjStringLength(logLvl.getName(), 6) + "]";
auto& writer = T_Writer::get();
std::stringstream ss;
ss << " ";
writer << fullPrefix << ss.str();
indent(writer, writer.current());
writer << callable() << " " << adjDetails(location.function_name()) << " " << location.file_name() << ":"
<< location.line() << std::endl;
}
Info(Info const&) = delete;
Info(Info&&) = delete;
Info& operator=(Info const&) = delete;
Info& operator=(Info&&) = delete;
~Info() = default;
};
} // namespace alpaka::onHost::logger::internal
// ==
// == ./include/alpaka/onHost/internal/logger.hpp ==
// ============================================================================
// #include "alpaka/onHost/logger/lvl.hpp" // amalgamate: file already inlined
#include <mutex>
#include <source_location>
namespace alpaka::onHost::logger
{
/** Log the entry and exit of a scope
*
     * @attention It is suggested to use the logger macro ALPAKA_LOG_FUNCTION to speed up compilation.
     * In case logging is disabled, the compiler then does not need to register the C++ function signature.
     *
     * The time spent within the scope is added to the output as additional information, in milliseconds.
*
* @param logLvl log level or a sum of log levels
*/
inline auto scope(
concepts::Level auto logLvl,
std::source_location const& location = std::source_location::current())
{
#if defined(ALPAKA_LOG_STATIC)
if constexpr(logLvl.mask() & ALPAKA_LOG_STATIC_LVL_MASK)
return internal::Scoped{logLvl, location};
else
return internal::Scoped{logLvl};
#elif defined(ALPAKA_LOG_DYNAMIC)
static std::once_flag flag;
static size_t envLogMask = 0;
std::call_once(
flag,
[]()
{
if(char const* envStr = std::getenv("ALPAKA_LOG_DYNAMIC_LVL"))
envLogMask = std::stoull(envStr);
});
if(logLvl.mask() & envLogMask)
return internal::Scoped{logLvl, location};
else
return internal::Scoped{logLvl};
#endif
}
    /** Write a metadata message to the output
     *
     * @attention It is suggested to use the logger macro ALPAKA_LOG_INFO to speed up compilation.
     * In case logging is disabled, the compiler then does not need to register the C++ function signature.
     *
     * @param logLvl log level or a sum of log levels
     * @param callable callable without arguments that provides the string to be written to the output
*/
inline void info(
concepts::Level auto logLvl,
auto const& callable,
std::source_location const& location = std::source_location::current())
{
#if defined(ALPAKA_LOG_STATIC)
if constexpr(logLvl.mask() & ALPAKA_LOG_STATIC_LVL_MASK)
internal::Info{logLvl, callable, location};
#elif defined(ALPAKA_LOG_DYNAMIC)
static std::once_flag flag;
static size_t envLogMask = 0;
std::call_once(
flag,
[]()
{
if(char const* envStr = std::getenv("ALPAKA_LOG_DYNAMIC_LVL"))
envLogMask = std::stoull(envStr);
});
if(logLvl.mask() & envLogMask)
internal::Info{logLvl, callable, location};
#endif
}
} // namespace alpaka::onHost::logger
/** Log the entry and exit of a scope
*
* @param logLvl log level or a sum of log levels
*/
#if defined(ALPAKA_ENABLE_LOG_FUNCTIONS)
# define ALPAKA_LOG_FUNCTION(logLvl) \
[[maybe_unused]] auto const __alpaka_log_scope = ::alpaka::onHost::logger::scope(logLvl)
#else
# define ALPAKA_LOG_FUNCTION(logLvl) void()
#endif
/** Write a metadata message to the output
 *
 * @param logLvl log level or a sum of log levels
 * @param callable callable without arguments that provides the string to be written to the output
*/
#if defined(ALPAKA_ENABLE_LOG_INFO)
# define ALPAKA_LOG_INFO(logLvl, callable) ::alpaka::onHost::logger::info(logLvl, callable)
#else
# define ALPAKA_LOG_INFO(logLvl, callable) void()
#endif
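// Illustrative sketch (not compiled): typical use of the macros above. Logging is compiled in with
// -DALPAKA_ENABLE_LOG_FUNCTIONS / -DALPAKA_ENABLE_LOG_INFO and filtered either at compile time
// (-DALPAKA_LOG_STATIC -DALPAKA_LOG_STATIC_LVL_MASK=<mask>) or at run time (-DALPAKA_LOG_DYNAMIC plus
// the environment variable ALPAKA_LOG_DYNAMIC_LVL=<mask>). A mask of 24 enables queue (8) and
// kernel (16) messages. `copyData` is a hypothetical function.
//
//   void copyData()
//   {
//       ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::queue);
//       ALPAKA_LOG_INFO(alpaka::onHost::logger::memory, [] { return std::string{"copy started"}; });
//       // ...
//   }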
// ==
// == ./include/alpaka/onHost/logger/logger.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
#include <cstring>
#include <future>
// #include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace cpu
{
template<typename T_Device>
struct Event : std::enable_shared_from_this<Event<T_Device>>
{
public:
Event(internal::concepts::DeviceHandle auto device, uint32_t const idx)
: m_device(std::move(device))
, m_idx(idx)
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
}
~Event()
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
internal::wait(*this);
}
Event(Event const&) = delete;
Event& operator=(Event const&) = delete;
Event(Event&&) = delete;
Event& operator=(Event&&) = delete;
bool operator==(Event const& other) const
{
return m_idx == other.m_idx && m_device == other.m_device;
}
bool operator!=(Event const& other) const
{
return !(*this == other);
}
private:
Handle<T_Device> m_device;
uint32_t m_idx = 0u;
//!< The mutex used to synchronize access to the event.
std::mutex mutable m_mutex;
//!< The future signaling the event completion.
std::shared_future<void> m_future;
//!< The number of times this event has been enqueued.
std::size_t m_enqueueCount = 0u;
            //!< The enqueue count at the last time this event was ready.
            //!< Ready means that the event is not waiting within a queue
            //!< (not enqueued or already completed). If m_enqueueCount ==
            //!< m_LastReadyEnqueueCount, the event is currently not enqueued.
std::size_t m_LastReadyEnqueueCount = 0u;
friend struct alpaka::internal::GetName;
std::string getName() const
{
return std::string("host::Event id=") + std::to_string(m_idx);
}
friend struct internal::GetNativeHandle;
friend struct internal::Enqueue;
friend struct alpaka::internal::GetDeviceType;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_device.get());
}
auto getDevice() const
{
return m_device;
}
std::shared_ptr<Event> getSharedPtr()
{
return this->shared_from_this();
}
friend struct onHost::internal::GetDevice;
friend struct internal::IsEventComplete;
/** Check if the event is ready.
*
* @attention Do not call this method without holding the event lock.
*
* @return true if the event is ready, false otherwise
*/
bool isReady() noexcept
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
return (m_LastReadyEnqueueCount == m_enqueueCount);
}
/** Check if the event is complete.
*
* @attention Should not be called if the event lock is acquired, because it could lead to a deadlock.
*
* @return true if the event is complete, false otherwise
*/
bool isEventComplete() noexcept
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
std::lock_guard<std::mutex> lk(m_mutex);
return isReady();
}
friend struct internal::WaitFor;
friend struct internal::Wait;
void wait()
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
std::unique_lock<std::mutex> lk(m_mutex);
size_t enqueueCount = m_enqueueCount;
while(enqueueCount > m_LastReadyEnqueueCount)
{
auto future = m_future;
lk.unlock();
future.get();
lk.lock();
}
}
friend struct alpaka::internal::GetApi;
};
} // namespace cpu
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Device>
struct GetApi::Op<onHost::cpu::Event<T_Device>>
{
inline constexpr auto operator()(auto&& event) const
{
return alpaka::getApi(event.m_device);
}
};
} // namespace alpaka::internal
// ==
// == ./include/alpaka/api/host/Event.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/Queue.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/generic.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/SimdAlgo.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/internal/SimdConcurrent.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Simd.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/SimdPtr.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Simd.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/IMdSpan.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <array> // amalgamate: file already included
// #include <concepts> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
namespace trait
{
template<typename T>
struct IsSimdPtr : std::false_type
{
};
} // namespace trait
template<typename T>
constexpr bool isSimdPtr_v = trait::IsSimdPtr<T>::value;
namespace concepts
{
/** Concept to check if a type is a SIMD pointer
*
* @tparam T Type to check
* @tparam T_ValueType enforce a value type of the SIMD pointer, if not provided the value type is not checked
* @tparam T_width enforce lane width of the SIMD pointer, if not provided the value is not checked
*/
template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_width = alpaka::notRequiredWidth>
concept SimdPtr = isSimdPtr_v<T>
&& (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
|| std::same_as<T_ValueType, alpaka::NotRequired>)
&& ((T_width == alpaka::notRequiredWidth) || (T::width() == T_width));
} // namespace concepts
/** pointer to a SIMD pack with the width T_SimdWidth
*
* The pointer is used to load/store data from/to memory
*
* @tparam T_MdSpan type of the memory the pointer is pointing to
* @tparam T_IdxType type of the index
* @tparam T_MemAlignment alignment of the memory the pointer is pointing to
* @tparam T_SimdWidth width of the SIMD pack
*/
template<
typename T_MdSpan,
alpaka::concepts::Vector T_IdxType,
alpaka::concepts::Alignment T_MemAlignment,
alpaka::concepts::CVector T_SimdWidth>
struct SimdPtr : private T_MdSpan
{
using value_type = typename T_MdSpan::value_type;
using IdxType = typename T_IdxType::UniVec;
static consteval uint32_t width()
{
return T_SimdWidth{}.back();
}
constexpr SimdPtr(T_MdSpan const& mdSpan, T_IdxType const& idx, T_MemAlignment, T_SimdWidth)
: T_MdSpan(mdSpan)
, m_idx(idx)
{
}
/** Shift the element the pointer is pointing to by idx
*
* @param idx number of elements to shift the pointer by
* @return a new simd pointer pointing to the shifted element
*
* @{
*/
constexpr auto operator[](alpaka::concepts::Vector auto const& idx) const
{
constexpr uint32_t valueAlignment = static_cast<uint32_t>(alignof(value_type));
constexpr auto align = Alignment<valueAlignment>{};
return SimdPtr<T_MdSpan, T_IdxType, ALPAKA_TYPEOF(align), T_SimdWidth>{
static_cast<T_MdSpan>(*this),
idx + m_idx,
align,
T_SimdWidth{}};
}
constexpr auto operator[](alpaka::concepts::Vector auto const& idx)
{
constexpr uint32_t valueAlignment = static_cast<uint32_t>(alignof(value_type));
constexpr auto align = Alignment<valueAlignment>{};
return SimdPtr<T_MdSpan, T_IdxType, ALPAKA_TYPEOF(align), T_SimdWidth>{
static_cast<T_MdSpan>(*this),
idx + m_idx,
align,
T_SimdWidth{}};
}
/** @} */
constexpr decltype(auto) load() const
{
return internal::loadAsSimd<width()>(static_cast<T_MdSpan const&>(*this), getAlignment(), m_idx);
}
constexpr decltype(auto) load()
{
return internal::loadAsSimd<width()>(static_cast<T_MdSpan&>(*this), getAlignment(), m_idx);
}
/** get the alignment of the memory the pointer is pointing to
*
         * @attention If the pointer is shifted by `operator[]` the alignment is equal to the data alignment of a
         * single element
         *
         * @return the alignment of the memory (in bytes) the pointer is pointing to
*/
static constexpr auto getAlignment()
{
using SpanElemType = typename T_MdSpan::value_type;
constexpr uint32_t spanAlignment = T_MdSpan::getAlignment().template get<SpanElemType>();
using MemoryAlignment = std::conditional_t<
std::is_same_v<AutoAligned, T_MemAlignment>,
Alignment<spanAlignment>,
Alignment<std::min(T_MemAlignment::template get<SpanElemType>(), spanAlignment)>>;
return MemoryAlignment{};
}
/** store the simd pack to the memory the pointer is pointing to
*
* @param rhs simd pack to store
*
* @{
*/
template<typename T_Storage>
constexpr void storeTo(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs) const
{
auto* ptr = &T_MdSpan::operator[](m_idx);
rhs.copyTo(ptr, getAlignment());
}
template<typename T_Storage>
constexpr void storeTo(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs)
{
auto* ptr = &T_MdSpan::operator[](m_idx);
rhs.copyTo(ptr, getAlignment());
}
template<typename T_Storage>
constexpr SimdPtr const& operator=(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs) const
{
storeTo(rhs);
return *this;
}
template<typename T_Storage>
constexpr SimdPtr& operator=(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs)
{
storeTo(rhs);
return *this;
}
/** @} */
/** offset in elements relative to the MdSpan given at construction
*
* The index points to the first element followed by T_SimdWidth elements.
*
* @return the index of the first element relative to the MdSpan given at construction
*/
constexpr IdxType getIdx() const
{
return m_idx;
}
private:
IdxType m_idx;
};
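    // Illustrative usage sketch (not compiled): `ptr` is assumed to be a SimdPtr handed out by a
    // SIMD-aware algorithm, `offset` a hypothetical index vector.
    //
    //   auto pack = ptr.load();   // Simd pack of SimdPtr::width() elements read from memory
    //   ptr.storeTo(pack);        // write the (possibly modified) pack back to the same location
    //   auto next = ptr[offset];  // shifted pointer; note that it falls back to single-element alignment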
namespace internal
{
template<
alpaka::concepts::IMdSpan T_MdSpan,
alpaka::concepts::Alignment T_MdSpanAlignment,
alpaka::concepts::Vector T_Idx>
struct LoadAsSimd::Op<T_MdSpan, T_MdSpanAlignment, T_Idx>
{
template<uint32_t T_simdWidth>
constexpr auto load(auto&& dataSource, T_MdSpanAlignment alignment, T_Idx const& idx) const
{
static_assert(
std::is_same_v<T_MdSpan, ALPAKA_TYPEOF(dataSource)>,
"Data source type must match the class template signature.");
auto&& d = dataSource[idx];
using DataTypeType = std::remove_reference_t<decltype(d)>;
using DstType = std::conditional_t<
std::is_const_v<DataTypeType>,
Simd<std::decay_t<DataTypeType>, T_simdWidth> const,
Simd<std::decay_t<DataTypeType>, T_simdWidth>>;
alpaka::concepts::Simd auto dest = DstType{};
dest.copyFrom(&d, alignment);
return dest;
}
};
} // namespace internal
namespace trait
{
template<typename T>
requires(isSpecializationOf_v<T, SimdPtr>)
struct IsSimdPtr<T> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
// ==
// == ./include/alpaka/SimdPtr.hpp ==
// ============================================================================
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/concepts/IGeneratorOrMdSpan.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/mem/concepts/IGenerator.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/ExpectedValueType.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
namespace alpaka::concepts
{
namespace impl
{
/** @brief Interface concept for objects describing multidimensional generator
*
* @details
*
* @param t may or may not have a const modifier.
* @param mut_t Mutable generator. Does not have a const modifier.
* @param const_t Constant generator. Does have a const modifier.
* @param vec Vector with the same number of components as the dimension of the generator like object.
* Used to call the access operator.
*
* A generator is always read only.
* It is allowed that a generator derives values from another data source.
* Even if a generator is not limited in the number of elements it can generate, it must provide the extents
         * via getExtents().
*
* @section membertypes Member types
* - <b>T::value_type</b>: The element type. May or may not be const.
* - <b>T::index_type</b>: The index type of the extents.
*
* @note The access operator [] with an integral as an argument is only available if the dimension is one.
**/
template<typename T, typename T_Mut, typename T_Const>
concept IGenerator
= requires(T t, T_Mut mut_t, T_Const const_t, alpaka::Vec<typename T::index_type, T::dim()> vec) {
typename T::value_type;
typename T::index_type;
requires std::movable<T_Mut>;
{ mut_t[vec] } -> std::same_as<typename T::value_type>;
{ const_t[vec] } -> std::same_as<typename T::value_type>;
// only for 1D, the access operator with an integral is available
requires(T::dim() > 1) || requires {
{ mut_t[0] } -> std::same_as<typename T::value_type>;
};
requires(T::dim() > 1) || requires {
{ const_t[0] } -> std::same_as<typename T::value_type>;
};
// typically the alignment of the value_type.
{ t.getAlignment() } -> alpaka::concepts::Alignment;
/** @todo implement concept alpaka::concepts::Extents and use it as return value
             * @todo in general a generator is not required to have extents, but our algorithms, e.g. onHost::reduce,
             * will not work without extents
**/
t.getExtents();
};
} // namespace impl
/** @brief Interface concept for objects describing multidimensional generator
*
* @attention Use `alpaka::IGenerator` to restrict types in your code. The actual interface is described in
* alpaka::concepts::impl::IGenerator.
**/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept IGenerator = requires {
requires impl::IGenerator<
std::remove_reference_t<T>,
std::remove_const_t<std::remove_reference_t<T>>,
std::add_const_t<std::remove_reference_t<T>>>;
requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
};
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/mem/concepts/IGenerator.hpp ==
// ============================================================================
// #include "alpaka/mem/concepts/IMdSpan.hpp" // amalgamate: file already inlined
namespace alpaka::concepts
{
/** @brief Interface concept for objects describing multidimensional memory access or a generator.
*/
template<typename T, typename T_ValueType = alpaka::NotRequired>
concept IGeneratorOrMdSpan = (IGenerator<T, T_ValueType> || IMdSpan<T, T_ValueType>);
;
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/mem/concepts/IGeneratorOrMdSpan.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onAcc/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
/** @file
*
* On some constexpr function signatures `ALPAKA_FN_HOST_ACC` is required for CUDA;
* otherwise a `__host__` function called from a `__host__ __device__` context
* triggers a warning and the generated code is wrong.
*/
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/BoundaryIter.hpp ==
// ==
/* Copyright 2025 Anton Reinhard
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/CVec.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/core/Assert.hpp ==
// ==
/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
#include <type_traits>
//! The assert can be explicitly disabled by defining NDEBUG
#define ALPAKA_ASSERT(...) assert(__VA_ARGS__)
//! Macro which expands to a noop.
//! The macro enforces a semicolon after the call.
#define ALPAKA_NOOP(...) \
do \
{ \
} while(false)
//! ALPAKA_ASSERT_ACC_IMPL is an assert-like macro.
//! It can be disabled by setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor symbol or the NDEBUG preprocessor symbol.
#if !defined(ALPAKA_DISABLE_ASSERT_ACC)
# define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_ASSERT(__VA_ARGS__)
#else
# define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_NOOP(__VA_ARGS__)
#endif
//! ALPAKA_ASSERT_ACC is an assert-like macro.
//!
//! In device code for a GPU or SYCL backend it can be disabled by setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor
//! symbol or the NDEBUG preprocessor symbol. In device code for a native C++ CPU backend and in host code, it is
//! equivalent to ALPAKA_ASSERT, and can be disabled by setting the NDEBUG preprocessor symbol.
#if defined(ALPAKA_LANG_CUDA) && defined(__CUDA_ARCH__)
// CUDA device code
# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
#elif defined(ALPAKA_LANG_HIP) && defined(__HIP_DEVICE_COMPILE__)
// HIP/ROCm device code
# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
#elif defined(ALPAKA_LANG_SYCL) && defined(__SYCL_DEVICE_ONLY__)
// SYCL/oneAPI device code
# if defined(SYCL_EXT_ONEAPI_ASSERT)
# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
# else
# define ALPAKA_ASSERT_ACC(...) ALPAKA_NOOP(__VA_ARGS__)
# endif
// add here any other #elif conditions for non-CPU backends
// ...
#else
// CPU backend, or host code
# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT(__VA_ARGS__)
#endif
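// Illustrative usage sketch (not compiled): a device-side bounds check that can be compiled out via
// NDEBUG or ALPAKA_DISABLE_ASSERT_ACC. `idx` and `extent` are hypothetical kernel-side values.
//
//   ALPAKA_ASSERT_ACC(idx < extent);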
namespace alpaka::core
{
namespace detail
{
template<typename TArg>
struct AssertValueUnsigned
{
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertValueUnsigned(
[[maybe_unused]] TArg const& arg)
{
if constexpr(std::is_signed_v<TArg>)
ALPAKA_ASSERT_ACC(arg >= 0);
// Nothing to do for unsigned types.
}
};
} // namespace detail
    //! This method checks that integral values are greater than or equal to zero.
    //! The implementation prevents warnings when checking this for unsigned types.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TArg>
ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const& arg) -> void
{
detail::AssertValueUnsigned<TArg>::assertValueUnsigned(arg);
}
namespace detail
{
template<typename TLhs, typename TRhs>
struct AssertGreaterThan
{
ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertGreaterThan(
[[maybe_unused]] TRhs const& rhs)
{
if constexpr(std::is_signed_v<TRhs> || (TLhs::value != 0u))
ALPAKA_ASSERT_ACC(TLhs::value > rhs);
// Nothing to do for unsigned types comparing to zero.
}
};
} // namespace detail
//! This function asserts that the integral value TLhs is greater than TRhs.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TLhs, typename TRhs>
ALPAKA_FN_HOST_ACC constexpr auto assertGreaterThan(TRhs const& rhs) -> void
{
detail::AssertGreaterThan<TLhs, TRhs>::assertGreaterThan(rhs);
}
} // namespace alpaka::core
// ==
// == ./include/alpaka/core/Assert.hpp ==
// ============================================================================
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <ostream> // amalgamate: file already included
namespace alpaka
{
/**
* @brief An enum representing the different types of boundary, with LOWER, MIDDLE, and UPPER being valid states,
* and OOB being invalid (out-of-bounds).
*/
enum class BoundaryType : uint32_t
{
LOWER,
MIDDLE,
UPPER,
OOB
};
/**
* @brief An n-dimensional boundary direction. Encodes a single unique boundary of an nD volume, e.g., a specific
* corner of a 2D plane or a side of a 3D cube.
*
* @tparam T_dim The dimensionality of the volume that this is a boundary direction for.
* @tparam T_LowHaloVec The vector type used for the lower halo sizes.
* @tparam T_UpHaloVec The vector type used for the upper halo sizes.
*/
template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
struct BoundaryDirection
{
using T_BoundaryVec = Vec<BoundaryType, T_dim>;
T_BoundaryVec data;
T_LowHaloVec lowerHaloSize;
T_UpHaloVec upperHaloSize;
constexpr BoundaryDirection(
concepts::Vector auto const& boundaries,
T_LowHaloVec const& lower_halo_sizes,
T_UpHaloVec const& upper_halo_sizes)
: data(boundaries)
, lowerHaloSize(lower_halo_sizes)
, upperHaloSize(upper_halo_sizes)
{
}
/** @brief The dimensionality of the whole volume that this is a boundary direction for. Not to be confused
* with boundaryDimensionality().
*/
[[nodiscard]] static constexpr uint32_t dim()
{
return T_dim;
}
/** @brief The dimensionality of the boundary direction. For example, a vertex (corner) of a 3D-volume (cube)
* is 0-dimensional. See also the functions isVertex(), isEdge(), etc.
*/
[[nodiscard]] constexpr uint32_t boundaryDimensionality() const
{
uint32_t c = 0;
for(uint32_t i = 0; i < T_dim; ++i)
{
if(data[i] == BoundaryType::MIDDLE)
++c;
}
return c;
}
/** @brief Return true if this boundary direction describes a vertex, for example the corner of a plane.
*/
[[nodiscard]] constexpr bool isVertex() const
{
return boundaryDimensionality() == 0;
}
/** @brief Return true if this boundary direction describes an edge, for example any of the 12 edges of a cube.
*/
[[nodiscard]] constexpr bool isEdge() const
{
return boundaryDimensionality() == 1;
}
/** @brief Return true if this boundary direction describes a face, for example any of the 6 sides of a cube.
*/
[[nodiscard]] constexpr bool isFace() const
{
return boundaryDimensionality() == 2;
}
/** @brief Return true if this boundary direction describes a cell, for example the interior of a cube or one
* of the 8 cells in a tesseract.
*/
[[nodiscard]] constexpr bool isCell() const
{
return boundaryDimensionality() == 3;
}
/** @brief Return true if this boundary direction describes the interior of a volume, like the 2D interior of a
* plane or the 3D interior of a cube.
*/
[[nodiscard]] constexpr bool isInterior() const
{
return boundaryDimensionality() == dim();
}
[[nodiscard]] constexpr auto operator<=>(BoundaryDirection const&) const = default;
};
/**
* @brief The iterator type for [`BoundaryDirectionsContainer`](@ref)
*
* @tparam T_dim The dimensionality of the volume that this is a boundary direction iterator for.
* @tparam T_LowHaloVec The vector type used for the lower halo sizes.
* @tparam T_UpHaloVec The vector type used for the upper halo sizes.
*/
template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
struct BoundaryDirectionIter
{
using T_BoundaryVec = Vec<BoundaryType, T_dim>;
using difference_type = std::ptrdiff_t;
using value_type = BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec>;
using reference = value_type&;
using const_reference = value_type const&;
using pointer = value_type*;
using const_pointer = value_type const*;
constexpr BoundaryDirectionIter(
T_BoundaryVec const& boundaries,
T_LowHaloVec const& lower_halo_sizes,
T_UpHaloVec const& upper_halo_sizes)
: boundaries(boundaries, lower_halo_sizes, upper_halo_sizes)
, lowerHaloSizes(lower_halo_sizes)
, upperHaloSizes(upper_halo_sizes)
{
}
[[nodiscard]] constexpr const_reference& operator*() const
{
return boundaries;
}
[[nodiscard]] constexpr reference& operator*()
{
return boundaries;
}
constexpr auto& operator++()
{
uint32_t i = T_dim - 1;
bool oob = true;
while(i != static_cast<uint32_t>(-1))
{
switch(boundaries.data[i])
{
case BoundaryType::LOWER:
boundaries.data[i] = BoundaryType::MIDDLE;
i = static_cast<uint32_t>(-1);
oob = false;
break;
case BoundaryType::MIDDLE:
boundaries.data[i] = BoundaryType::UPPER;
i = static_cast<uint32_t>(-1);
oob = false;
break;
case BoundaryType::UPPER:
boundaries.data[i] = BoundaryType::LOWER;
--i;
break;
case BoundaryType::OOB:
[[fallthrough]];
default:
constexpr bool onHost = std::is_same_v<api::Host, ALPAKA_TYPEOF(thisApi())>;
if constexpr(onHost)
assert(false);
else
ALPAKA_ASSERT_ACC(false);
}
}
if(oob)
{
boundaries
= {Vec<BoundaryType, T_dim>([](int) { return BoundaryType::OOB; }),
lowerHaloSizes,
upperHaloSizes};
}
return *this;
}
[[nodiscard]] static consteval auto dim()
{
return T_dim;
}
[[nodiscard]] constexpr auto operator<=>(BoundaryDirectionIter const&) const = default;
private:
BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec> boundaries;
T_LowHaloVec lowerHaloSizes;
T_UpHaloVec upperHaloSizes;
};
/**
* @brief A container for boundary directions of an n-dimensional volume.
* For example, a 1-dimensional (1D) volume has two 0D ends and a 1D center. A 2D volume has 4 0D corners, 4 1D
* edges, and one 2D center. In general, there are 3^n boundaries for an nD volume. This class implements begin(),
* end(), and length(), and can be iterated over.
*
* @tparam T_dim The dimensionality of the volume that this contains boundaries for.
* @tparam T_LowHaloVec The vector type used for the lower halo sizes.
* @tparam T_UpHaloVec The vector type used for the upper halo sizes.
*/
template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
struct BoundaryDirectionsContainer
{
static_assert(T_dim > 0, "0 Dimension Boundary Direction Container is not defined");
constexpr BoundaryDirectionsContainer(T_LowHaloVec const& lowerHaloSizes, T_UpHaloVec const& upperHaloSizes)
: m_lowerHaloSizes(lowerHaloSizes)
, m_upperHaloSizes(upperHaloSizes)
{
}
[[nodiscard]] constexpr BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec> begin() const
{
return BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec>{
Vec<BoundaryType, T_dim>([](int) { return BoundaryType::LOWER; }),
m_lowerHaloSizes,
m_upperHaloSizes};
}
[[nodiscard]] constexpr BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec> end() const
{
return BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec>{
Vec<BoundaryType, T_dim>([](int) { return BoundaryType::OOB; }),
m_lowerHaloSizes,
m_upperHaloSizes};
}
[[nodiscard]] static consteval uint32_t length()
{
return ipow(3u, T_dim);
}
[[nodiscard]] static consteval auto dim()
{
return T_dim;
}
private:
T_LowHaloVec const m_lowerHaloSizes;
T_UpHaloVec const m_upperHaloSizes;
};
template<concepts::Vector LowHaloVecType, concepts::Vector UpHaloVecType>
BoundaryDirectionsContainer(LowHaloVecType const& lowerHalos, UpHaloVecType const& upperHalos)
-> BoundaryDirectionsContainer<LowHaloVecType::dim(), LowHaloVecType, UpHaloVecType>;
/** @brief Construct and return a single boundary direction specifying the middle of a volume.
*/
template<uint32_t T_dim>
[[nodiscard]] constexpr auto makeCoreBoundaryDirection(
concepts::Vector auto const& lowerHalos,
concepts::Vector auto const& upperHalos)
{
return BoundaryDirection<T_dim, ALPAKA_TYPEOF(lowerHalos), ALPAKA_TYPEOF(upperHalos)>{
fillCVec<BoundaryType, T_dim, BoundaryType::MIDDLE>(),
lowerHalos,
upperHalos};
}
/** @brief Construct and return a single boundary direction specifying the middle of a volume with symmetric halos.
*/
template<uint32_t T_dim>
[[nodiscard]] constexpr auto makeCoreBoundaryDirection(concepts::Vector auto const& halos)
{
return BoundaryDirection<T_dim, ALPAKA_TYPEOF(halos), ALPAKA_TYPEOF(halos)>{
fillCVec<BoundaryType, T_dim, BoundaryType::MIDDLE>(),
halos,
halos};
}
/**
* @brief Construct and return a single boundary direction specifying the middle of a volume with all halo sizes
* set to 1.
*/
template<uint32_t T_dim>
consteval auto makeCoreBoundaryDirection()
{
return makeCoreBoundaryDirection<T_dim>(fillCVec<uint32_t, T_dim, 1u>());
}
/** @brief Construct and return a boundary direction container. This container can be iterated over. See
* BoundaryDirectionsContainer.
* This constructor uses a default halo size of 1 everywhere.
*
* @tparam T_dim The dimensionality of the container.
*/
template<uint32_t T_dim>
[[nodiscard]] constexpr auto makeBoundaryDirIterator()
{
auto lowerHalos = fillCVec<uint32_t, T_dim, static_cast<uint32_t>(1)>();
auto upperHalos = fillCVec<uint32_t, T_dim, static_cast<uint32_t>(1)>();
return BoundaryDirectionsContainer{lowerHalos, upperHalos};
}
/** @brief Construct and return a boundary direction container with the given halo sizes.
* This container can be iterated over. See BoundaryDirectionsContainer.
* The dimensionality is inferred from the given haloSizes.
*
* @param haloSizes The halo sizes per dimension. The halos are used for both "ends" of each dimension
* symmetrically.
*/
[[nodiscard]] constexpr auto makeBoundaryDirIterator(concepts::Vector auto const& haloSizes)
{
return BoundaryDirectionsContainer{haloSizes, haloSizes};
}
/** @brief Construct and return a boundary direction container with the given halo sizes.
* This container can be iterated over. See BoundaryDirectionsContainer.
     * The dimensionality is inferred from the given halo sizes, whose dimensionalities are asserted to be equal.
*
* @param lowerHaloSizes The lower end halo sizes per dimension. These are the halos from 0 in each dimension.
* @param upperHaloSizes The upper end halo sizes per dimension. These are the halos to `size()` in each dimension.
*/
[[nodiscard]] constexpr auto makeBoundaryDirIterator(
concepts::Vector auto const& lowerHaloSizes,
concepts::Vector auto const& upperHaloSizes)
{
static_assert(
ALPAKA_TYPEOF(lowerHaloSizes)::dim() == ALPAKA_TYPEOF(upperHaloSizes)::dim(),
"dimension mismatch");
return BoundaryDirectionsContainer{lowerHaloSizes, upperHaloSizes};
}
/** @brief Construct and return a boundary direction container for the given view with default (size 1) halo
* sizes. This container can be iterated over. See BoundaryDirectionsContainer.
* For custom halo sizes, use one of the other overloads.
*
* @param view The given view; only the dimension of the view matters.
*/
[[nodiscard]] constexpr auto makeBoundaryDirIterator(concepts::View auto const& view)
{
return makeBoundaryDirIterator<static_cast<uint32_t>(ALPAKA_TYPEOF(view)::dim())>();
}
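    // Illustrative usage sketch (not compiled): iterate over all 3^2 = 9 boundary directions of a
    // 2D volume with the default halo size of 1 and dispatch on the kind of boundary.
    //
    //   for(auto const& dir : alpaka::makeBoundaryDirIterator<2u>())
    //   {
    //       if(dir.isInterior())
    //           continue; // skip the interior, handle only the real boundaries
    //       if(dir.isVertex())
    //           std::cout << "corner: " << dir << '\n';
    //       else if(dir.isEdge())
    //           std::cout << "edge:   " << dir << '\n';
    //   }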
namespace trait
{
template<typename T>
struct IsBoundaryDirection : std::false_type
{
};
template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
requires(T_dim == T_LowHaloVec::dim() && T_dim == T_UpHaloVec::dim())
struct IsBoundaryDirection<BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec>> : std::true_type
{
};
} // namespace trait
template<typename T>
constexpr bool isBoundaryDirection_v = trait::IsBoundaryDirection<T>::value;
namespace concepts
{
/** @brief Concept checking whether T is a boundary direction.
*/
template<typename T>
concept BoundaryDirection = isBoundaryDirection_v<T>;
} // namespace concepts
std::ostream& operator<<(std::ostream& os, concepts::BoundaryDirection auto const& bd)
{
for(uint32_t i = 0; i < bd.dim(); ++i)
{
switch(bd.data[i])
{
case BoundaryType::LOWER:
os << 'v';
break;
case BoundaryType::MIDDLE:
os << '-';
break;
case BoundaryType::UPPER:
os << '^';
break;
case BoundaryType::OOB:
[[fallthrough]];
default:
os << 'x';
break;
}
}
if(bd.isVertex())
os << " (vertex) ";
if(bd.isEdge())
os << " (edge) ";
if(bd.isFace())
os << " (face) ";
if(bd.isCell())
os << " (cell) ";
if(bd.boundaryDimensionality() >= 4)
os << " (" << bd.boundaryDimensionality() << "D volume)";
return os;
}
} // namespace alpaka
// ==
// == ./include/alpaka/mem/BoundaryIter.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/Iter.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/api.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/IdxRange.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/BoundaryIter.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka
{
template<
concepts::VectorOrScalar T_End,
concepts::Vector T_Begin = typename T_End::UniVec,
concepts::Vector T_Stride = typename T_End::UniVec>
struct IdxRange
{
using IdxType = typename T_End::type;
using IdxVecType = typename T_End::UniVec;
constexpr IdxRange(T_Begin const& begin, T_End const& end, T_Stride const& stride)
: m_begin{begin}
, m_end{end}
, m_stride{stride}
{
}
constexpr IdxRange(T_Begin const& begin, T_End const& end)
: m_begin{begin}
, m_end{end}
, m_stride{T_End::all(1u)}
{
}
constexpr IdxRange(T_End const& extent) : m_begin{T_End::all(0u)}, m_end{extent}, m_stride{T_End::all(1u)}
{
}
static consteval uint32_t dim()
{
return IdxVecType::dim();
}
template<concepts::TypeOrVector<typename T_End::type> T_OpType>
ALPAKA_FN_HOST_ACC constexpr auto operator%(T_OpType const& rhs) const
{
return IdxRange<T_End, T_Begin, ALPAKA_TYPEOF(m_stride * rhs)>{m_begin, m_end, m_stride * rhs};
}
template<concepts::TypeOrVector<typename T_End::type> T_OpType>
ALPAKA_FN_HOST_ACC constexpr auto operator>>(T_OpType const& rhs) const
{
return IdxRange<ALPAKA_TYPEOF(m_end + rhs), ALPAKA_TYPEOF(m_begin + rhs), ALPAKA_TYPEOF(m_stride)>{
m_begin + rhs,
m_end + rhs,
m_stride};
}
template<concepts::TypeOrVector<typename T_End::type> T_OpType>
ALPAKA_FN_HOST_ACC constexpr auto operator<<(T_OpType const& rhs) const
{
return IdxRange<ALPAKA_TYPEOF(m_end - rhs), ALPAKA_TYPEOF(m_begin - rhs), T_Stride>{
m_begin - rhs,
m_end - rhs,
m_stride};
}
constexpr auto distance() const
{
return m_end - m_begin;
}
std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
{
std::string locale_enclosing_begin;
std::string locale_enclosing_end;
size_t enclosing_dim = enclosings.size();
if(enclosing_dim > 0)
{
                /* modulo avoids out-of-bounds access */
locale_enclosing_begin = enclosings[0 % enclosing_dim];
locale_enclosing_end = enclosings[1 % enclosing_dim];
}
std::stringstream stream;
stream << locale_enclosing_begin;
stream << m_begin << separator << m_end << separator << m_stride;
stream << locale_enclosing_end;
return stream.str();
}
T_Begin m_begin;
T_End m_end;
T_Stride m_stride;
using type = typename T_Begin::type;
};
template<uint32_t T_dim, alpaka::concepts::Vector T_LowHaloVec, alpaka::concepts::Vector T_UpHaloVec>
constexpr auto makeDirectionSubRange(
auto const range,
alpaka::BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec> const& boundaryDir)
{
auto m_begin = Vec<uint32_t, T_dim>::all(0u);
auto m_end = Vec<uint32_t, T_dim>::all(0u);
for(uint32_t i = 0; i < T_dim; ++i)
{
switch(boundaryDir.data[i])
{
case BoundaryType::LOWER:
m_begin[i] = range.m_begin[i];
m_end[i] = range.m_begin[i] + boundaryDir.lowerHaloSize[i];
break;
case BoundaryType::UPPER:
m_begin[i] = range.m_end[i] - boundaryDir.upperHaloSize[i];
m_end[i] = range.m_end[i];
break;
case BoundaryType::MIDDLE:
m_begin[i] = range.m_begin[i] + boundaryDir.lowerHaloSize[i];
m_end[i] = range.m_end[i] - boundaryDir.upperHaloSize[i];
break;
case BoundaryType::OOB:
[[fallthrough]];
default:
ALPAKA_ASSERT_ACC(false);
}
}
return IdxRange{m_begin, m_end, range.m_stride};
}
namespace internal
{
template<
typename T_To,
alpaka::concepts::Vector T_End,
alpaka::concepts::Vector T_Begin,
alpaka::concepts::Vector T_Stride>
struct PCast::Op<T_To, IdxRange<T_End, T_Begin, T_Stride>>
{
constexpr decltype(auto) operator()(auto&& input) const
requires std::convertible_to<typename T_End::type, T_To> && (!std::same_as<T_To, typename T_End::type>)
{
return IdxRange{pCast<T_To>(input.m_begin), pCast<T_To>(input.m_end), pCast<T_To>(input.m_stride)};
}
constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, typename T_End::type>
{
return input;
}
};
} // namespace internal
template<concepts::VectorOrScalar T_Extents>
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const&) -> IdxRange<typename trait::getVec_t<T_Extents>::UniVec>;
template<concepts::VectorOrScalar T_Begin, concepts::VectorOrScalar T_End>
ALPAKA_FN_HOST_ACC IdxRange(T_Begin const&, T_End const&) -> IdxRange<
typename trait::getVec_t<T_Begin>::UniVec,
typename trait::getVec_t<T_End>::UniVec,
typename trait::getVec_t<T_End>::UniVec>;
template<concepts::VectorOrScalar T_Begin, concepts::VectorOrScalar T_End, concepts::VectorOrScalar T_Stride>
ALPAKA_FN_HOST_ACC IdxRange(T_Begin const&, T_End const&, T_Stride const&) -> IdxRange<
typename trait::getVec_t<T_Begin>::UniVec,
typename trait::getVec_t<T_End>::UniVec,
typename trait::getVec_t<T_Stride>::UniVec>;
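    // Illustrative usage sketch (not compiled): an IdxRange describes begin, end and stride of an
    // index domain and can be transformed with the operators above. `Vec2` is a hypothetical alias
    // for alpaka::Vec<uint32_t, 2u>.
    //
    //   auto full = alpaka::IdxRange{Vec2::all(16u)}; // [0,16) x [0,16), stride 1
    //   auto strided = full % 2u;                     // keep every second index
    //   auto shifted = full >> 1u;                    // begin and end shifted by +1
    //   // restrict the range to a boundary region, e.g. the interior with halo size 1 -> [1,15) x [1,15)
    //   auto core = alpaka::makeDirectionSubRange(full, alpaka::makeCoreBoundaryDirection<2u>());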
namespace trait
{
template<typename T>
struct IsIndexRange : std::false_type
{
};
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, IdxRange>)
struct IsIndexRange<T> : std::true_type
{
};
template<typename T>
struct IsLazyIndexRange : std::false_type
{
};
} // namespace trait
template<typename T>
constexpr bool isIndexRange_v = trait::IsIndexRange<T>::value;
template<typename T>
constexpr bool isLazyIndexRange_v = trait::IsLazyIndexRange<T>::value;
namespace concepts
{
/** Concept to check if a type is an index range
*
* @tparam T Type to check
* @tparam T_ValueType enforce a value type of the index range, if not provided the type is not checked
* @tparam T_dim enforce a dimensionality of the index range, if not provided the value is not checked
*/
template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
concept IdxRange
= alpaka::isIndexRange_v<T>
&& (std::same_as<T_ValueType, typename T::IdxType> || std::same_as<T_ValueType, alpaka::NotRequired>)
&& ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));
/** Concept to check if a type is a lazy-evaluated index range
*
* @attention the value type and dimension can not be evaluated for lazy index ranges.
*
* @tparam T Type to check
*/
template<typename T>
concept LazyIdxRange = alpaka::isLazyIndexRange_v<T>;
template<typename T>
concept IdxRangeDescription = alpaka::isIndexRange_v<T> || isLazyIndexRange_v<T>;
} // namespace concepts
} // namespace alpaka
// ==
// == ./include/alpaka/mem/IdxRange.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/ThreadSpace.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/CVec.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka
{
template<concepts::Vector T_ThreadIdx, concepts::Vector T_ThreadCount>
struct ThreadSpace
{
constexpr ThreadSpace(T_ThreadIdx const& threadIdx, T_ThreadCount const& threadCount)
: m_threadIdx(threadIdx)
, m_threadCount(threadCount)
{
}
std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
{
std::string locale_enclosing_begin;
std::string locale_enclosing_end;
size_t enclosing_dim = enclosings.size();
if(enclosing_dim > 0)
{
                /* modulo avoids out-of-bounds access */
locale_enclosing_begin = enclosings[0 % enclosing_dim];
locale_enclosing_end = enclosings[1 % enclosing_dim];
}
std::stringstream stream;
stream << locale_enclosing_begin;
stream << m_threadIdx << separator << m_threadCount;
stream << locale_enclosing_end;
return stream.str();
}
constexpr auto size() const
{
return m_threadCount;
}
constexpr auto idx() const
{
return m_threadIdx;
}
template<concepts::CVector T_CSelect>
constexpr ThreadSpace mapTo(T_CSelect selection) const requires(T_ThreadIdx::dim() <= T_CSelect::dim())
{
static_assert(T_ThreadIdx::dim() == T_CSelect::dim(), "can not map to a larger dimension");
return *this;
}
template<concepts::CVector T_CSelect>
constexpr ThreadSpace mapTo(T_CSelect selection) const requires(T_ThreadIdx::dim() > T_CSelect::dim())
{
using IdxType = typename T_ThreadIdx::type;
constexpr uint32_t dim = T_ThreadIdx::dim();
auto allElements = iotaCVec<IdxType, dim>();
constexpr auto notSelectedDims = filter(allElements, T_CSelect{});
auto threadIndex = m_threadIdx;
auto numThreads = m_threadCount;
// map not selected dimensions to the slowest selected dimension
for(uint32_t x = 0u; x < notSelectedDims.dim(); ++x)
{
auto d = notSelectedDims[x];
auto old = threadIndex[d];
threadIndex[d] = 0u;
threadIndex[T_CSelect{}[0]] += old * numThreads[T_CSelect{}[0]];
}
for(uint32_t x = 0u; x < notSelectedDims.dim(); ++x)
{
auto d = notSelectedDims[x];
auto old = numThreads[d];
numThreads[d] = 1u;
numThreads[T_CSelect{}[0]] *= old;
}
return {threadIndex, numThreads};
}
T_ThreadIdx m_threadIdx;
T_ThreadCount m_threadCount;
using type = typename T_ThreadIdx::type;
};
namespace internal
{
template<typename T_To, typename T_ThreadIdx, typename T_ThreadCount>
struct PCast::Op<T_To, ThreadSpace<T_ThreadIdx, T_ThreadCount>>
{
constexpr decltype(auto) operator()(auto&& input) const
requires std::convertible_to<typename T_ThreadIdx::type, T_To>
&& (!std::same_as<T_To, typename T_ThreadIdx::type>)
{
return ThreadSpace{pCast<T_To>(input.m_threadIdx), pCast<T_To>(input.m_threadCount)};
}
constexpr decltype(auto) operator()(auto&& input) const
requires std::same_as<T_To, typename T_ThreadIdx::type>
{
return input;
}
};
} // namespace internal
template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
constexpr auto get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount> const& v) requires(I == 0u)
{
return v.m_threadIdx;
}
template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
constexpr auto& get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>& v) requires(I == 0u)
{
return v.m_threadIdx;
}
template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
constexpr auto get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount> const& v) requires(I == 1u)
{
return v.m_threadCount;
}
template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
constexpr auto& get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>& v) requires(I == 1u)
{
return v.m_threadCount;
}
} // namespace alpaka
namespace std
{
template<typename T_ThreadIdx, typename T_ThreadCount>
struct tuple_size<alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>>
{
static constexpr std::size_t value = 2u;
};
template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
struct tuple_element<I, alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>>
{
using type = std::conditional_t<I == 0u, T_ThreadIdx, T_ThreadCount>;
};
} // namespace std
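// Illustrative sketch (not compiled): the get/tuple_size/tuple_element specializations above enable
// structured bindings for ThreadSpace. `threadSpace` is a hypothetical instance provided elsewhere.
//
//   auto [threadIdx, threadCount] = threadSpace; // index within the layer and the layer's extent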
// ==
// == ./include/alpaka/mem/ThreadSpace.hpp ==
// ============================================================================
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/WorkGroup.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/mem/ThreadSpace.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/internal/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/UniqueId.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/tag.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Tag.hpp" // amalgamate: file already inlined
// #include "alpaka/core/util.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
namespace alpaka::onAcc
{
/** Origin of index domains
*
* An origin is used to query the index domain within a block or grid.
*/
namespace origin
{
ALPAKA_TAG(block);
ALPAKA_TAG(grid);
} // namespace origin
/** Unit of index domains
*
* A unit is used to describe the quantity of the index domain with respect to an origin
*/
namespace unit
{
ALPAKA_TAG(threads);
ALPAKA_TAG(blocks);
} // namespace unit
namespace trait
{
template<typename T>
struct IsOrigin : std::false_type
{
};
template<>
struct IsOrigin<ALPAKA_TYPEOF(origin::block)> : std::true_type
{
};
template<>
struct IsOrigin<ALPAKA_TYPEOF(origin::grid)> : std::true_type
{
};
template<typename T>
struct IsUnit : std::false_type
{
};
template<>
struct IsUnit<ALPAKA_TYPEOF(unit::threads)> : std::true_type
{
};
template<>
struct IsUnit<ALPAKA_TYPEOF(unit::blocks)> : std::true_type
{
};
} // namespace trait
template<typename T>
constexpr bool isOrigin_v = trait::IsOrigin<T>::value;
template<typename T>
constexpr bool isUnit_v = trait::IsUnit<T>::value;
namespace concepts
{
template<typename T>
concept Origin = isOrigin_v<T>;
template<typename T>
concept Unit = isUnit_v<T>;
} // namespace concepts
} // namespace alpaka::onAcc
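// Illustrative sketch (not library documentation): an (origin, unit) pair names an index domain and
// is evaluated lazily against the accelerator by the helpers defined further below, e.g.:
//
//   onAcc::WorkerGroup{origin::grid, unit::threads}   // threads within the whole grid
//   onAcc::WorkerGroup{origin::grid, unit::blocks}    // blocks within the grid
//   onAcc::WorkerGroup{origin::block, unit::threads}  // threads within a single block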
// ==
// == ./include/alpaka/onAcc/tag.hpp ==
// ============================================================================
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
namespace alpaka::onAcc
{
namespace internalCompute
{
struct Sync
{
template<typename T_Acc, alpaka::concepts::Layer T_Scope>
struct Op
{
constexpr auto operator()(T_Acc const& acc, T_Scope const scope) const;
};
};
constexpr void sync(auto const& acc, alpaka::concepts::Layer auto const scope)
{
Sync::Op<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(scope)>{}(acc, scope);
}
struct SharedMemory
{
template<typename T, size_t T_uniqueId, typename T_Acc>
struct Static
{
constexpr decltype(auto) operator()(auto const& acc) const
{
return acc[layer::shared].template allocVar<T, T_uniqueId>();
}
};
template<typename T, typename T_Acc>
struct Dynamic
{
constexpr auto operator()(auto const& acc) const -> T*
{
static_assert(
T_Acc::hasKey(object::dynSharedMemBytes),
"Dynamic shared memory not configured. Add member 'dynSharedMemBytes' to the kernel or "
"specialize 'onHost::trait:BlockDynSharedMemBytes'!");
uint32_t numBytes = acc[object::dynSharedMemBytes];
return acc[layer::dynShared].template allocDynamic<T, uniqueId()>(numBytes);
}
};
};
template<typename T, size_t T_uniqueId>
constexpr decltype(auto) declareSharedVar(auto const& acc)
{
return SharedMemory::Static<T, T_uniqueId, std::decay_t<decltype(acc)>>{}(acc);
}
template<typename T>
constexpr auto declareDynamicSharedMem(auto const& acc) -> T*
{
return SharedMemory::Dynamic<T, std::decay_t<decltype(acc)>>{}(acc);
}
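// A minimal usage sketch (assumptions: 'acc' is the accelerator passed to a kernel and these
// internalCompute helpers are reachable from the call site; element types and the unique id are
// illustrative only):
//
//   auto& tile = internalCompute::declareSharedVar<float, 42u>(acc);       // static shared variable
//   float* scratch = internalCompute::declareDynamicSharedMem<float>(acc); // dynamic shared memory
//   // Dynamic shared memory requires the kernel to provide 'dynSharedMemBytes' or a specialization
//   // of onHost::trait::BlockDynSharedMemBytes (see the static_assert above).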
struct Atomic
{
/** Implements an atomic operation */
template<typename TOp, typename TAtomicImpl, typename T, typename T_Scope, typename TSfinae = void>
struct Op;
};
/** Get the index of an object within a layer in the selected units */
struct GetIdxWithin
{
template<typename T_Acc, typename T_Origin, typename T_Unit>
struct Op
{
constexpr alpaka::concepts::Vector auto operator()(T_Acc const& acc, T_Origin origin, T_Unit unit)
const;
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::threads)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::block),
ALPAKA_TYPEOF(unit::threads)) const
{
return acc[layer::thread].idx();
}
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::threads)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::grid),
ALPAKA_TYPEOF(unit::threads)) const
{
return acc[layer::thread].count() * acc[layer::block].idx() + acc[layer::thread].idx();
}
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::blocks)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::grid),
ALPAKA_TYPEOF(unit::blocks)) const
{
return acc[layer::block].idx();
}
};
};
/** Get the number of elements in a layer in the selected units */
struct GetExtentsOf
{
template<typename T_Acc, typename T_Origin, typename T_Unit>
struct Op
{
constexpr alpaka::concepts::Vector auto operator()(T_Acc const& acc, T_Origin origin, T_Unit unit)
const;
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::threads)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::block),
ALPAKA_TYPEOF(unit::threads)) const
{
return acc[layer::thread].count();
}
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::blocks)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::grid),
ALPAKA_TYPEOF(unit::blocks)) const
{
return acc[layer::block].count();
}
};
template<typename T_Acc>
struct Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::threads)>
{
constexpr alpaka::concepts::Vector auto operator()(
T_Acc const& acc,
ALPAKA_TYPEOF(origin::grid),
ALPAKA_TYPEOF(unit::threads)) const
{
return acc[layer::block].count() * acc[layer::thread].count();
}
};
};
struct MemoryFence
{
// Backend specializations provide the definition.
template<typename T_Acc, typename T_Scope>
struct Op
{
constexpr void operator()(T_Acc const& acc, T_Scope const scope) const;
};
};
} // namespace internalCompute
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/onAcc/internal/interface.hpp ==
// ============================================================================
// #include "alpaka/onAcc/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka::onAcc
{
template<bool T_multiDimensional = true>
struct MultiDimensional : std::bool_constant<T_multiDimensional>
{
};
constexpr auto linearized = MultiDimensional<false>{};
template<
typename T_ThreadIdxOrOrigin,
typename T_NumThreadsOrUnit,
typename T_MultiDimensional = MultiDimensional<true>>
struct WorkerGroup
{
/** WorkerGroup constructor
*
* @param threadIdxOrOrigin the index of the thread or onAcc::origin
* @param numThreadsOrUnit the number of threads or the onAcc::unit
* @param multiDimensional keeps the dimensionality of both input parameters; if 'linearized' is used, the
* worker group is reduced to a one-dimensional group.
*/
constexpr WorkerGroup(
T_ThreadIdxOrOrigin threadIdxOrOrigin,
T_NumThreadsOrUnit numThreadsOrUnit,
T_MultiDimensional = MultiDimensional<true>{})
: m_threadIdxOrOrigin{threadIdxOrOrigin}
, m_numThreadsOrUnit{numThreadsOrUnit}
{
}
constexpr auto size(auto const& acc) const
{
return getThreadSpace(acc).size();
}
constexpr auto idx(auto const& acc) const
{
return getThreadSpace(acc).idx();
}
private:
template<typename T_ThreadGroup, typename T_ThreadIdxOrOriginRange>
friend struct DomainSpec;
/** get the thread configuration
*
* Implementation specialization for vectors.
*/
constexpr auto getThreadSpace([[maybe_unused]] auto const& acc) const
requires(isVector_v<T_ThreadIdxOrOrigin> && isVector_v<T_NumThreadsOrUnit>)
{
if constexpr(T_MultiDimensional::value == false)
return ThreadSpace{
Vec{linearize(m_numThreadsOrUnit, m_threadIdxOrOrigin)},
Vec{m_numThreadsOrUnit.product()}};
else
return ThreadSpace{m_threadIdxOrOrigin, m_numThreadsOrUnit};
}
/** get the thread configuration
*
* Implementation specialization for lazy evaluated acc properties based on an origin and unit.
*/
constexpr auto getThreadSpace(auto const& acc) const
requires(isOrigin_v<T_ThreadIdxOrOrigin> && isUnit_v<T_NumThreadsOrUnit>)
{
auto const idx
= internalCompute::GetIdxWithin::Op<ALPAKA_TYPEOF(acc), T_ThreadIdxOrOrigin, T_NumThreadsOrUnit>{}(
acc,
m_threadIdxOrOrigin,
m_numThreadsOrUnit);
auto const extent
= internalCompute::GetExtentsOf::Op<ALPAKA_TYPEOF(acc), T_ThreadIdxOrOrigin, T_NumThreadsOrUnit>{}(
acc,
m_threadIdxOrOrigin,
m_numThreadsOrUnit);
if constexpr(T_MultiDimensional::value == false)
return ThreadSpace{Vec{linearize(extent, idx)}, Vec{extent.product()}};
else
return ThreadSpace{idx, extent};
}
private:
T_ThreadIdxOrOrigin m_threadIdxOrOrigin;
T_NumThreadsOrUnit m_numThreadsOrUnit;
};
namespace worker
{
constexpr auto threadsInGrid = WorkerGroup{origin::grid, unit::threads};
constexpr auto blocksInGrid = WorkerGroup{origin::grid, unit::blocks};
constexpr auto threadsInBlock = WorkerGroup{origin::block, unit::threads};
constexpr auto linearThreadsInGrid = WorkerGroup{origin::grid, unit::threads, linearized};
constexpr auto linearThreadsInBlock = WorkerGroup{origin::block, unit::threads, linearized};
constexpr auto linearBlocksInGrid = WorkerGroup{origin::grid, unit::blocks, linearized};
} // namespace worker
} // namespace alpaka::onAcc
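// A minimal usage sketch inside a kernel body (assumption: 'acc' is the accelerator argument passed
// to the kernel functor):
//
//   auto const mdIdx = onAcc::worker::threadsInBlock.idx(acc);        // multi-dimensional index
//   auto const linId = onAcc::worker::linearThreadsInGrid.idx(acc);   // one-dimensional index
//   auto const total = onAcc::worker::linearThreadsInGrid.size(acc);  // one-dimensional extent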
// ==
// == ./include/alpaka/onAcc/WorkGroup.hpp ==
// ============================================================================
// #include "alpaka/onAcc/layout.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/tag.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/traverse.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/FlatIdxContainer.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/api.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/IdxRange.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/ThreadSpace.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/layout.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
// #include <ranges> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onAcc
{
template<typename T_IdxRange, typename T_ThreadSpace, typename T_IdxMapperFn, alpaka::concepts::CVector T_CSelect>
class FlatIdxContainer : private T_IdxMapperFn
{
void _()
{
static_assert(std::ranges::forward_range<FlatIdxContainer>);
static_assert(std::ranges::borrowed_range<FlatIdxContainer>);
static_assert(std::ranges::range<FlatIdxContainer>);
static_assert(std::ranges::input_range<FlatIdxContainer>);
}
public:
using IdxType = typename T_IdxRange::IdxType;
static constexpr uint32_t dim = T_IdxRange::dim();
using IdxVecType = Vec<IdxType, dim>;
ALPAKA_FN_ACC inline FlatIdxContainer(
T_IdxRange const& idxRange,
T_ThreadSpace const& threadSpace,
T_IdxMapperFn idxMapping,
T_CSelect const& = T_CSelect{})
: T_IdxMapperFn{std::move(idxMapping)}
, m_idxRange(idxRange)
, m_threadSpace{threadSpace}
{
// std::cout << "iter:" << m_idxRange.toString() << " " << m_threadSpace.toString() << std::endl;
}
constexpr FlatIdxContainer(FlatIdxContainer const&) = default;
constexpr FlatIdxContainer(FlatIdxContainer&&) = default;
class const_iterator;
/** special implementation to define the end
*
* Only a scalar value must be stored, which reduces the register footprint.
* The end is reached when the index is behind or equal to the extent of the slowest-moving dimension.
*/
class const_iterator_end
{
friend class FlatIdxContainer;
void _()
{
static_assert(std::forward_iterator<const_iterator_end>);
static_assert(std::input_iterator<const_iterator_end>);
}
ALPAKA_FN_ACC inline const_iterator_end(IdxType const& end) : m_extentSlowDim{end}
{
}
constexpr IdxType operator*() const
{
return m_extentSlowDim;
}
public:
constexpr bool operator==(const_iterator_end const& other) const
{
return (m_extentSlowDim == other.m_extentSlowDim);
}
constexpr bool operator!=(const_iterator_end const& other) const
{
return !(*this == other);
}
constexpr bool operator==(const_iterator const& other) const
{
return (m_extentSlowDim <= other.slowCurrent());
}
constexpr bool operator!=(const_iterator const& other) const
{
return !(*this == other);
}
private:
IdxType m_extentSlowDim;
};
class const_iterator
{
friend class FlatIdxContainer;
friend class const_iterator_end;
static constexpr uint32_t iterDim = T_CSelect::dim();
using IterIdxVecType = Vec<IdxType, iterDim>;
void _()
{
static_assert(std::forward_iterator<const_iterator>);
static_assert(std::input_iterator<const_iterator>);
}
constexpr const_iterator(
alpaka::concepts::Vector auto offsetMD,
IdxType const current,
IdxType const stride,
IdxType const end,
alpaka::concepts::Vector auto const extentMD,
alpaka::concepts::Vector auto const strideMD)
: m_offsetMD{offsetMD}
, m_current{current}
, m_end{end}
, m_stride{stride}
, m_extentMD{extentMD}
, m_strideMD{strideMD}
{
}
ALPAKA_FN_ACC constexpr IdxType slowCurrent() const
{
return m_current;
}
public:
constexpr IdxVecType operator*() const
{
auto result = m_offsetMD;
result.ref(T_CSelect{}) += mapToND(m_extentMD, m_current) * m_strideMD;
return result;
}
// pre-increment the iterator
ALPAKA_FN_ACC inline const_iterator& operator++()
{
m_current += m_stride;
return *this;
}
// post-increment the iterator
ALPAKA_FN_ACC inline const_iterator operator++(int)
{
const_iterator old = *this;
++(*this);
return old;
}
constexpr bool operator==(const_iterator const& other) const
{
return ((**this) == *other);
}
constexpr bool operator!=(const_iterator const& other) const
{
return !(*this == other);
}
constexpr bool operator==(const_iterator_end const& other) const
{
return (slowCurrent() >= *other);
}
constexpr bool operator!=(const_iterator_end const& other) const
{
return !(*this == other);
}
private:
IdxVecType m_offsetMD;
// modified by the pre/post-increment operator
IdxType m_current;
// non-const to support iterator copy and assignment
IdxType m_end;
IdxType m_stride;
IterIdxVecType m_extentMD;
IterIdxVecType m_strideMD;
};
ALPAKA_FN_ACC inline const_iterator begin() const
{
constexpr auto selectedDims = T_CSelect{};
auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);
if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
{
auto groupOffset = threadIdx * m_idxRange.m_stride;
groupOffset.ref(selectedDims) -= groupOffset[selectedDims];
auto begin = m_idxRange.m_begin + groupOffset;
auto linearCurrent = linearize(numThreads[selectedDims], threadIdx[selectedDims]);
auto linearStride = numThreads[selectedDims].product();
auto strideMD = m_idxRange.m_stride[selectedDims];
auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);
return const_iterator(begin, linearCurrent, linearStride, extentMD.product(), extentMD, strideMD);
}
else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
{
auto groupOffset = threadIdx * m_idxRange.m_stride;
groupOffset.ref(selectedDims) -= groupOffset[selectedDims];
auto begin = m_idxRange.m_begin + groupOffset;
auto strideMD = m_idxRange.m_stride[selectedDims];
auto numElements = divCeil(
m_idxRange.distance()[selectedDims],
m_threadSpace.m_threadCount[selectedDims] * strideMD)
.product();
auto linearCurrent
= linearize(m_threadSpace.m_threadCount[selectedDims], threadIdx[selectedDims]) * numElements;
auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);
return const_iterator(
begin,
linearCurrent,
IdxType{1u},
std::min(linearCurrent + numElements, extentMD.product()),
extentMD,
strideMD);
}
}
ALPAKA_FN_ACC inline const_iterator_end end() const
{
constexpr auto selectedDims = T_CSelect{};
auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);
if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
{
auto extentMD = divCeil(m_idxRange.distance()[selectedDims], m_idxRange.m_stride[selectedDims]);
return const_iterator_end(extentMD.product());
}
else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
{
auto strideMD = m_idxRange.m_stride[selectedDims];
auto numElements
= divCeil(m_idxRange.distance()[selectedDims], numThreads[selectedDims] * strideMD).product();
auto linearCurrent = linearize(numThreads[selectedDims], threadIdx[selectedDims]) * numElements;
auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);
return const_iterator_end(std::min(linearCurrent + numElements, extentMD.product()));
}
}
ALPAKA_FN_HOST_ACC constexpr auto operator[](alpaka::concepts::CVector auto const iterDir) const
{
return FlatIdxContainer<T_IdxRange, T_ThreadSpace, T_IdxMapperFn, ALPAKA_TYPEOF(iterDir)>(
m_idxRange,
m_threadSpace,
T_IdxMapperFn{});
}
private:
T_IdxRange m_idxRange;
T_ThreadSpace m_threadSpace;
};
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/mem/FlatIdxContainer.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/TiledIdxContainer.hpp ==
// ==
/* Copyright 2024 Andrea Bocci, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/api.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/core/PP.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/IdxRange.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/ThreadSpace.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/layout.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
// #include <ranges> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace detail
{
/** Store reduced vector
*
* The stored vector can be reduced by one dimension because the slowest dimension is never reset to zero
* after the initialization.
*/
template<typename T_Type, uint32_t T_dim>
struct ReducedVector : private Vec<T_Type, T_dim - 1u>
{
constexpr ReducedVector(Vec<T_Type, T_dim> const& first)
: Vec<T_Type, T_dim - 1u>{first.template rshrink<T_dim - 1u>()}
{
}
constexpr decltype(auto) operator[](T_Type idx) const
{
return Vec<T_Type, T_dim - 1u>::operator[](idx - 1u);
}
constexpr decltype(auto) operator[](T_Type idx)
{
return Vec<T_Type, T_dim - 1u>::operator[](idx - 1u);
}
};
template<typename T_Type>
struct ReducedVector<T_Type, 1u>
{
constexpr ReducedVector(Vec<T_Type, 1u> const&)
{
}
};
} // namespace detail
template<
alpaka::concepts::IdxRange T_IdxRange,
typename T_ThreadSpace,
typename T_IdxMapperFn,
alpaka::concepts::CVector T_CSelect>
class TiledIdxContainer
{
void _()
{
static_assert(std::ranges::forward_range<TiledIdxContainer>);
static_assert(std::ranges::borrowed_range<TiledIdxContainer>);
static_assert(std::ranges::range<TiledIdxContainer>);
static_assert(std::ranges::input_range<TiledIdxContainer>);
}
public:
using IdxType = typename T_IdxRange::IdxType;
static constexpr uint32_t dim = T_IdxRange::dim();
using IdxVecType = Vec<IdxType, dim>;
ALPAKA_FN_ACC inline TiledIdxContainer(
T_IdxRange const& idxRange,
T_ThreadSpace const& threadSpace,
T_IdxMapperFn idxMapping,
T_CSelect const& = T_CSelect{})
: m_idxRange(idxRange)
, m_threadSpace{threadSpace}
{
// std::cout << "iter:" << m_idxRange.toString() << " " << m_threadSpace.toString() << std::endl;
}
constexpr TiledIdxContainer(TiledIdxContainer const&) = default;
constexpr TiledIdxContainer(TiledIdxContainer&&) = default;
class const_iterator;
/** special implementation to define the end
*
* Only a scalar value must be stored, which reduces the register footprint.
* The end is reached when the index is behind or equal to the extent of the slowest-moving dimension.
*/
class const_iterator_end
{
friend class TiledIdxContainer;
void _()
{
static_assert(std::forward_iterator<const_iterator_end>);
}
ALPAKA_FN_ACC inline const_iterator_end(alpaka::concepts::Vector auto const& extent)
: m_extentSlowDim{extent[T_CSelect{}][0]}
{
}
constexpr IdxType operator*() const
{
return m_extentSlowDim;
}
public:
constexpr bool operator==(const_iterator_end const& other) const
{
return (m_extentSlowDim == other.m_extentSlowDim);
}
constexpr bool operator!=(const_iterator_end const& other) const
{
return !(*this == other);
}
constexpr bool operator==(const_iterator const& other) const
{
return (m_extentSlowDim <= other.slowCurrent);
}
constexpr bool operator!=(const_iterator const& other) const
{
return !(*this == other);
}
private:
IdxType m_extentSlowDim;
};
class const_iterator
{
friend class TiledIdxContainer;
friend class const_iterator_end;
static constexpr uint32_t iterDim = T_CSelect::dim();
using IterIdxVecType = Vec<IdxType, iterDim>;
void _()
{
static_assert(std::forward_iterator<const_iterator>);
static_assert(std::input_iterator<const_iterator>);
}
constexpr const_iterator(
alpaka::concepts::Vector auto const offset,
alpaka::concepts::Vector auto const first,
alpaka::concepts::Vector auto const extent,
alpaka::concepts::Vector auto const stride)
: m_current{first + offset}
, m_stride{stride[T_CSelect{}]}
, m_extent{(extent + offset)[T_CSelect{}]}
, m_first((m_current)[T_CSelect{}])
{
// range check is only required for iterators with more than one dimension
if constexpr(iterDim > 1u)
{
// invalidate current if one dimension is out of range.
bool isIndexValid = true;
for(uint32_t d = 1u; d < iterDim; ++d)
isIndexValid = isIndexValid && (m_first[d] < m_extent[d]);
if(!isIndexValid)
m_current[T_CSelect{}[0]] = m_extent[0];
}
// std::cout << "const iter " << m_current << m_extent << m_stride << std::endl;
}
ALPAKA_FN_ACC constexpr IdxType slowCurrent() const
{
return m_current[T_CSelect{}[0]];
}
public:
constexpr IdxVecType operator*() const
{
return m_current;
}
// pre-increment the iterator
ALPAKA_FN_ACC inline const_iterator& operator++()
{
for(uint32_t d = 0; d < iterDim; ++d)
{
uint32_t const idx = iterDim - 1u - d;
m_current[T_CSelect{}[idx]] += m_stride[idx];
if constexpr(iterDim != 1u)
{
if(idx >= 1u && m_current[T_CSelect{}[idx]] >= m_extent[idx])
{
m_current[T_CSelect{}[idx]] = m_first[idx];
}
else
break;
}
}
return *this;
}
// post-increment the iterator
ALPAKA_FN_ACC inline const_iterator operator++(int)
{
const_iterator old = *this;
++(*this);
return old;
}
constexpr bool operator==(const_iterator const& other) const
{
return (m_current == other.m_current);
}
constexpr bool operator!=(const_iterator const& other) const
{
return !(*this == other);
}
constexpr bool operator==(const_iterator_end const& other) const
{
return (slowCurrent() >= *other);
}
constexpr bool operator!=(const_iterator_end const& other) const
{
return !(*this == other);
}
private:
// modified by the pre/post-increment operator
IdxVecType m_current;
// non-const to support iterator copy and assignment
IterIdxVecType m_stride;
IterIdxVecType m_extent;
detail::ReducedVector<IdxType, iterDim> m_first;
};
ALPAKA_FN_ACC inline const_iterator begin() const
{
constexpr auto selectedDims = T_CSelect{};
auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);
if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
{
return const_iterator(
m_idxRange.m_begin,
threadIdx * m_idxRange.m_stride,
m_idxRange.distance(),
numThreads * m_idxRange.m_stride);
}
else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
{
auto extent = m_idxRange.distance();
auto numElements = divCeil(extent, m_idxRange.m_stride * numThreads);
auto first = threadIdx * numElements * m_idxRange.m_stride;
return const_iterator(
m_idxRange.m_begin,
first,
extent.min(first + numElements * m_idxRange.m_stride),
m_idxRange.m_stride);
}
}
ALPAKA_FN_ACC inline const_iterator_end end() const
{
constexpr auto selectedDims = T_CSelect{};
auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);
if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
{
return const_iterator_end(m_idxRange.m_begin + m_idxRange.distance());
}
else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
{
auto extent = m_idxRange.distance();
auto numElements = divCeil(extent, m_idxRange.m_stride * numThreads);
auto first = threadIdx * numElements * m_idxRange.m_stride;
return const_iterator_end(m_idxRange.m_begin + extent.min(first + numElements * m_idxRange.m_stride));
}
}
ALPAKA_FN_HOST_ACC constexpr auto operator[](alpaka::concepts::CVector auto const iterDir) const
{
return TiledIdxContainer<T_IdxRange, T_ThreadSpace, T_IdxMapperFn, ALPAKA_TYPEOF(iterDir)>(
m_idxRange,
m_threadSpace,
T_IdxMapperFn{});
}
private:
T_IdxRange m_idxRange;
T_ThreadSpace m_threadSpace;
};
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/mem/TiledIdxContainer.hpp ==
// ============================================================================
namespace alpaka::onAcc
{
namespace traverse
{
/** Linearize the index domain for traversing.
*
* Maps each linear index into the M-dimensional index space.
* Mapping the linear index to an MD-index increases the number of computations (multiplications and
* additions) and can therefore slow down the performance.
*/
struct Flat
{
ALPAKA_FN_HOST_ACC static constexpr auto make(
auto const& idxRange,
auto const& threadSpace,
auto const& idxLayout,
alpaka::concepts::CVector auto const& cSelect)
{
return FlatIdxContainer{idxRange, threadSpace, idxLayout, cSelect};
}
};
constexpr auto flat = Flat{};
/** Traversing the index domain with MD-tiles
*
* The worker specification is seen as an MD-tile and the index space is traversed in a tiled, strided
* way. No multiplications are required (only additions), therefore fewer computations are needed
* compared to @see Flat.
*/
struct Tiled
{
ALPAKA_FN_HOST_ACC static constexpr auto make(
auto const& idxRange,
auto const& threadSpace,
auto const& idxLayout,
alpaka::concepts::CVector auto const& cSelect)
{
return TiledIdxContainer{idxRange, threadSpace, idxLayout, cSelect};
}
};
constexpr auto tiled = Tiled{};
} // namespace traverse
} // namespace alpaka::onAcc
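// Illustrative sketch of the common factory interface (the concrete arguments 'idxRange',
// 'threadSpace' and 'cSelect' are assumptions; layout::Strided is defined elsewhere in alpaka):
//
//   auto flatRange  = onAcc::traverse::Flat::make(idxRange, threadSpace, layout::Strided{}, cSelect);
//   auto tiledRange = onAcc::traverse::Tiled::make(idxRange, threadSpace, layout::Strided{}, cSelect);
//
// Flat enumerates a single linear counter and maps it back to an MD index (extra multiplications),
// Tiled keeps an MD counter and only needs additions per increment.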
// ==
// == ./include/alpaka/onAcc/traverse.hpp ==
// ============================================================================
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
// #include <ranges> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace trait
{
template<typename T>
struct IsIdxMapping : std::false_type
{
};
template<>
struct IsIdxMapping<layout::Strided> : std::true_type
{
};
template<>
struct IsIdxMapping<layout::Optimized> : std::true_type
{
};
template<>
struct IsIdxMapping<layout::Contiguous> : std::true_type
{
};
template<typename T>
constexpr bool isIdxMapping_v = IsIdxMapping<T>::value;
template<typename T>
struct IsIdxTraversing : std::false_type
{
};
template<>
struct IsIdxTraversing<traverse::Flat> : std::true_type
{
};
template<>
struct IsIdxTraversing<traverse::Tiled> : std::true_type
{
};
template<typename T>
constexpr bool isIdxTraversing_v = IsIdxTraversing<T>::value;
} // namespace trait
namespace concepts
{
template<typename T>
concept IdxMapping = trait::isIdxMapping_v<T>;
template<typename T>
concept IdxTraversing = trait::isIdxTraversing_v<T>;
} // namespace concepts
namespace internal
{
struct MakeIter
{
/* create iterator
*
* ALPAKA_FN_HOST_ACC is required for CUDA, otherwise a "__host__ function called from __host__ __device__
* function" warning pops up and the generated code is wrong.
*/
template<
typename T_ScalarIdxType,
typename T_Acc,
typename T_DomainSpec,
typename T_Traverse,
typename T_IdxMapping>
struct Op
{
ALPAKA_FN_HOST_ACC constexpr auto operator()(
T_Acc const& acc,
T_DomainSpec const& domainSpec,
[[maybe_unused]] T_Traverse traverse,
T_IdxMapping idxMapping) const
requires std::is_same_v<ALPAKA_TYPEOF(idxMapping), layout::Optimized>
{
auto adjIdxMapping = adjustMapping(acc);
auto const idxRange = domainSpec.getIdxRange(acc);
auto const threadSpace = domainSpec.getThreadSpace(acc);
using IdxType = std::conditional_t<
std::is_same_v<void, T_ScalarIdxType>,
typename ALPAKA_TYPEOF(idxRange)::IdxType,
T_ScalarIdxType>;
return T_Traverse::make(
pCast<IdxType>(idxRange),
pCast<IdxType>(threadSpace),
adjIdxMapping,
iotaCVec<
typename ALPAKA_TYPEOF(idxRange.distance())::type,
ALPAKA_TYPEOF(idxRange.distance())::dim()>());
}
ALPAKA_FN_HOST_ACC constexpr auto operator()(
T_Acc const& acc,
T_DomainSpec const& domainSpec,
[[maybe_unused]] T_Traverse traverse,
T_IdxMapping idxMapping) const
{
auto const idxRange = domainSpec.getIdxRange(acc);
auto const threadSpace = domainSpec.getThreadSpace(acc);
using IdxType = std::conditional_t<
std::is_same_v<void, T_ScalarIdxType>,
typename ALPAKA_TYPEOF(idxRange)::IdxType,
T_ScalarIdxType>;
return T_Traverse::make(
pCast<IdxType>(idxRange),
pCast<IdxType>(threadSpace),
idxMapping,
iotaCVec<
typename ALPAKA_TYPEOF(idxRange.distance())::type,
ALPAKA_TYPEOF(idxRange.distance())::dim()>());
}
};
};
} // namespace internal
namespace detail
{
template<typename T_ExtentFn>
struct IdxRangeFn
{
constexpr IdxRangeFn(T_ExtentFn const& extentFn) : m_extentFn{extentFn}
{
}
constexpr auto getIdxRange(auto const& acc) const
{
return IdxRange{m_extentFn(acc)};
}
private:
T_ExtentFn const m_extentFn;
};
template<
concepts::Origin T_Origin,
concepts::Unit T_Unit,
typename T_MultiDimensional = MultiDimensional<true>>
struct IdxRangeLazy
{
constexpr IdxRangeLazy(
T_Origin const& origin,
T_Unit const& unit,
T_MultiDimensional = T_MultiDimensional{})
{
}
constexpr auto getIdxRange(auto const& acc) const
{
auto const extent = internalCompute::GetExtentsOf::Op<ALPAKA_TYPEOF(acc), T_Origin, T_Unit>{}(
acc,
T_Origin{},
T_Unit{});
if constexpr(T_MultiDimensional::value == false)
return IdxRange{Vec{extent.product()}};
else
return IdxRange{extent};
}
};
} // namespace detail
template<typename T_WorkGroup, typename T_IdxRange>
struct DomainSpec
{
constexpr DomainSpec(T_WorkGroup const threadGroup, T_IdxRange const idxRange)
: m_threadGroup{threadGroup}
, m_idxRange{idxRange}
{
}
private:
friend internal::MakeIter;
constexpr auto getIdxRange(auto const& acc) const
{
return m_idxRange;
}
constexpr auto getIdxRange(auto const& acc) const
requires(requires { std::declval<T_IdxRange>().getIdxRange(acc); })
{
return m_idxRange.getIdxRange(acc);
}
constexpr auto getThreadSpace(auto const& acc) const
{
return m_threadGroup;
}
constexpr auto getThreadSpace(auto const& acc) const
requires(requires { std::declval<T_WorkGroup>().getThreadSpace(acc); })
{
return m_threadGroup.getThreadSpace(acc);
}
T_WorkGroup m_threadGroup;
T_IdxRange m_idxRange;
};
namespace idxTrait
{
struct TotalFrameSpecExtent
{
template<typename T_Acc>
constexpr auto operator()(T_Acc const& acc) const
{
return acc[frame::count] * acc[frame::extent];
}
};
struct FrameCount
{
template<typename T_Acc>
constexpr auto operator()(T_Acc const& acc) const
{
return acc[frame::count];
}
};
struct FrameExtent
{
template<typename T_Acc>
constexpr auto operator()(T_Acc const& acc) const
{
return acc[frame::extent];
}
};
} // namespace idxTrait
namespace range
{
constexpr auto totalFrameSpecExtent = detail::IdxRangeFn{idxTrait::TotalFrameSpecExtent{}};
constexpr auto frameCount = detail::IdxRangeFn{idxTrait::FrameCount{}};
constexpr auto frameExtent = detail::IdxRangeFn{idxTrait::FrameExtent{}};
constexpr auto threadsInGrid = detail::IdxRangeLazy{origin::grid, unit::threads};
constexpr auto blocksInGrid = detail::IdxRangeLazy{origin::grid, unit::blocks};
constexpr auto threadsInBlock = detail::IdxRangeLazy{origin::block, unit::threads};
constexpr auto linearThreadsInGrid = detail::IdxRangeLazy{origin::grid, unit::threads, linearized};
constexpr auto linearBlocksInGrid = detail::IdxRangeLazy{origin::grid, unit::blocks, linearized};
constexpr auto linearThreadsInBlock = detail::IdxRangeLazy{origin::block, unit::threads, linearized};
} // namespace range
} // namespace alpaka::onAcc
namespace alpaka::trait
{
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, onAcc::detail::IdxRangeLazy>)
struct IsLazyIndexRange<T> : std::true_type
{
};
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, onAcc::detail::IdxRangeFn>)
struct IsLazyIndexRange<T> : std::true_type
{
};
} // namespace alpaka::trait
// ==
// == ./include/alpaka/mem/Iter.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/MdSpan.hpp ==
// ==
/* Copyright 2025 René Widera, Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/DataPitches.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/CVec.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka
{
//! Calculate the pitches purely from the extents.
template<typename T_Elem, alpaka::concepts::Vector T_Vec>
constexpr auto calculatePitchesFromExtents(T_Vec const& extent)
{
constexpr auto dim = T_Vec::dim();
using type = typename T_Vec::type;
auto pitchBytes = typename T_Vec::UniVec{};
if constexpr(dim > 0)
pitchBytes.back() = static_cast<type>(sizeof(T_Elem));
if constexpr(dim > 1)
for(type i = dim - 1; i > 0; i--)
pitchBytes[i - 1] = extent[i] * pitchBytes[i];
return pitchBytes;
}
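// Worked example (a sketch): for a row-major extent {3, 5} of float (4-byte elements) the pitches in
// bytes are {5 * 4, 4} = {20, 4}; the last dimension's pitch is always sizeof(element).
//
//   auto pitches = alpaka::calculatePitchesFromExtents<float>(alpaka::Vec{3, 5}); // -> {20, 4}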
//! Calculate the pitches purely from the extents.
template<typename T_Elem, alpaka::concepts::Vector T_Vec>
requires(T_Vec::dim() >= 2)
constexpr auto calculatePitches(T_Vec const& extent, typename T_Vec::type const& rowPitchBytes)
{
constexpr auto dim = T_Vec::dim();
using type = typename T_Vec::type;
auto pitchBytes = typename T_Vec::UniVec{};
pitchBytes.back() = static_cast<type>(sizeof(T_Elem));
if constexpr(dim > 1)
pitchBytes[dim - 2u] = rowPitchBytes;
if constexpr(dim > 2)
for(type i = dim - 2; i > 0; i--)
pitchBytes[i - 1] = extent[i] * pitchBytes[i];
return pitchBytes;
}
template<typename T_Type, concepts::Vector T_Pitches>
struct DataPitches
{
using value_type = T_Type;
using index_type = typename T_Pitches::type;
static consteval uint32_t dim()
{
return T_Pitches::dim();
}
constexpr DataPitches(T_Pitches const& pitchBytes) : m_pitch(pitchBytes.eraseBack())
{
assert(pitchBytes.back() == sizeof(value_type));
}
/* Object must be initialized by copying a valid instance */
constexpr DataPitches() = default;
constexpr auto getPitches() const
{
Vec<index_type, dim()> result;
for(uint32_t d = 0u; d < dim() - 1u; ++d)
{
result[d] = m_pitch[d];
}
result.back() = static_cast<index_type>(sizeof(value_type));
return result;
}
constexpr index_type operator[](std::integral auto idx) const
{
return getPitches()[idx];
}
private:
decltype(std::declval<T_Pitches>().eraseBack()) m_pitch;
};
template<typename T_Type, typename T_IndexType, typename T_Storage>
struct DataPitches<T_Type, Vec<T_IndexType, 1u, T_Storage>>
{
using value_type = T_Type;
using index_type = T_IndexType;
static consteval uint32_t dim()
{
return 1u;
}
constexpr DataPitches([[maybe_unused]] Vec<T_IndexType, 1u> const& pitchBytes)
{
assert(pitchBytes.back() == sizeof(value_type));
}
/* Object must be initialized by copying a valid instance */
constexpr DataPitches() = default;
constexpr auto getPitches() const
{
return Vec{static_cast<index_type>(sizeof(value_type))};
}
};
} // namespace alpaka
// ==
// == ./include/alpaka/mem/DataPitches.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/MdForwardIter.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
#include <iterator>
namespace alpaka
{
/** special implementation to define the end
*
* Only a scalar value must be stored, which reduces the register footprint.
* The end is reached when the index is behind or equal to the extent of the slowest-moving dimension.
*/
template<typename T_idxType>
class MdForwardIterEnd
{
using index_type = T_idxType;
void _()
{
static_assert(std::forward_iterator<MdForwardIterEnd>);
}
public:
constexpr MdForwardIterEnd(alpaka::concepts::MdSpan auto const& mdSpan)
: m_extentSlowDim{mdSpan.getExtents()[0]}
{
}
constexpr auto operator*() const
{
return m_extentSlowDim;
}
constexpr bool operator==(MdForwardIterEnd const& other) const
{
return (m_extentSlowDim == other.m_extentSlowDim);
}
constexpr bool operator!=(MdForwardIterEnd const& other) const
{
return !(*this == other);
}
private:
index_type m_extentSlowDim;
};
template<alpaka::concepts::MdSpan T_MdSpan>
ALPAKA_FN_HOST_ACC MdForwardIterEnd(T_MdSpan const&) -> MdForwardIterEnd<typename T_MdSpan::index_type>;
template<alpaka::concepts::MdSpan T_MdSpan>
class MdForwardIter
{
using index_type = typename T_MdSpan::index_type;
friend class MdForwardIterEnd<index_type>;
static constexpr uint32_t iterDim = T_MdSpan::dim();
using IterIdxVecType = Vec<index_type, iterDim>;
void _()
{
static_assert(std::forward_iterator<MdForwardIter>);
static_assert(std::input_or_output_iterator<MdForwardIter>);
}
public:
constexpr MdForwardIter(T_MdSpan const& mdSpan) : m_mdSpan(mdSpan), m_current{IterIdxVecType::all(0u)}
{
}
ALPAKA_FN_ACC constexpr index_type slowCurrent() const
{
return m_current[0];
}
constexpr decltype(auto) operator*() const
{
return m_mdSpan[m_current];
}
constexpr decltype(auto) operator*()
{
return m_mdSpan[m_current];
}
// pre-increment the iterator
ALPAKA_FN_ACC inline MdForwardIter& operator++()
{
for(uint32_t d = 0; d < iterDim; ++d)
{
uint32_t const idx = iterDim - 1u - d;
m_current[idx] += index_type{1u};
if constexpr(iterDim != 1u)
{
if(idx >= 1u && m_current[idx] >= m_mdSpan.getExtents()[idx])
{
m_current[idx] = index_type{0u};
}
else
break;
}
}
return *this;
}
// post-increment the iterator
ALPAKA_FN_ACC inline MdForwardIter operator++(int)
{
MdForwardIter old = *this;
++(*this);
return old;
}
constexpr bool operator==(MdForwardIter const& other) const
{
return (m_current == other.m_current);
}
constexpr bool operator!=(MdForwardIter const& other) const
{
return !(*this == other);
}
private:
T_MdSpan m_mdSpan;
IterIdxVecType m_current;
};
template<typename T_MdSpan>
constexpr bool operator==(
MdForwardIter<T_MdSpan> const& mdIter,
MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd)
{
return (*mdIteratorEnd <= mdIter.slowCurrent());
}
template<typename T_MdSpan>
constexpr bool operator!=(
MdForwardIter<T_MdSpan> const& mdIter,
MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd)
{
return !(mdIteratorEnd == mdIter);
}
template<typename T_MdSpan>
constexpr bool operator==(
MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd,
MdForwardIter<T_MdSpan> const& mdIter)
{
return (*mdIteratorEnd <= mdIter.slowCurrent());
}
template<typename T_MdSpan>
constexpr bool operator!=(
MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd,
MdForwardIter<T_MdSpan> const& mdIter)
{
return !(mdIteratorEnd == mdIter);
}
} // namespace alpaka
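// A minimal usage sketch (assumption: 'span' is an object fulfilling alpaka::concepts::MdSpan):
//
//   auto end = alpaka::MdForwardIterEnd{span};
//   for(auto it = alpaka::MdForwardIter{span}; it != end; ++it)
//       *it = 0; // visits the elements in row-major order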
// ==
// == ./include/alpaka/mem/MdForwardIter.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/concepts/types.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
#include <type_traits>
namespace alpaka::concepts
{
/** Concept to check if the given type is a C static array.
*/
template<typename T>
concept CStaticArray = std::is_array_v<T>;
/** Concept to check if the given type is a reference, using std::is_reference
*/
template<typename T>
concept Reference = std::is_reference_v<T>;
} // namespace alpaka::concepts
// ==
// == ./include/alpaka/concepts/types.hpp ==
// ============================================================================
// #include <concepts> // amalgamate: file already included
namespace alpaka::internal
{
/** Get the element type without cv qualifier or static dimension from a value or reference type T.
*
* @example
* int const -> int
* int const & -> int
* int -> int
* int const (&)[2][2] -> int
*
*/
template<typename T>
struct GetElementType
{
/** The trait GetElementType removes an optional reference and NonRefType removes the cv-qualifiers.
Two nested traits are required because we need the specialization for C static arrays. */
template<typename U>
struct NonRefType
{
using type = std::decay_t<U>;
};
template<alpaka::concepts::CStaticArray U>
struct NonRefType<U>
{
using type = typename std::remove_all_extents_t<std::remove_cv_t<U>>;
};
using type = typename NonRefType<std::remove_reference_t<T>>::type;
static constexpr bool is_const = std::is_const_v<std::remove_reference_t<T>>;
};
template<typename T>
using GetElementType_t = typename GetElementType<T>::type;
namespace concepts
{
/** Concept to restrict copy or move constructor of a DataSource which creates a new object with a different
* inner type.
*
* @tparam T_Type element type of the new object
* @tparam T_Type_Other element type of the object which is copied or moved
*
* @details
* Needs to fulfill the following requirements
* - the datatype without cv-qualifier needs to be the same
* - the following const/mutable conversions are allowed
* - mutable -> mutable
* - const -> const
* - mutable -> const
*/
template<typename T_Type, typename T_Type_Other>
concept InnerTypeAllowedCast = requires {
/// the value type without cv-qualifier needs to be the same
requires std::same_as<GetElementType_t<T_Type>, GetElementType_t<T_Type_Other>>;
/// check the correct cast of a const/mutable inner type to another const/mutable inner type
requires !(GetElementType<T_Type_Other>::is_const && !GetElementType<T_Type>::is_const);
};
} // namespace concepts
} // namespace alpaka::internal
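// Illustrative checks of the allowed element-type conversions (a sketch, not part of the library):
//
//   static_assert(alpaka::internal::concepts::InnerTypeAllowedCast<int, int>);        // mutable -> mutable
//   static_assert(alpaka::internal::concepts::InnerTypeAllowedCast<int const, int>);  // mutable -> const
//   static_assert(!alpaka::internal::concepts::InnerTypeAllowedCast<int, int const>); // const -> mutable rejected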
// ==
// == ./include/alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp ==
// ============================================================================
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/interface.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/DeviceSelector.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/Device.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/Event.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
namespace alpaka::onHost
{
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Device;
template<typename T_Device>
struct Event;
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Event<Device<T_Api, T_DeviceKind>>
{
private:
using DeviceInterface = Device<T_Api, T_DeviceKind>;
using EventHandle = ALPAKA_TYPEOF(
internal::MakeEvent::Op<ALPAKA_TYPEOF(*std::declval<DeviceInterface>().get())>{}(
*std::declval<DeviceInterface>().get()));
EventHandle m_event;
public:
using element_type = typename EventHandle::element_type;
template<typename T_Event>
Event(Handle<T_Event>&& event) : m_event{std::forward<Handle<T_Event>>(event)}
{
}
auto* get() const
{
return m_event.get();
}
constexpr auto getApi() const
{
return alpaka::internal::getApi(*m_event.get());
}
std::string getName() const
{
return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_event.get())>>{}(*m_event.get());
}
[[nodiscard]] auto getNativeHandle() const
{
return internal::getNativeHandle(*m_event.get());
}
bool operator==(Event const& other) const
{
return this->get() == other.get();
}
bool operator!=(Event const& other) const
{
return this->get() != other.get();
}
/** Get the device of this event
*
* @return the device of this event
*/
auto getDevice() const
{
return Device<T_Api, T_DeviceKind>{internal::getDevice(*m_event.get())};
}
bool isComplete() const
{
return alpaka::onHost::internal::isEventComplete(*m_event.get());
}
};
template<typename T_Event>
Event(Handle<T_Event>&&) -> Event<Device<
ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Event>())),
ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Event>()))>>;
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/Event.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onHost/Queue.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/executor.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Event.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
namespace alpaka::onHost
{
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Device;
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
struct Queue;
template<
alpaka::concepts::Api T_Api,
alpaka::concepts::DeviceKind T_DeviceKind,
alpaka::concepts::QueueKind T_QueueKind>
struct Queue<Device<T_Api, T_DeviceKind>, T_QueueKind>
{
private:
using DeviceInterface = Device<T_Api, T_DeviceKind>;
using QueueHandle = ALPAKA_TYPEOF(
internal::MakeQueue::Op<ALPAKA_TYPEOF(*std::declval<DeviceInterface>().get()), T_QueueKind>{}(
*std::declval<DeviceInterface>().get(),
T_QueueKind{}));
QueueHandle m_queue;
public:
using element_type = typename QueueHandle::element_type;
template<typename T_Queue>
Queue(Handle<T_Queue>&& queue, T_QueueKind) : m_queue{std::forward<Handle<T_Queue>>(queue)}
{
}
auto* get() const
{
return m_queue.get();
}
constexpr auto getApi() const
{
return alpaka::internal::getApi(*m_queue.get());
}
constexpr auto getQueueKind() const
{
return T_QueueKind{};
}
void _()
{
static_assert(internal::concepts::Queue<element_type>);
}
std::string getName() const
{
return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_queue.get())>>{}(*m_queue.get());
}
[[nodiscard]] auto getNativeHandle() const
{
return internal::getNativeHandle(*m_queue.get());
}
bool operator==(Queue const& other) const
{
return this->get() == other.get();
}
bool operator!=(Queue const& other) const
{
return this->get() != other.get();
}
/** Get the device of this queue
*
* @return the device of this queue
*/
auto getDevice() const
{
return Device<T_Api, T_DeviceKind>{internal::getDevice(*m_queue.get())};
}
/** Enqueue a kernel functor to a queue
*
* @param executor description how native worker threads will be mapped and grouped to compute grid layers
* @param f the compute kernel functor
* @param args arguments to forwarded to the kernel functor
*/
void enqueue(
auto const executor,
onHost::concepts::ThreadOrFrameSpec auto const& blockCfg,
auto const& f,
auto&&... args) const
{
return internal::enqueue(
*m_queue.get(),
std::move(executor),
blockCfg,
KernelBundle{f, onHost::makeAccessibleOnAcc(ALPAKA_FORWARD(args))...});
}
/** Enqueue a kernel to a queue
*
* @param specification thread or frame specification which provides a chunked description of the thread or
* frame index domain
* @param kernelBundle the compute kernel and its arguments
*
* An available default executor will be selected automatically. The default executor is the executor with the
* most parallelism/performance.
*/
template<typename TKernelFn, typename... TArgs>
void enqueue(
onHost::concepts::ThreadOrFrameSpec auto const& specification,
KernelBundle<TKernelFn, TArgs...> const& kernelBundle) const
{
auto executor = supportedExecutors(internal::getDevice(*m_queue.get()), exec::allExecutors);
internal::enqueue(*m_queue.get(), std::get<0>(executor), specification, kernelBundle);
}
/**
* @param executor description how native worker threads will be mapped and grouped to compute grid layers
* (blocks, threads).
*/
void enqueue(
alpaka::concepts::Executor auto const executor,
onHost::concepts::ThreadOrFrameSpec auto const& specification,
alpaka::concepts::KernelBundle auto const& kernelBundle) const
{
internal::enqueue(*m_queue.get(), executor, specification, kernelBundle);
}
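// A minimal usage sketch for the enqueue overloads above (assumptions: 'queue', 'spec', 'kernel',
// 'data' and 'someExecutor' are placeholders provided by the application):
//
//   queue.enqueue(someExecutor, spec, kernel, data); // explicit executor
//   queue.enqueue(spec, KernelBundle{kernel, data}); // executor selected automatically
//   queue.enqueue([] { /* host-side task */ });      // host task, must not capture the queue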
/** Enqueue an operation which is executed on the host side
*
* @attention Do NOT enqueue a task which captures the queue internally to keep the queue alive, this could
* lead to deadlocks. Do NOT capture a @see ManagedView because view actions could perform blocking operations,
* e.g. onHost::wait() in the destructor, which could lead to deadlocks too.
*
* @param task task to be executed on the host side
*/
void enqueue(auto const& task) const
{
return internal::Enqueue::Task<std::decay_t<decltype(*m_queue.get())>, std::decay_t<decltype(task)>>{}(
*m_queue.get(),
task);
}
void enqueue(Event<Device<T_Api, T_DeviceKind>> const& event) const
{
return internal::Enqueue::Event<ALPAKA_TYPEOF(*m_queue.get()), ALPAKA_TYPEOF(*event.get())>{}(
*m_queue.get(),
*event.get());
}
void waitFor(Event<Device<T_Api, T_DeviceKind>> const& event) const
{
return internal::waitFor(*m_queue.get(), *event.get());
}
};
template<typename T_Queue, alpaka::concepts::QueueKind T_QueueKind>
Queue(Handle<T_Queue>&&, T_QueueKind) -> Queue<
Device<
ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Queue>())),
ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Queue>()))>,
T_QueueKind>;
/** @{
* @name Memory modifiers
*
* @attention For input/output memory the caller must ensure that the memory stays valid until the operation has
* completed, not only until control is returned to the caller, because alpaka does not extend the lifetime of
* the memory until the operation is finished.
*/
/** copy data byte wise from one to another container
*
* @param queue the copy will be executed after all previous work in this queue is finished
* @param[in,out] dest can be a container/view where the data should be written to
* @param[in] source can be a container/view from which the data will be copied
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void memcpy(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, auto const& source)
{
memcpy(queue, ALPAKA_FORWARD(dest), source, internal::getExtents(dest));
}
/** copy data byte wise from one to another container
*
* @param queue the copy will be executed after all previous work in this queue is finished
* @param[in,out] dest can be a container/view where the data should be written to
* @param[in] source can be a container/view from which the data will be copied
* @param extents M-dimensional data extents in elements, can be smaller than the container capacity
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void memcpy(
Queue<T_Device, T_QueueKind> const& queue,
auto&& dest,
auto const& source,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
internal::Memcpy::Op<
std::decay_t<decltype(*queue.get())>,
std::decay_t<decltype(dest)>,
std::decay_t<decltype(source)>,
std::decay_t<decltype(extentsVec)>>{}(*queue.get(), ALPAKA_FORWARD(dest), source, extentsVec);
}
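// A minimal usage sketch (assumptions: 'queue' is a Queue, 'devBuf' and 'hostBuf' are compatible
// containers/views; the element count 128 is illustrative):
//
//   onHost::memcpy(queue, devBuf, hostBuf);                    // extents derived from 'dest'
//   onHost::memcpy(queue, devBuf, hostBuf, alpaka::Vec{128u}); // copy only the first 128 elements
//   // 'hostBuf' and 'devBuf' must stay valid until the copy has completed in 'queue'.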
/** fill memory byte wise
*
* @param[in,out] dest can be a container/view where the data should be written to
* The caller must ensure that the memory stays valid until the operation has completed, not only until
* control is returned to the caller, because alpaka does not extend the lifetime of the memory until the
* operation is finished.
* @param byteValue value to be written to each byte
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void memset(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, uint8_t byteValue)
{
memset(queue, ALPAKA_FORWARD(dest), byteValue, internal::getExtents(dest));
}
/** fill memory byte wise
*
* @param[in,out] dest can be a container/view where the data should be written to
* The caller must ensure that the memory stays valid until the operation has completed, not only until
* control is returned to the caller, because alpaka does not extend the lifetime of the memory until the
* operation is finished.
* @param byteValue value to be written to each byte
* @param extents M-dimensional data extents in elements, can be smaller than the container capacity
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void memset(
Queue<T_Device, T_QueueKind> const& queue,
auto&& dest,
uint8_t byteValue,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
internal::Memset::Op<
std::decay_t<decltype(*queue.get())>,
std::decay_t<decltype(dest)>,
std::decay_t<decltype(extentsVec)>>{}(*queue.get(), ALPAKA_FORWARD(dest), byteValue, extentsVec);
}
/** fill memory element wise
*
* @param[in,out] dest can be a container/view where the data should be written to
* The caller must ensure that the memory stays valid until the operation has completed, not only until
* control is returned to the caller, because alpaka does not extend the lifetime of the memory until the
* operation is finished.
* @param elementValue value to be written to each element
*/
template<typename T_Value, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void fill(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, T_Value elementValue) requires(
std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
&& std::same_as<ALPAKA_TYPEOF(alpaka::internal::getApi(queue)), ALPAKA_TYPEOF(alpaka::internal::getApi(dest))>)
{
fill(queue, ALPAKA_FORWARD(dest), elementValue, internal::getExtents(dest));
}
/** fill memory element wise
*
* @param[in,out] dest can be a container/view where the data should be written to
* The caller must ensure that the memory stays valid until the operation has completed, not only until
* control is returned to the caller, because alpaka does not extend the lifetime of the memory until the
* operation is finished.
* @param elementValue value to be written to each element
* @param extents M-dimensional data extents in elements, can be smaller than the container capacity
*/
template<typename T_Value, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline void fill(
Queue<T_Device, T_QueueKind> const& queue,
auto&& dest,
T_Value elementValue,
alpaka::concepts::VectorOrScalar auto const& extents)
requires(
std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
&& std::
same_as<ALPAKA_TYPEOF(alpaka::internal::getApi(queue)), ALPAKA_TYPEOF(alpaka::internal::getApi(dest))>)
{
Vec const extentsVec = extents;
internal::Fill::Op<
ALPAKA_TYPEOF(*queue.get()),
ALPAKA_TYPEOF(dest),
ALPAKA_TYPEOF(elementValue),
ALPAKA_TYPEOF(extentsVec)>{}(*queue.get(), ALPAKA_FORWARD(dest), elementValue, extentsVec);
}
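    /* Minimal usage sketch for memset/fill (illustrative; assumes `queue` belongs to the same api as the destination
     * and `devView` is a view of float elements):
     *
     *   onHost::memset(queue, devView, 0u);  // set every byte of the destination to zero
     *   onHost::fill(queue, devView, 42.0f); // write the element value 42.0f to every element
     *   onHost::wait(queue);                 // results are guaranteed to be visible after the queue finished
     */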
/** @} */
/** @{
* @name Deferred device allocations
*/
/** allocate memory that is accessible after it is processed in the queue
*
     * Deferred allocation means that the pointer in the returned buffer is valid as soon as the function returns.
* It is allowed to slice the buffer or use the encapsulated pointer for address calculations.
     * In any case the pointer must not be dereferenced before the memory allocation has been processed in the
     * queue. All tasks performing any memory access must be executed after the memory allocation is processed in the
     * queue. This can be achieved by waiting on the queue or by describing queue dependencies via @c waitFor(). The
     * memory may be used in other queues too. To avoid that a view to the memory is still in use during the
     * deallocation, you can use @see addDestructorAction() and wait for a queue if it **differs** from the queue used
     * for the allocation.
*
     * @attention The function is allowed to block the caller until the memory has been created.
*
* @tparam T_Type type of the data elements
* @param queue queue handle
* @param extents number of elements for each dimension
     * @return Shared buffer to the allocated memory. The buffer will be freed after the last reference to the
     *         memory is destroyed. The deallocation is performed asynchronously in the queue used for the
     *         allocation.
*/
template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline auto allocDeferred(
Queue<T_Device, T_QueueKind> const& queue,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
return internal::AllocDeferred::Op<T_Type, std::decay_t<decltype(*queue.get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*queue.get(),
extentsVec);
}
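    /* Minimal usage sketch for allocDeferred (illustrative; assumes `queue` was created beforehand):
     *
     *   auto buf = onHost::allocDeferred<float>(queue, Vec{1024u});
     *   // `buf` can immediately be sliced or used for address calculations, but it must not be dereferenced before
     *   // the allocation has been processed in `queue`, e.g. enforced via onHost::wait(queue).
     */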
/** allocate memory that is accessible after it is processed in the queue
*
     * In any case the pointer must not be dereferenced before the memory allocation has been processed in the
     * queue. All tasks performing any memory access must be executed after the memory allocation is processed in the
     * queue. This can be achieved by waiting on the queue or by describing queue dependencies via @c waitFor(). The
     * memory may be used in other queues too. To avoid that a view to the memory is still in use during the
     * deallocation, you can use @see addDestructorAction() and wait for a queue if it **differs** from the queue used
     * for the allocation.
*
     * @attention The function is allowed to block the caller until the memory has been created.
*
* @param queue queue handle
* @param[in] view other memory where the extents will be derived from
     * @return Shared buffer to the allocated memory. The buffer will be freed after the last reference to the
     *         memory is destroyed. The deallocation is performed asynchronously in the queue used for the
     *         allocation.
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline auto allocLikeAsync(Queue<T_Device, T_QueueKind> const& queue, auto const& view)
{
return allocDeferred<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>>(queue, getExtents(view));
}
/** @} */
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/Queue.hpp ==
// ============================================================================
// #include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <bit> // amalgamate: file already included
// #include <climits> // amalgamate: file already included
namespace alpaka::onHost
{
/** @brief Description of a specific device that one can schedule kernels on.
*
* @details
     * A device is the combination of an alpaka::concepts::DeviceKind and an alpaka::concepts::Api,
* representing an entity that one can schedule work on.
*
* @tparam T_Api The Api powering this device.
* @tparam T_DeviceKind The kind of device it is.
*/
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct Device
{
private:
using PlatformHandle = ALPAKA_TYPEOF(internal::makePlatform(T_Api{}, T_DeviceKind{}));
using DeviceHandle = ALPAKA_TYPEOF(
internal::MakeDevice::Op<typename PlatformHandle::element_type>{}(
*std::declval<PlatformHandle>().get(),
0u));
DeviceHandle m_device;
public:
friend struct alpaka::internal::GetName;
friend struct internal::GetNativeHandle;
using element_type = typename DeviceHandle::element_type;
auto get() const
{
return m_device.get();
}
template<typename T_Device>
Device(Handle<T_Device>&& internalDeviceHandle)
: m_device{std::forward<Handle<T_Device>>(internalDeviceHandle)}
{
}
void _()
{
static_assert(internal::concepts::Device<element_type>);
}
std::string getName() const
{
return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_device.get())>>{}(*m_device.get());
}
[[nodiscard]] auto getNativeHandle() const
{
return internal::getNativeHandle(*m_device.get());
}
bool operator==(Device const& other) const
{
return this->get() == other.get();
}
bool operator!=(Device const& other) const
{
return this->get() != other.get();
}
/** Create a queue for this device.
*
* @attention If you call this method multiple times it is allowed that you always get the same handle
* back. There is no guarantee that you will get independent queues.
*
* Enqueuing tasks into two different queues does not guarantee that these tasks run in parallel.
         * Running tasks from different queues sequentially is valid behavior. Enqueuing into two individual queues
         * only signifies that the tasks are independent of each other and that their order of execution is
         * unspecified.
*
* @param kind
* Blocking behaviour:
* - queueKind::nonBlocking (default): enqueue returns immediately; completion of the enqueued operation
* must be ensured via onHost::wait(queue) or by enqueuing dependent operations onto the same queue.
* - queueKind::blocking: each enqueue only returns after the operation is complete and its effects are
* host-visible.
*
* @return A onHost::Queue that tasks and memory operations can be enqueued on.
*/
auto makeQueue(alpaka::concepts::QueueKind auto kind)
{
return Queue{
internal::MakeQueue::Op<ALPAKA_TYPEOF(*m_device.get()), ALPAKA_TYPEOF(kind)>{}(*m_device.get(), kind),
kind};
}
auto makeQueue()
{
return makeQueue(queueKind::nonBlocking);
}
auto makeEvent()
{
return Event{internal::MakeEvent::Op<std::decay_t<decltype(*m_device.get())>>{}(*m_device.get())};
}
/** Blocks the caller until the given handle executes all work
*/
void wait()
{
return internal::wait(*m_device.get());
}
/** Properties of a given device
*
         * @attention Currently only a handful of entries is available. The object will be refactored soon and will
         *            most likely become a compile-time dictionary to support optional entries.
*/
inline DeviceProperties getDeviceProperties() const
{
return internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(*m_device.get())>{}(*m_device.get());
}
constexpr auto getDeviceKind() const
{
return T_DeviceKind{};
}
};
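    /* Minimal usage sketch for a Device (illustrative; assumes `device` was obtained from a DeviceSelector, see
     * onHost::makeDeviceSelector() below):
     *
     *   std::string const name = device.getName();
     *   auto queue = device.makeQueue(queueKind::blocking); // or device.makeQueue() for a non-blocking queue
     *   auto props = device.getDeviceProperties();
     *   device.wait(); // block until all work submitted to the device has finished
     */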
namespace concepts
{
/** @brief Concept to check if something is a device.
*
* @details
* This concept checks for specializations of alpaka::onHost::Device. For more information on devices in
* alpaka, refer to the class documentation.
*/
template<typename T_Device>
concept Device = alpaka::isSpecializationOf_v<T_Device, onHost::Device>;
} // namespace concepts
template<typename T_Device>
Device(Handle<T_Device>&&) -> Device<
ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Device>())),
ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Device>()))>;
/** @{
* @name Device allocations
*/
/** Allocate memory on the given device
*
* @tparam T_Type type of the data elements
* @param device device handle
* @param extents number of elements for each dimension
* @return memory owning view to the allocated memory
*/
template<typename T_Type>
inline auto alloc(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
return internal::Alloc::Op<T_Type, std::decay_t<decltype(*device.get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*device.get(),
extentsVec);
}
/** Allocate memory on the given device with unified virtual memory
*
     * This memory can be accessed from all devices with the same Api and device kind. Depending on the backend (e.g.
     * OneApi), the memory can also be accessed by devices of another kind if they use the same native context. The
     * data must not be accessed on two devices at the same time; this has to be prevented by explicit
     * synchronization.
* Unified memory follows the rules of UVM memory of the device backend e.g. CUDA, HIP, ...
*
* @tparam T_Type type of the data elements
* @param device device handle
* @param extents number of elements for each dimension
* @return Managed view to the allocated memory
*/
template<typename T_Type>
inline auto allocUnified(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
return internal::AllocUnified::Op<T_Type, std::decay_t<decltype(*device.get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*device.get(),
extentsVec);
}
/** Allocates unified memory on the device associated with the given queue.
*
     * This memory can be accessed from all devices with the same Api and device kind. Depending on the backend (e.g.
     * OneApi), the memory can also be accessed by devices of another kind if they use the same native context. The
     * data must not be accessed on two devices at the same time; this has to be prevented by explicit
     * synchronization.
* Unified memory follows the rules of UVM memory of the device backend e.g. CUDA, HIP, ...
*
* @tparam T_Type type of the data elements
* @param queue queue handle
* @param extents number of elements for each dimension
*/
template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline auto allocUnified(
Queue<T_Device, T_QueueKind> const& queue,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
return internal::AllocUnified::
Op<T_Type, std::decay_t<decltype(*queue.getDevice().get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*queue.getDevice().get(),
extentsVec);
}
/** Allocate pinned memory on the host which is mapped into the address space of the device
*
     * Mapped memory is located on the host and is transferred for each access via the PCIe/NVLink bus. The
     * performance when accessed from the device is therefore mostly poor. Mapped memory should be used for host
     * memory if you transfer data between host and device via `onHost::memcpy()` because the transfer will be
     * optimized for latency and performance.
*
* @tparam T_Type type of the data elements
* @param device device handle
* @param extents number of elements for each dimension
*/
template<typename T_Type>
inline auto allocMapped(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec const extentsVec = extents;
return internal::AllocMapped::Op<T_Type, std::decay_t<decltype(*device.get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*device.get(),
extentsVec);
}
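    /* Minimal usage sketch for the device allocation functions (illustrative; assumes `device` is an onHost::Device
     * whose backend supports the respective memory kinds):
     *
     *   auto devBuf     = onHost::alloc<float>(device, Vec{2u, 1024u});    // device memory, 2 x 1024 elements
     *   auto unifiedBuf = onHost::allocUnified<float>(device, Vec{1024u}); // unified (UVM-style) memory
     *   auto mappedBuf  = onHost::allocMapped<float>(device, Vec{1024u});  // pinned host memory, device mapped
     */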
/** Allocate pinned memory on the host which is mapped into the address space of the device
*
     * Mapped memory is located on the host and is transferred for each access via the PCIe/NVLink bus. The
     * performance when accessed from the device is therefore mostly poor. Mapped memory should be used for host
     * memory if you transfer data between host and device via `onHost::memcpy()` because the transfer will be
     * optimized for latency and performance.
*
* @tparam T_Type type of the data elements
* @param queue queue handle
* @param extents number of elements for each dimension
*/
template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline auto allocMapped(
Queue<T_Device, T_QueueKind> const& queue,
alpaka::concepts::VectorOrScalar auto const& extents)
{
return allocMapped<T_Type>(queue.getDevice(), extents);
}
/** Allocate memory on the given device based on a view
*
* Derives type and extents of the memory from the view.
* The content of the memory is NOT copied to the created allocated memory.
*
* @param device device handle
* @param[in] view memory where properties will be derived from
*
* @return memory owning view to the allocated memory
*/
inline auto allocLike(concepts::Device auto const& device, auto const& view)
{
return alloc<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>>(device, getExtents(view));
}
///@}
/** Check if the given view is accessible on the given device
*
* @param device device handle
* @param view memory where properties will be derived from
* @return true if the view is accessible on the device, false otherwise.
     * alpaka cannot detect all memory access types, therefore the result can be false even if the memory is
     * accessible, e.g. because the view was allocated with a UVM allocator.
*
*/
inline bool isDataAccessible(concepts::Device auto const& device, alpaka::concepts::View auto const& view)
{
return internal::IsDataAccessible::FirstPath<ALPAKA_TYPEOF(*device.get()), ALPAKA_TYPEOF(view)>{}(
*device.get(),
view)
|| internal::IsDataAccessible::SecondPath<
ALPAKA_TYPEOF(getApi(view)),
ALPAKA_TYPEOF(getDeviceKind(device)),
ALPAKA_TYPEOF(view)>{}(getApi(view), getDeviceKind(device), view);
}
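    /* Minimal usage sketch for isDataAccessible (illustrative; assumes `device` is an onHost::Device and `view`
     * satisfies alpaka::concepts::View):
     *
     *   bool const accessible = onHost::isDataAccessible(device, view);
     *   // a negative result is conservative: the memory might still be accessible, e.g. for UVM allocations
     */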
/** Check if the given view is accessible on the device of the given queue
*
* @param queue queue handle
*/
template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
inline bool isDataAccessible(Queue<T_Device, T_QueueKind> const& queue, alpaka::concepts::View auto const& view)
{
return internal::IsDataAccessible::FirstPath<ALPAKA_TYPEOF(*queue.getDevice().get()), ALPAKA_TYPEOF(view)>{}(
*queue.getDevice().get(),
view)
|| internal::IsDataAccessible::SecondPath<
ALPAKA_TYPEOF(getApi(view)),
ALPAKA_TYPEOF(getDeviceKind(queue.getDevice())),
ALPAKA_TYPEOF(view)>{}(getApi(view), getDeviceKind(queue.getDevice()), view);
}
/** Provides a frame specification to operate on a given index range
*
* The frame specification will be optimized for SIMD executions in the highest dimension.
*
     * @tparam T_DataType data element type used to derive the number of elements processed per thread
     * @param device device for which the frame specification is created
     * @param extents size of the index range
* @return frame specification
*/
template<typename T_DataType, typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
inline constexpr auto getFrameSpec(onHost::Device<T_Api, T_DeviceKind> const& device, auto&& extents)
{
using ExtentVecType = ALPAKA_TYPEOF(extents);
using IndexType = alpaka::trait::GetValueType_t<ExtentVecType>;
auto props = device.getDeviceProperties();
IndexType warpSize = static_cast<IndexType>(props.m_warpSize);
// try to create a specification with a frame size of 512 elements
IndexType numFrameElemets = 512;
// avoid non-power of two values
auto fastDimensionValue = roundDownToPowerOfTwo(std::min(warpSize, extents.x()));
auto frameExtents = ExtentVecType::all(1).rAssign(fastDimensionValue);
numFrameElemets /= frameExtents.x();
// distribute remainder frame elements
while(numFrameElemets > IndexType{1})
{
uint32_t maxIdx = ExtentVecType::dim() - 1u;
IndexType maxValue = 0;
for(auto i = 0u; i < ExtentVecType::dim(); ++i)
{
auto v = extents[i] / frameExtents[i] / IndexType{2};
if(maxValue < v)
{
maxIdx = i;
maxValue = v;
}
}
            // apply the change only if we do not oversubscribe the extents
auto v = extents[maxIdx] / frameExtents[maxIdx] / IndexType{2};
if(v >= IndexType{1})
frameExtents[maxIdx] *= IndexType{2};
else
break;
numFrameElemets /= IndexType{2};
}
IndexType elementsPerFrameItem = static_cast<IndexType>(getNumElemPerThread<T_DataType>(device));
alpaka::concepts::Vector auto numFrames
= divExZero(extents, frameExtents * frameExtents.all(1).rAssign(elementsPerFrameItem));
// The frame specification is not required to be a multiple of the extent, it can be smaller.
auto frameSpec = onHost::FrameSpec{numFrames, frameExtents};
return frameSpec;
}
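    /* Minimal usage sketch for getFrameSpec (illustrative; assumes `device` is an onHost::Device and the kernel
     * processes float elements over a 2D index range):
     *
     *   auto frameSpec = onHost::getFrameSpec<float>(device, Vec{512u, 2048u});
     *   // frameSpec holds the number of frames and the frame extents; the frame extents are not required to
     *   // evenly divide the given extents.
     */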
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/Device.hpp ==
// ============================================================================
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
namespace alpaka::onHost
{
/** @brief Concept for a combination of an API and device kind
*
* @details
* A device specification means the combination of an API and a device kind. Multiple instances of
* alpaka::onHost::Device can exist for the same device specification, for example in the form of multiple GPUs of
* the same type in one system.
*
* To check whether a specific combination is valid, i.e., whether an API can target a device kind, the static
* isValid() method can be used.
*/
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct DeviceSpec
{
public:
constexpr DeviceSpec(T_Api api, T_DeviceKind deviceType) : m_api(api), m_deviceType(deviceType)
{
}
constexpr DeviceSpec() = default;
constexpr T_DeviceKind getDeviceKind() const
{
return m_deviceType;
}
constexpr T_Api getApi() const
{
return m_api;
}
std::string getName() const
{
return m_api.getName() + " " + m_deviceType.getName();
}
/** Checks if the device kind and api combination is valid
*
         * Reasons why a combination is invalid can be that the api does not know how to talk to the device or that
         * the required dependencies e.g. CUDA, HIP, or OneApi are not fulfilled.
*
* @return true if the device kind and api combination is valid, else false
*/
static constexpr bool isValid()
{
return trait::IsDeviceSupportedBy::Op<T_DeviceKind, T_Api>::value;
}
private:
T_Api m_api;
T_DeviceKind m_deviceType;
};
template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
struct DeviceSelector
{
public:
static_assert(
DeviceSpec<T_Api, T_DeviceKind>::isValid(),
"Invalid combination of device kind and api. The api does not know how to talk to the device or the "
"required dependencies to enable the api are not fulfilled.");
constexpr DeviceSelector(DeviceSpec<T_Api, T_DeviceKind> deviceSpec)
: m_platform(internal::makePlatform(deviceSpec.getApi(), deviceSpec.getDeviceKind()))
, m_deviceSpec(deviceSpec)
{
}
constexpr DeviceSelector(T_Api api, T_DeviceKind devType) : DeviceSelector(DeviceSpec{api, devType})
{
}
uint32_t getDeviceCount() const
{
return internal::GetDeviceCount::Op<ALPAKA_TYPEOF(*m_platform.get())>{}(*m_platform.get());
}
bool isAvailable() const
{
return getDeviceCount() != 0;
}
DeviceProperties getDeviceProperties(uint32_t idx) const
{
return internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(*m_platform.get())>{}(*m_platform.get(), idx);
}
/** Get a device
*
         * @param idx device index in the range [0; number of devices); an invalid index will throw an exception
* @return @see onHost::Device
*/
auto makeDevice(uint32_t idx)
{
return Device{internal::MakeDevice::Op<ALPAKA_TYPEOF(*m_platform.get())>{}(*m_platform.get(), idx)};
}
private:
ALPAKA_TYPEOF(internal::makePlatform(T_Api{}, T_DeviceKind{})) m_platform;
DeviceSpec<T_Api, T_DeviceKind> m_deviceSpec;
};
    /** create an object to get access to devices */
template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
inline auto makeDeviceSelector(DeviceSpec<T_Api, T_DeviceKind> deviceSpec)
{
return DeviceSelector{deviceSpec};
}
inline auto makeDeviceSelector(alpaka::concepts::Api auto api, alpaka::concepts::DeviceKind auto deviceTag)
{
return DeviceSelector{api, deviceTag};
}
template<typename deferEvaluation = void>
inline auto makeHostDevice()
{
return DeviceSelector{
std::conditional_t<std::is_same_v<deferEvaluation, bool>, api::Host, api::Host>{},
deviceKind::cpu}
.makeDevice(0);
}
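    /* Minimal usage sketch for device selection (illustrative; api::Host / deviceKind::cpu are used here, other apis
     * such as api::cuda or api::hip can be selected in the same way if the build enables them):
     *
     *   auto selector = onHost::makeDeviceSelector(DeviceSpec{api::Host{}, deviceKind::cpu});
     *   if(selector.isAvailable())
     *   {
     *       auto device = selector.makeDevice(0u);
     *       auto queue = device.makeQueue();
     *   }
     *   // the host controller device is also available via the shortcut onHost::makeHostDevice()
     */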
namespace concepts
{
/** Concept to check for specializations of alpaka::onHost::DeviceSpec
*/
template<typename T>
concept DeviceSpec = isSpecializationOf_v<T, onHost::DeviceSpec>;
} // namespace concepts
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/DeviceSelector.hpp ==
// ============================================================================
// #include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
/** Functionality which is usable on the host CPU controller thread. */
namespace alpaka::onHost
{
/** @{
* @name Query extents
*/
/** Object extents
*
* @param any can be a std::vector, std::array, ...
* @return the extents of the object
*/
inline decltype(auto) getExtents(auto&& any)
{
return internal::getExtents(ALPAKA_FORWARD(any));
}
/** Handle extents
*
     * @param handle can be a view or a data handle
* @return the extents of the object
*/
inline decltype(auto) getExtents(alpaka::concepts::HasGet auto&& handle)
{
return internal::getExtents(*handle.get());
}
/** @} */
/** @{
* @name Query multi-dimensional pitches
*/
/** Object pitches
*
* @param any can be a std::vector, std::array, ...
     * @return the pitches of the object
*/
inline decltype(auto) getPitches(auto&& any)
{
return internal::getPitches(ALPAKA_FORWARD(any));
}
/** Handle pitches
*
     * @param handle can be a view or a data handle
     * @return the pitches of the object
*/
inline decltype(auto) getPitches(alpaka::concepts::HasGet auto&& handle)
{
return internal::getPitches(*handle.get());
}
/** @} */
/** @{
* @name Query the name
*/
/** Compile‑time available name for a given object.
*
* @param any object whose name shall be queried
* @return a `std::string`‑compatible value holding the static name
*/
inline std::convertible_to<std::string> auto getStaticName(auto const& any)
{
return alpaka::internal::GetStaticName::Op<ALPAKA_TYPEOF(any)>{}(any);
}
    /** Compile-time available name of a handle
*
* @param handle object whose name shall be queried
* @return a `std::string`‑compatible value holding the static name
*/
inline std::convertible_to<std::string> auto getStaticName(concepts::StaticNameHandle auto const& handle)
{
return alpaka::internal::GetStaticName::Op<std::decay_t<decltype(*handle.get())>>{}(*handle.get());
}
/** Runtime name for a given object.
*
* @param any object whose name shall be queried
* @return a `std::string`‑compatible value holding the name
*/
inline std::convertible_to<std::string> auto getName(auto&& any)
{
return alpaka::internal::GetName::Op<ALPAKA_TYPEOF(any)>{}(ALPAKA_FORWARD(any));
}
/** Runtime name for a given handle.
*
* @param handle object whose name shall be queried
* @return a `std::string`‑compatible value holding the name
*/
inline std::convertible_to<std::string> auto getName(concepts::NameHandle auto const& handle)
{
return alpaka::internal::GetName::Op<std::decay_t<decltype(*handle.get())>>{}(*handle.get());
}
/** @} */
    /** Get the native handle of a handle.
*
* The native handle can be passed to the underlying backend API
* (e.g. CUDA, HIP, OpenMP) for low‑level operations.
*
* @param handle object exposing a native handle
* @return the native handle returned by the backend‑specific implementation
*/
inline auto getNativeHandle(auto const& handle)
{
return internal::getNativeHandle(*handle.get());
}
/** wait for all work to be finished
*
     * Waits until all work submitted to the given handle before this call has finished
*
* @param handle queue/device/event
*/
inline void wait(alpaka::concepts::HasGet auto& handle)
{
return internal::wait(*handle.get());
}
/** @{
* @name Query raw pointer
*/
/** pointer to data of an object
*
     * For multi-dimensional data the data is not required to be contiguous.
*
* @param any object providing data access (e.g. std::vector)
* @return raw pointer to the underlying data (equivalent to `std::data`)
*/
inline decltype(auto) data(auto&& any)
{
return internal::Data::data(ALPAKA_FORWARD(any));
}
/** pointer to data of an handle
*
     * For multi-dimensional data the data is not required to be contiguous.
*
* @param handle handle providing data access (e.g. view)
* @return raw pointer to the underlying data
*/
inline decltype(auto) data(alpaka::concepts::HasGet auto&& handle)
{
return internal::Data::data(*handle.get());
}
/** @} */
/** @{
* @name Host allocations
*/
/** Allocate host memory for a given element type and extents.
*
* The allocation is performed on the host controller device
     * (`api::host` and `deviceKind::cpu`).
* The returned view owns the allocated memory.
*
* @tparam T_ValueType type of the data elements
* @param extents number of elements per dimension (vector or scalar)
* @return a view owning the newly allocated memory
*/
template<typename T_ValueType>
inline auto allocHost(alpaka::concepts::VectorOrScalar auto const& extents)
{
auto device = makeHostDevice<T_ValueType>();
Vec const extentsVec = extents;
return internal::Alloc::Op<T_ValueType, std::decay_t<decltype(*device.get())>, ALPAKA_TYPEOF(extentsVec)>{}(
*device.get(),
extentsVec);
}
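    /* Minimal usage sketch for host allocations (illustrative):
     *
     *   auto hostBuf = onHost::allocHost<int>(Vec{16u, 128u}); // owning 2D host view with 16 x 128 elements
     *   auto extents = onHost::getExtents(hostBuf);
     *   auto other   = onHost::allocHostLike(hostBuf);         // same value type and extents, content not copied
     */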
/** Allocate host memory with the same value type and extents as an existing view.
*
* The content of the source view is **not** copied. The function deduces the
* element type and extents from `view` and creates a new shared buffer on the
* host controller device.
*
* @param view a view (e.g. `std::vector`, `std::array`, or any compatible type)
* @return a view owning the newly allocated memory
*/
inline auto allocHostLike(auto const& view)
{
auto device = makeHostDevice<ALPAKA_TYPEOF(view)>();
return alloc<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>>(device, internal::getExtents(view));
}
/** @} */
/** @{
* @name Device selection utilities
*/
/** Resolve the list of executors supported for a device specification.
*
* This helper is used internally to build backend dictionaries.
*
* @param deviceSpec device specification to be used
* @param listOfExecutors tuple of executor types to be filtered
* @return a tuple containing the supported executor types
*/
constexpr auto getExecutorsList(auto const deviceSpec, auto const listOfExecutors)
{
using DevSelectorType = decltype(makeDeviceSelector(deviceSpec));
using DeviceType = decltype(std::declval<DevSelectorType>().makeDevice(0));
using ExecutorListType = decltype(supportedExecutors(std::declval<DeviceType>(), listOfExecutors));
return ExecutorListType{};
}
/** Create a tuple of device specifications for a single API.
*
     * Each device specification combines the supplied API with one of the supported
     * device kinds for that API.
*
* @param api a single alpaka API (e.g. `api::cuda`, `api::hip`)
* @return a tuple containing all device specifications for the given API
*/
constexpr auto getDeviceSpecsFor(auto const api)
{
return std::apply(
[api](auto... devType) constexpr { return std::make_tuple(DeviceSpec{api, devType}...); },
supportedDevices(api));
}
/** Create a flattened tuple of device specification objects for a list of APIs.
*
* @param apiList a `std::tuple` containing the APIs
     * @return a tuple containing all device specifications for the given APIs
*/
template<alpaka::concepts::Api... T_Apis>
constexpr auto getDeviceSpecsFor(std::tuple<T_Apis...> const apiList)
{
return std::apply([](auto... api) constexpr { return std::tuple_cat(getDeviceSpecsFor(api)...); }, apiList);
}
/** Build a tuple of backends for a single device specification.
*
* A backend is the combination of a device specification and an executor.
     * Each dictionary stores a `deviceSpec` entry (queried via `backend[object::deviceSpec]`) and an `exec` entry
     * (queried via `backend[object::exec]`) for the corresponding executor, where `backend` denotes one of the
     * returned dictionaries.
*
* @param deviceSpec the device specification to associate with the executors
* @param listOfExecutors tuple of executor types
* @return a tuple of backend objects, one per executor
*/
constexpr auto createBackendsFor(auto const deviceSpec, auto const listOfExecutors)
{
return std::apply(
[deviceSpec](auto... executor) constexpr
{
return std::make_tuple(
Dict{DictEntry{object::deviceSpec, deviceSpec}, DictEntry{object::exec, executor}}...);
},
listOfExecutors);
}
/** Create the complete backend list for all device specifications and executors.
*
* @param devSpecList tuple of device specifications
* @param listOfExecutors tuple of executor types
* @return a tuple of backend objects, for all executors
*/
constexpr auto createBackendList(auto const devSpecList, auto const listOfExecutors)
{
return std::apply(
[listOfExecutors](auto... devSpec) constexpr
{ return std::tuple_cat(createBackendsFor(devSpec, getExecutorsList(devSpec, listOfExecutors))...); },
devSpecList);
}
/** Generate the full set of backend dictionaries for a set of APIs.
*
     * The result contains one backend entry for each combination of a supported device
     * specification (derived from the given APIs) and an executor.
*
* @param usedApis tuple of alpaka APIs to consider
* @param listOfExecutors tuple of executor types
* @return a tuple of backend dictionaries covering all APIs and executors
*/
consteval auto allBackends(auto const usedApis, auto const listOfExecutors)
{
return std::apply(
[listOfExecutors](auto... api) constexpr
{ return std::tuple_cat(createBackendList(getDeviceSpecsFor(api), listOfExecutors)...); },
usedApis);
}
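    /* Minimal usage sketch for building the backend list (illustrative; `myExecutors` is a placeholder for a
     * user-provided std::tuple of executor tag objects, the concrete executor names are not shown here):
     *
     *   constexpr auto backends = onHost::allBackends(std::make_tuple(api::Host{}), myExecutors);
     *   // `backends` is a tuple of dictionaries, each holding a `deviceSpec` and an `exec` entry
     */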
/** @} */
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/onHost/interface.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka
{
/** Lightweight view to data in an n-dimensional array.
*
     * Const-ness of the MdSpan instance is propagated to the data region: a constant MdSpan provides read-only
     * access, even when it references non-const data.
*
* @tparam T_Type if the type is const the data is only readable
*/
template<
typename T_Type,
concepts::Vector T_Extents,
concepts::Vector T_Pitches,
concepts::Alignment T_MemAlignment = Alignment<>>
struct MdSpan;
template<concepts::Alignment T_MemAlignment = Alignment<>>
inline constexpr auto makeMdSpan(
auto* pointer,
concepts::Vector auto const& extents,
concepts::Vector auto const& pitchBytes,
T_MemAlignment const memAlignment = T_MemAlignment{})
{
return MdSpan{pointer, extents, pitchBytes, memAlignment};
}
template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
inline constexpr auto makeMdSpan(
T_ValueType* pointer,
concepts::Vector auto const& extents,
T_MemAlignment const memAlignment = T_MemAlignment{})
{
auto pitchMd = alpaka::calculatePitchesFromExtents<T_ValueType>(extents);
return MdSpan{pointer, extents, pitchMd, memAlignment};
}
inline constexpr auto makeMdSpan(auto&& any)
{
return MdSpan{onHost::data(any), onHost::getExtents(any), onHost::getPitches(any), alpaka::getAlignment(any)};
}
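    /* Minimal usage sketch for MdSpan (illustrative; assumes the referenced memory outlives the span):
     *
     *   float storage[6 * 4];
     *   auto span = alpaka::makeMdSpan(storage, Vec{6u, 4u}); // pitches are derived from the extents
     *   span[Vec{2u, 3u}] = 1.0f;                             // element access via an index vector
     */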
template<
typename T_Type,
concepts::Vector T_Extents,
concepts::Vector T_Pitches,
concepts::Alignment T_MemAlignment>
struct MdSpan
{
using value_type = T_Type;
using reference = value_type&;
using const_reference = std::add_const_t<value_type>&;
using pointer = value_type*;
using const_pointer = std::add_const_t<value_type>*;
using index_type = typename T_Pitches::type;
using ConstThis = MdSpan<std::add_const_t<value_type>, T_Extents, T_Pitches, T_MemAlignment>;
static_assert(std::is_convertible_v<index_type, typename T_Extents::type>);
static_assert(T_Extents::dim() == T_Pitches::dim());
static consteval uint32_t dim()
{
return T_Extents::dim();
}
/** return value the origin pointer is pointing to
*
* @return value at the current location
*/
constexpr const_reference operator*() const
{
return *this->m_ptr;
}
constexpr reference operator*()
{
return *this->m_ptr;
}
/** get origin pointer
*
         * Whether the pointer is const and therefore read-only depends on T_Type and not on the const-ness of the
         * MdSpan.
*/
constexpr const_pointer data() const
{
return this->m_ptr;
}
constexpr pointer data()
{
return this->m_ptr;
}
constexpr auto begin() const
{
return MdForwardIter{*this};
}
constexpr auto end() const
{
return MdForwardIterEnd{*this};
}
constexpr auto cbegin() const
{
return MdForwardIter{this->getConstMdSpan()};
}
constexpr auto cend() const
{
return MdForwardIterEnd{this->getConstMdSpan()};
}
        /* Objects must be initialized by copying a valid instance. */
constexpr MdSpan() = default;
/** Constructor
*
* @param pointer pointer to the memory
* @param extents number of elements
* @param pitchBytes pitch in bytes per dimension
* @param memAlignmentInByte alignment in bytes (zero will set alignment to element alignment)
*/
constexpr MdSpan(
T_Type* pointer,
T_Extents extents,
T_Pitches const& pitchBytes,
[[maybe_unused]] T_MemAlignment const& memAlignmentInByte = T_MemAlignment{})
: m_ptr(pointer)
, m_extent(extents)
, m_pitch(pitchBytes)
{
}
template<typename T_Type_Other>
requires internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
constexpr MdSpan(MdSpan<T_Type_Other, T_Extents, T_Pitches, T_MemAlignment> const& other)
: m_ptr(other.data())
, m_extent(other.getExtents())
, m_pitch(other.getPitches()){};
constexpr MdSpan(MdSpan const&) = default;
// causes a compiler error with nvcc
// error: static assertion failed with "All kernel arguments must be trivially copyable or specialize
// trait::IsKernelArgumentTriviallyCopyable<>!"
// constexpr MdSpan& operator=(MdSpan&) = default;
constexpr MdSpan& operator=(MdSpan&&) = default;
static constexpr auto getAlignment()
{
return T_MemAlignment{};
}
/** get value at the given index
*
* @param idx n-dimensional offset, relative to the origin pointer
* @return reference to the value
*/
constexpr const_reference operator[](concepts::Vector auto const& idx) const
{
return *ptr(idx);
}
constexpr reference operator[](concepts::Vector auto const& idx)
{
return *ptr(idx);
}
constexpr const_reference operator[](std::integral auto const& idx) const requires(dim() == 1u)
{
return *ptr(Vec{idx});
}
constexpr reference operator[](std::integral auto const& idx) requires(dim() == 1u)
{
return *ptr(Vec{idx});
}
constexpr auto getExtents() const
{
return m_extent;
}
        constexpr auto getPitches() const
{
return m_pitch.getPitches();
}
constexpr auto getConstMdSpan() const
{
using ConstValueType = std::add_const_t<value_type>;
return makeMdSpan(
static_cast<ConstValueType*>(m_ptr),
this->getExtents(),
this->getPitches(),
T_MemAlignment{});
}
/** True if MdSpan is pointing to valid memory.
*
* @details
* An MdSpan remains valid even after being moved. The reason for this is that the MdSpan is simply copied.
* This is more efficient than a real move (e.g., setting the data pointer to nullptr). Implementing a real
* move is also not possible because MdSpan must be trivially copyable, which requires a default move
* constructor.
*/
[[nodiscard]] constexpr explicit operator bool() const noexcept
{
return true;
}
protected:
/** get the pointer of the value relative to the origin pointer m_ptr
*
* @param idx n-dimensional offset
* @return pointer to value
*/
constexpr auto ptr(concepts::Vector auto const& idx) const requires(dim() >= 2u)
{
/** offset in bytes
*
             * We calculate the complete offset in bytes even though the x-dimension could be addressed via the
             * native value_type pointer; this reduces the register footprint.
*/
index_type offset = sizeof(value_type) * idx.back();
for(uint32_t d = 0u; d < dim() - 1u; ++d)
{
offset += m_pitch[d] * idx[d];
}
using CharPtrType = std::conditional_t<std::is_const_v<value_type>, char const*, char*>;
using ResultPtrType = std::conditional_t<std::is_const_v<value_type>, const_pointer, pointer>;
return reinterpret_cast<ResultPtrType>(reinterpret_cast<CharPtrType>(this->m_ptr) + offset);
}
constexpr const_pointer ptr(concepts::Vector auto const& idx) const requires(dim() == 1u)
{
return this->m_ptr + idx.x();
}
constexpr pointer ptr(concepts::Vector auto const& idx) requires(dim() == 1u)
{
return this->m_ptr + idx.x();
}
private:
pointer m_ptr;
T_Extents m_extent;
DataPitches<value_type, T_Pitches> m_pitch;
};
template<
typename T_Type,
concepts::Vector T_Extents,
concepts::Vector T_Pitches,
concepts::Alignment T_MemAlignment>
std::ostream& operator<<(std::ostream& s, MdSpan<T_Type, T_Extents, T_Pitches, T_MemAlignment> const& mdSpan)
{
return s << "MdSpan{ dim=" << ALPAKA_TYPEOF(mdSpan)::dim() << ", extents=" << mdSpan.getExtents().toString()
<< ", pitches=" << mdSpan.getPitches().toString()
<< " , alignment=" << T_MemAlignment::template get<T_Type>() << " }";
}
template<
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Vector T_Pitches,
alpaka::concepts::Alignment T_MemAlignment>
struct internal::CopyConstructableDataSource<MdSpan<T_Type, T_Extents, T_Pitches, T_MemAlignment>> : std::true_type
{
using InnerMutable = MdSpan<std::remove_const_t<T_Type>, T_Extents, T_Pitches, T_MemAlignment>;
using InnerConst = MdSpan<std::add_const_t<T_Type>, T_Extents, T_Pitches, T_MemAlignment>;
};
namespace trait
{
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, MdSpan>)
struct IsMdSpan<T> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
// ==
// == ./include/alpaka/mem/MdSpan.hpp ==
// ============================================================================
// #include "alpaka/onAcc/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/layout.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/traverse.hpp" // amalgamate: file already inlined
/** functionality which is usable on the accelerator compute device from within a kernel. */
namespace alpaka::onAcc
{
/**@{
     * @name Index containers usable in range-based for loops
*/
/** Creates an index container
*
* The index data type is deduced from the supplied range.
* The traversal policy (`T_Traverse`) defines how the next valid index is found for a worker and
* defaults to @c traverse::Flat.
* The mapping policy (`T_IdxLayout`) defines how the index is mapped to worker threads and defaults to
* @c layout::Optimized.
*
* @param workGroup Description of the participating thread group. More than one
* thread can have the same index within the group; all workers
* with the same id obtain the same index as result.
* @param range Index range description.
* @param traverse Policy describing how the next value can be found.
* @param idxLayout Policy describing how real worker threads will be mapped to the range.
     * @return An index container that can be used in a range-based for loop.
*/
template<concepts::IdxTraversing T_Traverse = traverse::Flat, concepts::IdxMapping T_IdxLayout = layout::Optimized>
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
auto const& acc,
auto const workGroup,
auto const range,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{})
{
return internal::MakeIter::
Op<void, ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(DomainSpec{workGroup, range}), T_Traverse, T_IdxLayout>{}(
acc,
DomainSpec{workGroup, range},
traverse,
idxLayout);
}
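    /* Minimal usage sketch for makeIdxMap inside a kernel (illustrative; `acc` is the accelerator passed to the
     * kernel and `myWorkGroup` is a placeholder for a worker group description of the participating threads):
     *
     *   for(auto idx : onAcc::makeIdxMap(acc, myWorkGroup, IdxRange{Vec{1024u}}))
     *   {
     *       // idx enumerates the indices of the range [0, 1024) assigned to this worker
     *   }
     */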
/** Specialization for an index container with a given boundary direction of the volume described by the range.
*/
template<concepts::IdxTraversing T_Traverse = traverse::Flat, concepts::IdxMapping T_IdxLayout = layout::Optimized>
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
auto const& acc,
auto const workGroup,
auto const range,
alpaka::concepts::BoundaryDirection auto const& bd,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{})
{
static_assert(ALPAKA_TYPEOF(bd)::dim() == ALPAKA_TYPEOF(range)::dim());
auto const subRange = makeDirectionSubRange(range, bd);
return internal::MakeIter::
Op<void, ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(DomainSpec{workGroup, subRange}), T_Traverse, T_IdxLayout>{}(
acc,
DomainSpec{workGroup, subRange},
traverse,
idxLayout);
}
/** Creates an index container
*
* The traversal policy (`T_Traverse`) defines how the next valid index is found for a worker and
* defaults to @c traverse::Flat.
* The mapping policy (`T_IdxLayout`) defines how the index is mapped to worker threads and defaults to
* @c layout::Optimized.
*
     * @tparam T_ScalarIdxType scalar index type used for the indices inside the iterator
* @param workGroup Description of the participating thread group. More than one
* thread can have the same index within the group; all workers
* with the same id obtain the same index as result.
* @param range Index range description.
* @param traverse Policy describing how the next value can be found.
* @param idxLayout Policy describing how real worker threads will be mapped to the range.
     * @return An index container that can be used in a range-based for loop.
*/
template<
typename T_ScalarIdxType,
concepts::IdxTraversing T_Traverse = traverse::Flat,
concepts::IdxMapping T_IdxLayout = layout::Optimized>
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
auto const& acc,
auto const workGroup,
auto const range,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{})
{
return internal::MakeIter::Op<
T_ScalarIdxType,
ALPAKA_TYPEOF(acc),
ALPAKA_TYPEOF(DomainSpec{workGroup, range}),
T_Traverse,
T_IdxLayout>{}(acc, DomainSpec{workGroup, range}, traverse, idxLayout);
}
///@cond NO_HTML
/** Specialisation for one‑dimensional ranges. */
template<
concepts::IdxTraversing T_Traverse = traverse::Tiled,
concepts::IdxMapping T_IdxLayout = layout::Optimized>
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
auto const& acc,
auto const workGroup,
alpaka::concepts::IdxRange auto const range,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{}) requires(ALPAKA_TYPEOF(range)::dim() == 1u)
{
return internal::MakeIter::
Op<void, ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(DomainSpec{workGroup, range}), T_Traverse, T_IdxLayout>{}(
acc,
DomainSpec{workGroup, range},
traverse,
idxLayout);
}
/** Specialisation for one‑dimensional ranges. */
template<
typename T_ScalarIdxType,
concepts::IdxTraversing T_Traverse = traverse::Tiled,
concepts::IdxMapping T_IdxLayout = layout::Optimized>
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
auto const& acc,
auto const workGroup,
alpaka::concepts::IdxRange auto const range,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{}) requires(ALPAKA_TYPEOF(range)::dim() == 1u)
{
return internal::MakeIter::Op<
T_ScalarIdxType,
ALPAKA_TYPEOF(acc),
ALPAKA_TYPEOF(DomainSpec{workGroup, range}),
T_Traverse,
T_IdxLayout>{}(acc, DomainSpec{workGroup, range}, traverse, idxLayout);
}
///@endcond NO_HTML
/** @} */
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/onAcc/interface.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
#include <new>
namespace alpaka::onAcc::internal
{
/** concurrent foreach implementation */
template<typename T_Parent>
struct SimdConcurrent
{
constexpr SimdConcurrent() = default;
protected:
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
auto numElements = typename ALPAKA_TYPEOF(extents)::UniVec{extents};
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
constexpr uint32_t maxArchSimdWidth
= getArchSimdWidth<ValueType>(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
constexpr uint32_t cachelineBytes
= getCachelineSize(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
constexpr uint32_t width = std::min(
maxArchSimdWidth,
T_Parent::template calcSimdWidth<ValueType, T_maxConcurrencyInByte, cachelineBytes>());
if constexpr(width != 1u)
{
concurrentSimdPackExecution<T_maxConcurrencyInByte, width, T_MemAlignment>(
acc,
numElements,
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
else
{
// execute the algorithm with SIMD width one
for(auto idx : onAcc::makeIdxMap(
acc,
asParent().getWorkGroup(),
IdxRange{numElements},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy()))
{
func(
acc,
SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
}
}
}
private:
constexpr auto const& asParent() const
{
return static_cast<T_Parent const&>(*this);
}
template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
ALPAKA_FN_INLINE static constexpr void executeDo(
auto const& acc,
auto const& dataIdx,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&&... data)
{
func(acc, SimdPtr{ALPAKA_FORWARD(data), dataIdx, T_MemAlignment{}, CVec<uint32_t, T_width>{}}...);
}
        /** calls the functor and forwards the data T_repeat times
         *
         * The calls to the functor are independent and unrolled at compile time to support instruction-level
         * parallelism.
         *
         * @param iter the caller must ensure that the iterator can be increased T_repeat times without jumping over
         * iter.end()
*/
template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
ALPAKA_FN_INLINE static constexpr void execute(
auto const& acc,
auto& iter,
std::integer_sequence<uint32_t, T_repeat...>,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&&... data)
{
/* We do not check if the iterator points to a valid element, the caller must ensure that we can safely
* increase the iterator without jumping over iter.end().
*
* The ternary operator is used to allow using the folding expression on iter.
*/
auto ids = std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
std::apply(
[&](auto const&... dataIdx) constexpr
{
(executeDo<T_MemAlignment, T_width>(acc, dataIdx, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...),
...);
},
ids);
}
template<uint32_t T_maxConcurrencyInByte, uint32_t T_simdWidth, alpaka::concepts::Alignment T_MemAlignment>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrentSimdPackExecution(
auto const& acc,
alpaka::concepts::Vector auto numElements,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
constexpr uint32_t simdWidthInByte = T_simdWidth * sizeof(ValueType);
// number of simd packs fitting into the maxConcurrencyInByte
constexpr uint32_t numSimdPacksToUtilizeConcurrency
= alpaka::divExZero(T_maxConcurrencyInByte, simdWidthInByte);
constexpr uint32_t cachelineBytes
= getCachelineSize(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
// number of simd packs fitting into the cacheline
constexpr uint32_t numSimdPacksPerCacheLine = std::max(cachelineBytes / simdWidthInByte, 1u);
/* number of simd packs used per functor call
* - the number of simd packs per functor call should be a multiple of the number of simd packs per
* cacheline
*/
constexpr uint32_t numSimdPacksPerFnCall
= alpaka::divExZero(numSimdPacksToUtilizeConcurrency, numSimdPacksPerCacheLine)
* numSimdPacksPerCacheLine;
auto const workGroup = asParent().getWorkGroup();
// we SIMDfy only over the fast moving dimension (columns of memory)
auto const wSize = workGroup.size(acc).back();
/* Number of data elements process per functor call. */
auto const numElementsPerFnCall = T_simdWidth * numSimdPacksPerFnCall;
            /** To avoid an overflow in the index range we divide first by the number of elements per
             * function call and then by the number of workers.
*/
auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
            // number of elements to jump over to start the remainder loop
auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
// we SIMDfy only over the fast moving dimension (columns of memory)
auto domainSize = numElements.rAssign(remainderBegin);
auto stride = ALPAKA_TYPEOF(numElements)::all(1).rAssign(T_simdWidth);
using IdxType = ALPAKA_TYPEOF(numElements);
if constexpr(
domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
{
                /* When traversing with the flat policy, we cannot assume that the iterator can later be blindly
                 * increased N times; this can break when there is enough concurrency. For SIMD operations we only
                 * evaluate the fast-moving dimension, but the flat policy flattens the worker group and uses all
                 * workers on a linear domain. The loop must therefore be split into an outer loop iterating over all
                 * slow dimensions and an inner loop iterating over the fast-moving dimension. For this we need to
                 * build our own groups out of the user-provided work group.
*/
// build a worker group with slow-moving dimension threads for the outer loop
using index_type = typename IdxType::type;
auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
auto wSize = workGroup.size(acc).rAssign(index_type{1});
auto domSize = domainSize.rAssign(index_type{1});
auto wOuter = WorkerGroup{wIdx, wSize};
for(auto rowIdx : onAcc::makeIdxMap(
acc,
wOuter,
IdxRange{domSize},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy()))
{
// build a worker group with fast-moving dimension threads for the inner loop
auto wIdxInner = ALPAKA_TYPEOF(domainSize)::all(0).rAssign(workGroup.idx(acc).back());
auto wSizeInner = ALPAKA_TYPEOF(domainSize)::all(1).rAssign(workGroup.size(acc).back());
auto wInner = WorkerGroup{wIdxInner, wSizeInner};
// iterate over the fast-moving dimension
auto simdIdxContainer = onAcc::makeIdxMap(
acc,
wInner,
IdxRange{rowIdx, domainSize, stride},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
{
execute<T_MemAlignment, T_simdWidth>(
acc,
iter,
std::make_integer_sequence<uint32_t, numSimdPacksPerFnCall>{},
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
}
}
else
{
auto simdIdxContainer = onAcc::makeIdxMap(
acc,
workGroup,
IdxRange{IdxType::all(0), domainSize, stride},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy());
for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
{
execute<T_MemAlignment, T_simdWidth>(
acc,
iter,
std::make_integer_sequence<uint32_t, numSimdPacksPerFnCall>{},
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
}
ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.all(0).rAssign(remainderBegin);
for(auto idx : onAcc::makeIdxMap(
acc,
workGroup,
IdxRange{remainderDomainSize, numElements},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy()))
{
func(
acc,
SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
}
}
};
} // namespace alpaka::onAcc::internal
// ==
// == ./include/alpaka/onAcc/internal/SimdConcurrent.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/onAcc/internal/SimdTransformReduce.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Simd.hpp" // amalgamate: file already inlined
// #include "alpaka/SimdPtr.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/functor.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
#include <type_traits>
// #include <utility> // amalgamate: file already included
namespace alpaka
{
    /** Marks a functor which supports SimdPtr arguments
     *
     * Wrap a functor or lambda with this class to signal support for SimdPtr.
* A stencil functor can be used to write stencil operations within a transform call.
*/
template<typename T_Func>
struct StencilFunc : T_Func
{
using Functor = T_Func;
constexpr StencilFunc(auto&& func) : T_Func{ALPAKA_FORWARD(func)}
{
}
};
template<typename T_Func>
ALPAKA_FN_HOST_ACC StencilFunc(T_Func&&) -> StencilFunc<T_Func>;
/** Marks a functor that can only be executed with scalar types and not SIMD packages.
*
     * The functor will be executed element-wise instead of on SIMD packages, because it uses operations that prevent
     * processing SIMD packages directly.
*/
template<typename T_Func>
struct ScalarFunc : T_Func
{
using Functor = T_Func;
constexpr ScalarFunc(auto&& func) : T_Func{ALPAKA_FORWARD(func)}
{
}
};
template<typename T_Func>
ALPAKA_FN_HOST_ACC ScalarFunc(T_Func&&) -> ScalarFunc<T_Func>;
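    /* Minimal usage sketch for the functor markers (illustrative): a functor wrapped into StencilFunc signals that
     * it accepts SimdPtr arguments, while ScalarFunc forces element-wise execution with scalar values, e.g.:
     *
     *   auto scalarAdd = alpaka::ScalarFunc{[](auto const& acc, auto& x) { x += 1.0f; }};
     */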
/** Execute the functor with or without an accelerator as first argument
*
* The functor is not allowed to have both possible signatures.
*
* @{
*/
template<typename T_Acc, typename T_Functor, typename... T_Args>
requires std::invocable<T_Functor, T_Acc, T_Args...>
inline constexpr auto callFunctor(T_Acc const& acc, T_Functor&& functor, T_Args&&... args)
{
return functor(acc, std::forward<T_Args>(args)...);
}
template<typename T_Acc, typename T_Functor, typename... T_Args>
requires std::invocable<T_Functor, T_Args...>
inline constexpr auto callFunctor(T_Acc const&, T_Functor&& functor, T_Args&&... args)
{
return functor(std::forward<T_Args>(args)...);
}
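    /* Minimal usage sketch for callFunctor (illustrative; `acc` stands for an accelerator instance): both functor
     * signatures are dispatched to the matching overload above.
     *
     *   auto withAcc    = [](auto const& acc, int a, int b) { return a + b; };
     *   auto withoutAcc = [](int a, int b) { return a + b; };
     *   int r1 = alpaka::callFunctor(acc, withAcc, 1, 2);    // calls withAcc(acc, 1, 2)
     *   int r2 = alpaka::callFunctor(acc, withoutAcc, 1, 2); // calls withoutAcc(1, 2)
     */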
/** @} */
} // namespace alpaka
// ==
// == ./include/alpaka/functor.hpp ==
// ============================================================================
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/IGeneratorOrMdSpan.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/Acc.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Tag.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/MdSpanArray.hpp ==
// ==
/* Copyright 2025 René Widera, Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/CVec.hpp" // amalgamate: file already inlined
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/concepts/types.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/Alignment.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/DataPitches.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/MdForwardIter.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include <concepts> // amalgamate: file already included
#include <type_traits>
namespace alpaka
{
/** access a C array with compile time extents via a runtime md index. */
template<std::integral auto T_numDims, uint32_t T_dim = 0u>
struct ResolveArrayAccess
{
constexpr decltype(auto) operator()(auto arrayPtr, concepts::Vector auto const& idx) const
{
return ResolveArrayAccess<T_numDims - 1u, T_dim + 1u>{}(arrayPtr[idx[T_dim]], idx);
}
};
template<uint32_t T_dim>
struct ResolveArrayAccess<1u, T_dim>
{
constexpr decltype(auto) operator()(auto arrayPtr, concepts::Vector auto const& idx) const
{
return arrayPtr[idx[T_dim]];
}
};
    /** build a C array type with compile-time extents from a scalar value, based on the compile-time extents vector */
template<typename T, concepts::CVector T_Extents, uint32_t T_numDims = T_Extents::dim(), uint32_t T_dim = 0u>
struct CArrayType
{
using type =
typename CArrayType<T[T_Extents{}[T_numDims - T_dim - 1u]], T_Extents, T_numDims - 1u, T_dim + 1u>::type;
};
template<typename T, concepts::CVector T_Extents, uint32_t T_dim>
struct CArrayType<T, T_Extents, 1u, T_dim>
{
using type = T[T_Extents{}[0u]];
};
template<typename T_ArrayType, concepts::Alignment T_MemAlignment = Alignment<>>
struct MdSpanArray
{
static_assert(
sizeof(T_ArrayType) && false,
"MdSpanArray can only be used if std::is_array_v<T> is true for the given type.");
};
template<alpaka::concepts::CStaticArray T_ArrayType, concepts::Alignment T_MemAlignment>
struct MdSpanArray<T_ArrayType, T_MemAlignment>
{
private:
using MutArrayType = std::remove_cv_t<T_ArrayType>;
using ConstArrayType = std::add_const_t<MutArrayType>;
public:
using extentType = std::extent<T_ArrayType, std::rank_v<T_ArrayType>>;
using value_type = std::remove_all_extents_t<T_ArrayType>;
using reference = value_type&;
using const_reference = value_type const&;
using pointer = value_type*;
using const_pointer = value_type const*;
using index_type = typename extentType::value_type;
static consteval uint32_t dim()
{
return std::rank_v<T_ArrayType>;
}
/** return the value the origin pointer points to
*
* @return value at the current location
*/
constexpr const_reference operator*() const
{
return *this->m_ptr;
}
constexpr reference operator*()
{
return *this->m_ptr;
}
/** get origin pointer */
constexpr const_pointer data() const
{
return this->m_ptr;
}
constexpr pointer data()
{
return this->m_ptr;
}
constexpr auto begin() const
{
return MdForwardIter{*this};
}
constexpr auto end() const
{
return MdForwardIterEnd{*this};
}
constexpr auto getConstMdSpan() const
{
return MdSpanArray<ConstArrayType, T_MemAlignment>(*m_ptr);
}
constexpr auto cbegin() const
{
return MdForwardIter{this->getConstMdSpan()};
}
constexpr auto cend() const
{
return MdForwardIterEnd{this->getConstMdSpan()};
}
/* The object must be initialized by copying a valid instance */
constexpr MdSpanArray() = default;
/** Constructor
*
* @param staticSizedArray reference to the statically sized C array the span will point to
*/
constexpr MdSpanArray(T_ArrayType& staticSizedArray) : m_ptr(const_cast<MutArrayType*>(&staticSizedArray))
{
}
template<alpaka::concepts::CStaticArray T_OtherArrayType>
requires internal::concepts::InnerTypeAllowedCast<T_ArrayType, T_OtherArrayType>
constexpr MdSpanArray(MdSpanArray<T_OtherArrayType, T_MemAlignment> const& other) : m_ptr(other.m_ptr)
{
}
constexpr MdSpanArray(MdSpanArray const&) = default;
constexpr MdSpanArray(MdSpanArray&&) = default;
template<alpaka::concepts::CStaticArray T_OtherArrayType>
requires internal::concepts::InnerTypeAllowedCast<T_ArrayType, T_OtherArrayType>
constexpr MdSpanArray(MdSpanArray<T_OtherArrayType, T_MemAlignment>&& other) : m_ptr(other.m_ptr)
{
}
constexpr MdSpanArray& operator=(MdSpanArray&&) = default;
static constexpr auto getAlignment()
{
return T_MemAlignment{};
}
/** get value at the given index
*
* @param idx offset relative to the origin pointer
* @return reference to the value
* @{
*/
constexpr const_reference operator[](concepts::Vector auto const& idx) const
{
return ResolveArrayAccess<dim()>{}(*m_ptr, idx);
}
constexpr reference operator[](concepts::Vector auto const& idx)
{
return ResolveArrayAccess<dim()>{}(*m_ptr, idx);
}
constexpr const_reference operator[](index_type const& idx) const
{
return (*m_ptr)[idx];
}
constexpr reference operator[](index_type const& idx)
{
return (*m_ptr)[idx];
}
constexpr bool operator==(MdSpanArray const other) const
{
return m_ptr == other.m_ptr;
}
/** @} */
constexpr auto getExtents() const
{
auto const createExtents = []<auto... T_extent>(std::index_sequence<T_extent...>)
{ return CVec<index_type, std::extent_v<T_ArrayType, T_extent>...>{}; };
return createExtents(std::make_integer_sequence<index_type, dim()>{});
}
constexpr auto getPitches() const
{
return alpaka::calculatePitchesFromExtents<value_type>(getExtents());
}
/** True if MdSpanArray is pointing to valid memory.
*
* @details
* An MdSpanArray remains valid even after being moved because it refers to stack memory, which cannot be
* freed.
*/
[[nodiscard]] constexpr explicit operator bool() const noexcept
{
return true;
}
// Needs to be a friend of itself so that the copy and move constructors can access the m_ptr of the other
// instance when the const qualifier of the other C static array type differs.
friend MdSpanArray<MutArrayType, T_MemAlignment>;
friend MdSpanArray<ConstArrayType, T_MemAlignment>;
protected:
// We store the C static array as a mutable type so that another MdSpanArray with a const or non-const
// inner type can be assigned.
// Depending on the value_type, const is added at memory access.
MutArrayType* m_ptr;
};
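/* A minimal usage sketch (assuming alpaka::Vec can be constructed from its components, which is not shown
 * in this file):
 *
 *   float data[2][3];
 *   auto span = MdSpanArray<float[2][3]>{data};   // non-owning view onto the stack array
 *   span[Vec{1u, 2u}] = 42.0f;                    // md access via a runtime index vector
 *   auto extents = span.getExtents();             // compile-time extents: CVec<..., 2, 3>
 */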
template<alpaka::concepts::CStaticArray T_ArrayType, alpaka::concepts::Alignment T_MemAlignment>
struct internal::CopyConstructableDataSource<MdSpanArray<T_ArrayType, T_MemAlignment>> : std::true_type
{
using InnerMutable = MdSpanArray<std::remove_const_t<T_ArrayType>, T_MemAlignment>;
using InnerConst = MdSpanArray<std::add_const_t<T_ArrayType>, T_MemAlignment>;
};
namespace trait
{
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, MdSpanArray>)
struct IsMdSpan<T> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
// ==
// == ./include/alpaka/mem/MdSpanArray.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/meta/NdLoop.hpp ==
// ==
/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/meta/IntegerSequence.hpp ==
// ==
/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/meta/Set.hpp ==
// ==
/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <utility> // amalgamate: file already included
namespace alpaka::meta
{
namespace detail
{
//! Empty dependent type.
template<typename T>
struct Empty
{
};
template<typename... Ts>
struct IsParameterPackSetImpl;
template<>
struct IsParameterPackSetImpl<>
{
static constexpr bool value = true;
};
// Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
// Linearly inherits from empty<T> and checks if it has already inherited from this type.
template<typename T, typename... Ts>
struct IsParameterPackSetImpl<T, Ts...>
: public IsParameterPackSetImpl<Ts...>
, public virtual Empty<T>
{
using Base = IsParameterPackSetImpl<Ts...>;
static constexpr bool value = Base::value && !std::is_base_of_v<Empty<T>, Base>;
};
} // namespace detail
//! Trait that tells if the parameter pack contains only unique (no equal) types.
template<typename... Ts>
using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;
namespace detail
{
template<typename TList>
struct IsSetImpl;
template<template<typename...> class TList, typename... Ts>
struct IsSetImpl<TList<Ts...>>
{
static constexpr bool value = IsParameterPackSet<Ts...>::value;
};
} // namespace detail
//! Trait that tells if the template contains only unique (no equal) types.
template<typename TList>
using IsSet = detail::IsSetImpl<TList>;
} // namespace alpaka::meta
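/* A short sketch of what these traits report, e.g.:
 *   static_assert(alpaka::meta::IsParameterPackSet<int, float, double>::value);   // all types unique
 *   static_assert(!alpaka::meta::IsParameterPackSet<int, float, int>::value);     // 'int' appears twice
 *   static_assert(!alpaka::meta::IsSet<std::tuple<int, int>>::value);             // same check on a type list
 */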
// ==
// == ./include/alpaka/meta/Set.hpp ==
// ============================================================================
// #include <cstddef> // amalgamate: file already included
#include <type_traits>
// #include <utility> // amalgamate: file already included
namespace alpaka::meta
{
namespace detail
{
template<typename TDstType, typename TIntegerSequence>
struct ConvertIntegerSequence;
template<typename TDstType, typename T, T... Tvals>
struct ConvertIntegerSequence<TDstType, std::integer_sequence<T, Tvals...>>
{
using type = std::integer_sequence<TDstType, static_cast<TDstType>(Tvals)...>;
};
} // namespace detail
template<typename TDstType, typename TIntegerSequence>
using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;
namespace detail
{
template<bool TisSizeNegative, bool TbIsBegin, typename T, T Tbegin, typename TIntCon, typename TIntSeq>
struct MakeIntegerSequenceHelper
{
static_assert(!TisSizeNegative, "MakeIntegerSequence<T, N> requires N to be non-negative.");
};
template<typename T, T Tbegin, T... Tvals>
struct MakeIntegerSequenceHelper<
false,
true,
T,
Tbegin,
std::integral_constant<T, Tbegin>,
std::integer_sequence<T, Tvals...>>
{
using type = std::integer_sequence<T, Tvals...>;
};
template<typename T, T Tbegin, T TIdx, T... Tvals>
struct MakeIntegerSequenceHelper<
false,
false,
T,
Tbegin,
std::integral_constant<T, TIdx>,
std::integer_sequence<T, Tvals...>>
{
using type = typename MakeIntegerSequenceHelper<
false,
TIdx == (Tbegin + 1),
T,
Tbegin,
std::integral_constant<T, TIdx - 1>,
std::integer_sequence<T, TIdx - 1, Tvals...>>::type;
};
} // namespace detail
template<typename T, T Tbegin, T Tsize>
using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<
(Tsize < 0),
(Tsize == 0),
T,
Tbegin,
std::integral_constant<T, Tbegin + Tsize>,
std::integer_sequence<T>>::type;
//! Checks if the integral values are unique.
template<typename T, T... Tvals>
struct IntegralValuesUnique
{
static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
};
//! Checks if the values in the index sequence are unique.
template<typename TIntegerSequence>
struct IntegerSequenceValuesUnique;
//! Checks if the values in the index sequence are unique.
template<typename T, T... Tvals>
struct IntegerSequenceValuesUnique<std::integer_sequence<T, Tvals...>>
{
static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
};
//! Checks if the integral values are within the given range.
template<typename T, T Tmin, T Tmax, T... Tvals>
struct IntegralValuesInRange;
//! Checks if the integral values are within the given range.
template<typename T, T Tmin, T Tmax>
struct IntegralValuesInRange<T, Tmin, Tmax>
{
static constexpr bool value = true;
};
//! Checks if the integral values are within the given range.
template<typename T, T Tmin, T Tmax, T I, T... Tvals>
struct IntegralValuesInRange<T, Tmin, Tmax, I, Tvals...>
{
static constexpr bool value
= (I >= Tmin) && (I <= Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
};
//! Checks if the values in the index sequence are within the given range.
template<typename TIntegerSequence, typename T, T Tmin, T Tmax>
struct IntegerSequenceValuesInRange;
//! Checks if the values in the index sequence are within the given range.
template<typename T, T... Tvals, T Tmin, T Tmax>
struct IntegerSequenceValuesInRange<std::integer_sequence<T, Tvals...>, T, Tmin, Tmax>
{
static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
};
} // namespace alpaka::meta
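/* Worked examples for the helpers above (comments only):
 *   MakeIntegerSequenceOffset<int, 2, 3>  yields std::integer_sequence<int, 2, 3, 4>
 *   ConvertIntegerSequence<std::size_t, std::integer_sequence<int, 1, 2>>
 *                                         yields std::integer_sequence<std::size_t, 1, 2>
 *   IntegralValuesInRange<int, 0, 9, 3, 7>::value is true; IntegralValuesUnique<int, 1, 1>::value is false
 */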
// ==
// == ./include/alpaka/meta/IntegerSequence.hpp ==
// ============================================================================
// #include <utility> // amalgamate: file already included
namespace alpaka::meta
{
namespace detail
{
template<typename TIndex, typename TExtentVec, typename TFnObj>
constexpr void ndLoopImpl(std::index_sequence<>, TIndex& idx, TExtentVec const&, TFnObj const& f)
{
f(idx);
}
template<std::size_t Tdim0, std::size_t... Tdims, typename TIndex, typename TExtentVec, typename TFnObj>
constexpr void ndLoopImpl(
std::index_sequence<Tdim0, Tdims...>,
TIndex& idx,
TExtentVec const& extent,
TFnObj const& f)
{
static_assert(TIndex::dim() > 0u, "The dimension given to ndLoop has to be larger than zero!");
static_assert(
TIndex::dim() == TExtentVec::dim(),
"The dimensions of the iteration vector and the extent vector have to be identical!");
static_assert(TIndex::dim() > Tdim0, "The current dimension has to be in the range [0,dim-1]!");
for(idx[Tdim0] = 0u; idx[Tdim0] < extent[Tdim0]; ++idx[Tdim0])
{
ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
}
}
} // namespace detail
//! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
//! The loops are nested in the order given by the index_sequence with the first element being the outermost
//! and the last index the innermost loop.
//!
//! \param indexSequence A sequence of indices being a permutation of the values [0, dim-1].
//! \param extent N-dimensional loop extent.
//! \param f The function called at each iteration.
template<typename TExtentVec, typename TFnObj, std::size_t... Tdims>
auto ndLoop(
[[maybe_unused]] std::index_sequence<Tdims...> indexSequence,
TExtentVec& idx,
TExtentVec const& extent,
TFnObj const& f) -> void
{
static_assert(
IntegerSequenceValuesInRange<std::index_sequence<Tdims...>, std::size_t, 0, TExtentVec::dim()>::value,
"The values in the index_sequence have to be in the range [0,dim-1]!");
static_assert(
IntegerSequenceValuesUnique<std::index_sequence<Tdims...>>::value,
"The values in the index_sequence have to be unique!");
detail::ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
}
//! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
//! The loops are nested from index zero outermost to index (dim-1) innermost.
//!
//! \param extent N-dimensional loop extent.
//! \param f The function called at each iteration.
template<typename TExtentVec, typename TFnObj>
auto ndLoopIncIdx(TExtentVec& idx, TExtentVec const& extent, TFnObj const& f) -> void
{
idx = TExtentVec::all(0);
ndLoop(std::make_index_sequence<TExtentVec::dim()>(), idx, extent, f);
}
template<typename TExtentVec, typename TFnObj>
auto ndLoopIncIdx(TExtentVec const& extent, TFnObj const& f) -> void
{
// TExtentVec could be a CVec therefore we need to make it writable
using IndexVector = typename TExtentVec::UniVec;
auto idx = IndexVector::all(0);
ndLoop(std::make_index_sequence<TExtentVec::dim()>(), idx, IndexVector{extent}, f);
}
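/* A minimal sketch of ndLoopIncIdx (assuming Vec can be constructed from its components; use() is a
 * placeholder for the per-index work):
 *
 *   // visits (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) - index 0 is the outermost loop
 *   alpaka::meta::ndLoopIncIdx(Vec{2u, 3u}, [](auto const& idx) { use(idx); });
 *
 * ndLoop additionally takes an index_sequence that permutes the nesting order of the loops.
 */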
} // namespace alpaka::meta
// ==
// == ./include/alpaka/meta/NdLoop.hpp ==
// ============================================================================
// #include "alpaka/onAcc/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
namespace alpaka::onAcc
{
template<typename T_Storage>
struct Acc : T_Storage
{
constexpr Acc(T_Storage const& storage) : T_Storage{storage}
{
}
constexpr Acc(Acc const&) = delete;
constexpr Acc(Acc&&) = delete;
constexpr Acc& operator=(Acc const&) = delete;
constexpr Acc& operator=(Acc&&) = delete;
/** Get the n-dimensional indices within the given origin, expressed in the selected unit */
constexpr alpaka::concepts::Vector auto getIdxWithin(concepts::Origin auto origin, concepts::Unit auto unit)
const
{
return internalCompute::GetIdxWithin::Op<Acc, ALPAKA_TYPEOF(origin), ALPAKA_TYPEOF(unit)>{}(
*this,
origin,
unit);
}
/** Get the n-dimensional extents of the given origin, expressed in the selected unit */
constexpr alpaka::concepts::Vector auto getExtentsOf(concepts::Origin auto origin, concepts::Unit auto unit)
const
{
return internalCompute::GetExtentsOf::Op<Acc, ALPAKA_TYPEOF(origin), ALPAKA_TYPEOF(unit)>{}(
*this,
origin,
unit);
}
static constexpr bool hasKey(auto key)
{
constexpr auto idx = Idx<ALPAKA_TYPEOF(key), std::decay_t<T_Storage>>::value;
return idx != -1;
}
constexpr auto getApi() const
{
return T_Storage::operator[](object::api);
}
constexpr auto getDeviceKind() const
{
return T_Storage::operator[](object::deviceKind);
}
};
namespace concepts
{
/** Concept to check if a type is an accelerator
*
* @tparam T_Acc Type to check
* @tparam T_Api Enforce an API type, if not provided api type is not checked
*/
template<typename T_Acc, typename T_Api = alpaka::NotRequired>
concept Acc = alpaka::isSpecializationOf_v<T_Acc, alpaka::onAcc::Acc>
&& (std::same_as<T_Api, ALPAKA_TYPEOF(std::declval<T_Acc>().getApi())>
|| std::same_as<T_Api, alpaka::NotRequired>);
} // namespace concepts
/** Synchronize all threads within a given scope */
template<alpaka::concepts::Layer T_Scope>
constexpr void sync(concepts::Acc auto const& acc, T_Scope scope)
{
internalCompute::sync(acc, scope);
}
/** Synchronize all threads within a thread block */
constexpr void syncBlockThreads(concepts::Acc auto const& acc)
{
internalCompute::sync(acc, alpaka::layer::block);
}
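/* Typical use inside a kernel body, e.g. to make shared-memory writes visible to the whole block:
 *
 *   onAcc::syncBlockThreads(acc);            // equivalent to the next line
 *   onAcc::sync(acc, alpaka::layer::block);
 */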
/** Create a variable located in the thread blocks shared memory
*
* @code{.cpp}
* // creates a reference to a float value
* auto& foo = declareSharedVar<float, uniqueId()>(acc);
* @endcode
*
* @attention The data is not initialized; it can contain garbage.
*
* @tparam T The type that should be created; the constructor is not called
* @tparam T_uniqueId ID that is unique inside a kernel.
* Reusing the id will return the same memory declared before with the same id.
* @return Result should be taken by reference
*/
template<typename T, size_t T_uniqueId>
constexpr decltype(auto) declareSharedVar(concepts::Acc auto const& acc)
{
return internalCompute::declareSharedVar<T, T_uniqueId>(acc);
}
/** creates an M-dimensional array
*
* @code{.cpp}
* // creates an MdSpan view to a 5x8 float array; do NOT take the result by reference
* auto fooArrayMd = declareSharedMdArray<float, uniqueId()>(acc, CVec<uint32_t, 5, 8>{});
* @endcode
*
* @attention The data is not initialized; it can contain garbage.
*
* @tparam T type which should be created, the constructor is not called
* @tparam T_uniqueId ID that is unique inside a kernel.
* Reusing the id will return the same memory declared before with the same id.
* @param extent M-dimensional extent in elements for each dimension, 1 to M dimensions are supported
* @return MdSpan non-owning view to the corresponding data, you should NOT store a reference to the handle
*/
template<typename T, size_t T_uniqueId>
constexpr decltype(auto) declareSharedMdArray(
concepts::Acc auto const& acc,
alpaka::concepts::CVector auto const& extent)
{
using CArrayType = typename CArrayType<T, ALPAKA_TYPEOF(extent)>::type;
/* XOR with hash to avoid issues in case the user is using the same id to create an array and normal shared
* variables.
*/
constexpr size_t id = T_uniqueId ^ 0x9e37'79b9'7f4a'7c15;
constexpr auto alignment = Alignment<alignof(T)>{};
return MdSpanArray<CArrayType, ALPAKA_TYPEOF(alignment)>{declareSharedVar<CArrayType, id>(acc)};
}
/** Get block shared dynamic memory.
*
* The available size of the memory can be defined by specializing 'onHost::trait::GetDynSharedMemBytes' or adding a
* public member variable 'uint32_t dynSharedMemBytes' to a kernel. The memory can be accessed by all threads
* within a block. Access to the memory is not thread-safe.
*
* \tparam T The element type.
* \return Pointer to pre-allocated contiguous memory.
*/
template<typename T>
constexpr auto getDynSharedMem(concepts::Acc auto const& acc) -> T*
{
return internalCompute::declareDynamicSharedMem<T>(acc);
}
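/* A hedged sketch of a kernel requesting dynamic shared memory via the public member variable mentioned
 * above (kernel name and size are made up for illustration):
 *
 *   struct StencilKernel
 *   {
 *       uint32_t dynSharedMemBytes = 1024u; // bytes requested per block
 *
 *       ALPAKA_FN_ACC void operator()(auto const& acc, ...) const
 *       {
 *           float* scratch = onAcc::getDynSharedMem<float>(acc);
 *           // ... use scratch[0 .. dynSharedMemBytes / sizeof(float) - 1] within the block
 *       }
 *   };
 */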
} // namespace alpaka::onAcc
namespace alpaka::onAcc::internalCompute
{
/** synchronize all threads within a thread block */
template<concepts::Acc T_Acc>
struct Sync::Op<T_Acc, alpaka::layer::Block>
{
constexpr auto operator()(T_Acc const& acc, alpaka::layer::Block const scope) const
{
acc[action::threadBlockSync]();
}
};
} // namespace alpaka::onAcc::internalCompute
// ==
// == ./include/alpaka/onAcc/Acc.hpp ==
// ============================================================================
// #include "alpaka/onAcc/interface.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <new> // amalgamate: file already included
namespace alpaka::onAcc::internal
{
/** concurrent reduce implementation */
template<typename T_Parent>
struct SimdTransformReduce
{
constexpr SimdTransformReduce() = default;
protected:
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto const& neutralElement,
auto&& reduceFunc,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
auto numElements = typename ALPAKA_TYPEOF(extents)::UniVec{extents};
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
constexpr uint32_t maxArchSimdWidth
= getArchSimdWidth<ValueType>(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
constexpr uint32_t cachlineBytes
= getCachelineSize(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
constexpr uint32_t width = std::min(
maxArchSimdWidth,
T_Parent::template calcSimdWidth<ValueType, T_maxConcurrencyInByte, cachlineBytes>());
auto const workGroup = asParent().getWorkGroup();
if constexpr(width != 1u)
{
return reduceSimdPackExecution<T_maxConcurrencyInByte, width, T_MemAlignment>(
acc,
numElements,
neutralElement,
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
// execute the algorithm with SIMD width one
auto traverse = onAcc::makeIdxMap(
acc,
workGroup,
IdxRange{numElements},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy());
using ReturnType = decltype(func(
acc,
SimdPtr{data0, *(traverse.begin()), T_MemAlignment{}, CVec<uint32_t, 1u>{}},
SimdPtr{dataN, *(traverse.begin()), T_MemAlignment{}, CVec<uint32_t, 1u>{}}...));
auto retValue = ReturnType::all(neutralElement);
for(auto idx : traverse)
{
retValue = reduceFunc(
retValue,
func(
acc,
SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...));
}
return retValue[0];
}
private:
template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
ALPAKA_FN_INLINE static constexpr auto executeDoTransform(
auto const& acc,
auto const& dataIdx,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&&... data)
{
return func(acc, SimdPtr{ALPAKA_FORWARD(data), dataIdx, T_MemAlignment{}, CVec<uint32_t, T_width>{}}...);
}
/** calls the functor and forwards the data T_repeat times
*
* The calls to the functor are independent and compile-time unrolled to support instruction-level parallelism.
*
* @param iter the caller must ensure that the iterator can be increased T_repeat times without jumping over
* iter.end()
*/
template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
ALPAKA_FN_INLINE static constexpr auto executeReduce(
auto const& acc,
auto& iter,
std::integer_sequence<uint32_t, T_repeat...>,
auto&& reduceFunc,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&&... data)
{
/* We do not check if the iterator points to a valid element; the caller must ensure that we can safely
* increase the iterator without jumping over iter.end().
*
* The ternary operator is used to allow applying the fold expression to iter.
*/
auto ids = std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
return std::apply(
[&](auto const&... dataIdx) constexpr
{
/* It is not possible to create a Simd{Simd} due to constructor issues. Therefore we need to define
* the type of the result explicitly.
*/
using ComponentType = ALPAKA_TYPEOF(
executeDoTransform<T_MemAlignment, T_width>(
acc,
std::get<0>(std::make_tuple(dataIdx...)),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data)...));
auto results = Simd<ComponentType, std::tuple_size_v<ALPAKA_TYPEOF(ids)>>{
executeDoTransform<T_MemAlignment, T_width>(
acc,
dataIdx,
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data)...)...};
return results.reduce(ALPAKA_FORWARD(reduceFunc));
},
ids);
}
private:
template<onAcc::concepts::Acc T_Acc, typename T_ReduceOp>
struct ScalarReducer
{
// using a const reference here is fine because we control the lifetime
T_Acc const& m_acc;
T_ReduceOp const& m_reduceOp;
constexpr ScalarReducer(T_Acc const& acc, auto&& func) : m_acc(acc), m_reduceOp{ALPAKA_FORWARD(func)}
{
}
constexpr auto operator()(auto&& a, auto&& b) const
requires(alpaka::concepts::Simd<ALPAKA_TYPEOF(a)> && alpaka::concepts::Simd<ALPAKA_TYPEOF(b)>)
{
return loadAndExecuteScalarOp(
std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(a)::dim()>{},
[this](alpaka::concepts::CVector auto idx, auto const& acc, auto&& func, auto&&... data) constexpr
{
// recursively call until no Simd type is the result
return this->operator()(data[idx.x()]...);
},
m_acc,
m_reduceOp,
a,
b);
}
constexpr auto operator()(auto&& a, auto&& b) const
requires(!alpaka::concepts::Simd<ALPAKA_TYPEOF(a)> && !alpaka::concepts::Simd<ALPAKA_TYPEOF(b)>)
{
return m_reduceOp(a, b);
}
private:
template<uint32_t... T_idx>
ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr auto loadAndExecuteScalarOp(
std::integer_sequence<uint32_t, T_idx...>,
auto&& op,
auto const& acc,
auto&& func,
auto&&... data)
{
return Simd{op(CVec<uint32_t, T_idx>{}, acc, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...)...};
}
};
/** Get the reducer functor
*
* @return the wrapped functor if the input is a ScalarFunc (@see ScalarFunc), otherwise the unchanged input functor
*/
ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const&, auto&& reduceOp) const
requires(!isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
{
return reduceOp;
}
ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const& acc, auto&& reduceOp) const
requires(isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
{
return ScalarReducer<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(reduceOp)>{acc, reduceOp};
}
constexpr auto const& asParent() const
{
return static_cast<T_Parent const&>(*this);
}
template<uint32_t T_maxConcurrencyInByte, uint32_t T_simdWidth, alpaka::concepts::Alignment T_MemAlignment>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto reduceSimdPackExecution(
auto const& acc,
alpaka::concepts::Vector auto numElements,
auto const& neutralElement,
auto&& userReduceFunc,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
auto reduceFunc = getReducer(acc, userReduceFunc);
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
constexpr uint32_t simdWidthInByte = T_simdWidth * sizeof(ValueType);
// number of simd packs fitting into the maxConcurrencyInByte
constexpr uint32_t numSimdPacksToUtilizeConcurrency
= alpaka::divExZero(T_maxConcurrencyInByte, simdWidthInByte);
constexpr uint32_t cachlineBytes
= getCachelineSize(ALPAKA_TYPEOF(acc.getApi()){}, ALPAKA_TYPEOF(acc.getDeviceKind()){});
// number of simd packs fitting into the cacheline
constexpr uint32_t numSimdPacksPerCacheLine = alpaka::divExZero(cachlineBytes, simdWidthInByte);
/* number of simd packs used per functor call
* - the number of simd packs per functor call should be a multiple of the number of simd packs per
* cacheline
*/
constexpr uint32_t numSimdPacksPerFnCall
= alpaka::divExZero(numSimdPacksToUtilizeConcurrency, numSimdPacksPerCacheLine)
* numSimdPacksPerCacheLine;
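/* Worked example with illustrative numbers (not tied to a concrete device): for float data (4 byte) and
 * T_simdWidth = 8 one pack is 32 byte wide; with T_maxConcurrencyInByte = 64 and a 64 byte cacheline this
 * gives numSimdPacksToUtilizeConcurrency = 2 and numSimdPacksPerCacheLine = 2, hence
 * numSimdPacksPerFnCall = 2, i.e. 16 elements are processed per functor call.
 */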
auto const workGroup = asParent().getWorkGroup();
// we SIMDfy only over the fast moving dimension (columns of memory)
auto const wSize = workGroup.size(acc).back();
/* Number of data elements processed per functor call. */
auto const numElementsPerFnCall = T_simdWidth * numSimdPacksPerFnCall;
/** To avoid an overflow in the index range we divide first by the number of elements per
* function call and then by the number of workers.
*/
auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
// number of elements to jump over to start the remainder loop
auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
// we SIMDfy only over the fast moving dimension (columns of memory)
auto domainSize = numElements.rAssign(remainderBegin);
auto stride = ALPAKA_TYPEOF(numElements)::all(1).rAssign(T_simdWidth);
using IdxType = ALPAKA_TYPEOF(numElements);
auto simdIdxContainer = onAcc::makeIdxMap(
acc,
workGroup,
IdxRange{IdxType::all(0), domainSize, stride},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy());
auto iter = simdIdxContainer.begin();
using SimdReturn = decltype(executeReduce<T_MemAlignment, T_simdWidth>(
acc,
iter,
std::make_integer_sequence<uint32_t, numSimdPacksPerFnCall>{},
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...));
auto tmpReturn = SimdReturn::all(neutralElement);
if constexpr(
domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
{
/* With the flat traverse policy we cannot assume that the iterator can blindly be increased N more times
* later on; this can happen when there is enough concurrency. SIMD operations are evaluated only over the
* fast-moving dimension, but the flat policy flattens the worker group and uses all workers on a linear
* domain. The loop must therefore be split into an outer loop over all slow dimensions and an inner loop
* iterating over the fast-moving dimension. For this we need to build our own groups out of the
* user-provided work group.
*/
// build a worker group with slow-moving dimension threads for the outer loop
using index_type = typename IdxType::type;
auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
auto wSize = workGroup.size(acc).rAssign(index_type{1});
auto domSize = domainSize.rAssign(index_type{1});
auto wOuter = WorkerGroup{wIdx, wSize};
for(auto rowIdx : onAcc::makeIdxMap(
acc,
wOuter,
IdxRange{domSize},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy()))
{
// build a worker group with fast-moving dimension threads for the inner loop
auto wIdxInner = ALPAKA_TYPEOF(domainSize)::all(0).rAssign(workGroup.idx(acc).back());
auto wSizeInner = ALPAKA_TYPEOF(domainSize)::all(1).rAssign(workGroup.size(acc).back());
auto wInner = WorkerGroup{wIdxInner, wSizeInner};
// iterate over the fast-moving dimension
auto simdIdxContainer = onAcc::makeIdxMap(
acc,
wInner,
IdxRange{rowIdx, domainSize, stride},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
{
tmpReturn = reduceFunc(
tmpReturn,
executeReduce<T_MemAlignment, T_simdWidth>(
acc,
iter,
std::make_integer_sequence<uint32_t, numSimdPacksPerFnCall>{},
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...));
}
}
}
else
{
for(; iter != simdIdxContainer.end();)
{
tmpReturn = reduceFunc(
tmpReturn,
executeReduce<T_MemAlignment, T_simdWidth>(
acc,
iter,
std::make_integer_sequence<uint32_t, numSimdPacksPerFnCall>{},
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...));
}
}
ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.all(0).rAssign(remainderBegin);
for(auto idx : onAcc::makeIdxMap(
acc,
workGroup,
IdxRange{remainderDomainSize, numElements},
asParent().getTraversePolicy(),
asParent().getIdxLayoutPolicy()))
{
tmpReturn[0] = reduceFunc(
tmpReturn[0],
func(
acc,
SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...)[0]);
}
return tmpReturn.reduce(ALPAKA_FORWARD(reduceFunc));
}
};
} // namespace alpaka::onAcc::internal
// ==
// == ./include/alpaka/onAcc/internal/SimdTransformReduce.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
namespace alpaka::onAcc
{
/** Creates a functor that operates on contiguous data concurrently.
*
* The class is automatically configured to use the best-fitting SIMD width for the given data type and is able to
* expose instruction-level parallelism.
*
* @param T_WorkGroup participating thread description. More than one thread can have the same index within the
* group. All workers with the same id will get the same index as a result.
* @param T_Traverse Policy to configure the method used to find the next valid index for a worker. @see namespace
* traverse
* @param T_IdxLayout Policy to define how indices will be mapped to worker threads. @see namespace layout
*/
template<
typename T_WorkGroup,
concepts::IdxTraversing T_Traverse = traverse::Flat,
concepts::IdxMapping T_IdxLayout = layout::Optimized>
struct SimdAlgo
: protected internal::SimdConcurrent<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
, protected internal::SimdTransformReduce<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
{
constexpr SimdAlgo(
T_WorkGroup const workGroup,
T_Traverse traverse = T_Traverse{},
T_IdxLayout idxLayout = T_IdxLayout{})
: m_workGroup{workGroup}
{
}
constexpr T_WorkGroup getWorkGroup() const
{
return m_workGroup;
}
constexpr T_Traverse getTraversePolicy() const
{
return T_Traverse{};
}
constexpr T_IdxLayout getIdxLayoutPolicy() const
{
return T_IdxLayout{};
}
/** execute the functor concurrently over the given data.
*
* @attention The number of elements to process is derived from the first MdSpan object.
* All other MdSpan objects must have at least the same number of elements.
* The optimal concurrency is also derived from the first MdSpan.
*
* @param func the functor to be executed
* @param data0 the first data to be processed
* @param dataN the remaining data to be processed
*
* @{
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
auto const& acc,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
concurrent(acc, data0.getExtents(), ALPAKA_FORWARD(func), ALPAKA_FORWARD(data0), ALPAKA_FORWARD(dataN)...);
}
/**
* @param extents number of elements to process in each dimension
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
concurrent<
alpaka::getNumElemPerThread<ValueType>(
ALPAKA_TYPEOF(acc.getApi()){},
ALPAKA_TYPEOF(acc.getDeviceKind()){})
* sizeof(ValueType)>(
acc,
extents,
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/** @} */
/** execute the functor concurrently over the given data.
*
* @attention The number of elements to process is derived from the first MdSpan object.
* All other MdSpan objects must have at least the same number of elements.
*
* @param T_maxConcurrencyInByte
* Maximum number of bytes to be used for concurrency.
* Concurrency bytes describe a virtual simd pack size which is not exceeded.
* Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
* T_maxConcurrencyInByte.
* @param T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived from
* the MdSpan data descriptions
* @param func the functor to be executed
* @param data0 the first data to be processed
* @param dataN the remaining data to be processed
*
* @{
*/
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
auto const& acc,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
concurrent<T_maxConcurrencyInByte, T_MemAlignment>(
acc,
data0.getExtents(),
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/**
* @param extents number of elements to process in each dimension
*/
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto&& func,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
ConcurrentAlgo::template concurrent<T_maxConcurrencyInByte, T_MemAlignment>(
acc,
extents,
ALPAKA_FORWARD(func),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/** @} */
/** execute the functor concurrently over the given data.
*
* @attention The number of elements to process is derived from the first MdSpan object.
* All other MdSpan objects must have at least the same number of elements.
*
* @param neutralElement the neutral element for the reduction operation
* @param reduceFunc the binary reduction operation to be executed, e.g. std::plus
* @param transformFunc n-nary functor to be executed, values of all containers will be passed to the functor
* as arguments
* @param data0 the first data to be processed
* @param dataN the remaining data to be processed
*
* @{
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
auto const& acc,
auto const& neutralElement,
auto&& reduceFunc,
auto&& transformFunc,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
return transformReduce(
acc,
data0.getExtents(),
neutralElement,
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(transformFunc),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/**
* @param extents number of elements to process in each dimension
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto const& neutralElement,
auto&& reduceFunc,
auto&& transformFunc,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
return transformReduce<
alpaka::getNumElemPerThread<ValueType>(
ALPAKA_TYPEOF(acc.getApi()){},
ALPAKA_TYPEOF(acc.getDeviceKind()){})
* sizeof(ValueType)>(
acc,
extents,
neutralElement,
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(transformFunc),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/** @} */
/** execute the transformFunctor concurrently over the given data.
*
* @attention The number of elements to process is derived from the first MdSpan object.
* All other MdSpan objects must have at least the same number of elements.
*
* @param T_maxConcurrencyInByte
* Maximum number of bytes to be used for concurrency.
* Concurrency bytes describe a virtual simd pack size which is not exceeded.
* Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
* T_maxConcurrencyInByte.
* @param T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived from
* the MdSpan data descriptions
* @param neutralElement the neutral element for the reduction operation
* @param reduceFunc the binary reduction operation to be executed, e.g. std::plus
* @param transformFunc n-nary functor to be executed, values of all containers will be passed to the functor
* as arguments
* @param data0 the first data to be processed
* @param dataN the remaining data to be processed
*
* @{
*/
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
auto const& acc,
auto const& neutralElement,
auto&& reduceFunc,
auto&& transformFunc,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
return transformReduce<T_maxConcurrencyInByte, T_MemAlignment>(
acc,
data0.getExtents(),
neutralElement,
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(transformFunc),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/**
* @param extents number of elements to process in each dimension
*/
template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
auto const& acc,
alpaka::concepts::Vector auto extents,
auto const& neutralElement,
auto&& reduceFunc,
auto&& transformFunc,
alpaka::concepts::IGeneratorOrMdSpan auto&& data0,
alpaka::concepts::IGeneratorOrMdSpan auto&&... dataN) const
{
return ReduceAlgo::template transformReduce<T_maxConcurrencyInByte, T_MemAlignment>(
acc,
extents,
neutralElement,
ALPAKA_FORWARD(reduceFunc),
ALPAKA_FORWARD(transformFunc),
ALPAKA_FORWARD(data0),
ALPAKA_FORWARD(dataN)...);
}
/** @} */
private:
using ConcurrentAlgo = internal::SimdConcurrent<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>;
using ReduceAlgo = internal::SimdTransformReduce<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>;
friend ConcurrentAlgo;
friend ReduceAlgo;
template<typename T_Type, uint32_t T_maxConcurrencyInByte, uint32_t T_cacheLineInByte>
static constexpr auto calcSimdWidth()
{
constexpr uint32_t maxSimdBytes = std::min(T_cacheLineInByte, T_maxConcurrencyInByte);
return alpaka::divExZero(maxSimdBytes, static_cast<uint32_t>(sizeof(T_Type)));
}
T_WorkGroup m_workGroup;
};
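/* A rough usage sketch inside a kernel (the lambda signatures mirror the SimdFillKernel further below;
 * the container names dst/src are made up, and the reduction assumes the Simd type provides an
 * elementwise operator+ so that std::plus can act as the reduction, otherwise a ScalarFunc wrapper is
 * required):
 *
 *   auto simdGrid = SimdAlgo{worker::threadsInGrid};
 *
 *   // elementwise copy: dst[i] = src[i]
 *   simdGrid.concurrent(
 *       acc,
 *       [](auto const& acc, auto dstPtr, auto srcPtr) constexpr { dstPtr = srcPtr.load(); },
 *       dst,
 *       src);
 *
 *   // reduction, e.g. the sum over all elements of src
 *   auto sum = simdGrid.transformReduce(
 *       acc,
 *       0.0f,
 *       std::plus{},
 *       [](auto const& acc, auto srcPtr) constexpr { return srcPtr.load(); },
 *       src);
 */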
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/onAcc/SimdAlgo.hpp ==
// ============================================================================
// #include "alpaka/onAcc/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/FrameSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/logger/logger.hpp" // amalgamate: file already inlined
// #include <algorithm> // amalgamate: file already included
namespace alpaka::internal::generic
{
/** assign a value to each element of the destination
*
* @todo replace the kernel as soon as we have an algorithm forEach callable from host
*/
struct SimdFillKernel
{
ALPAKA_FN_ACC void operator()(auto const& acc, alpaka::concepts::MdSpan auto dest, auto const value) const
{
auto simdGrid = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};
simdGrid.concurrent(
acc,
dest.getExtents(),
[value](auto const& acc, auto destSimdPtr) constexpr
{
using SimdType = ALPAKA_TYPEOF(destSimdPtr.load());
destSimdPtr = SimdType::all(value);
},
dest);
}
};
template<typename T_Value>
inline void fill(
auto& internalQueue,
auto executor,
alpaka::concepts::MdSpan<T_Value> auto&& dest,
T_Value elementValue)
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory);
uint32_t elementsPerFrameItem = getNumElemPerThread<T_Value>(internalQueue);
auto extents = onHost::getExtents(dest);
using ExtentsType = ALPAKA_TYPEOF(extents);
using IndexType = typename ExtentsType::type;
auto virtualFrameExtent = ExtentsType::all(1u);
// 512 is an arbitrary choice that works well on all devices for a value-assign kernel
virtualFrameExtent.x() = std::min(static_cast<IndexType>(512u * elementsPerFrameItem), extents.x());
auto numFrames = divExZero(extents, virtualFrameExtent);
auto realFrameExtent = ExtentsType::all(1u);
realFrameExtent.x() = IndexType{512u};
auto frameSpec = onHost::FrameSpec{numFrames, realFrameExtent};
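/* Illustrative numbers (assuming elementsPerFrameItem == 4): for a 1D extent of 1'048'576 floats the
 * virtual frame covers 512 * 4 = 2048 elements, so 512 frames of 512 threads each are launched and every
 * thread handles its elements via the SIMD fill kernel above.
 */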
ALPAKA_LOG_INFO(
onHost::logger::memory,
[&]()
{
std::stringstream ss;
ss << "fill{ extents=" << extents << ", elementsPerFrameItem" << elementsPerFrameItem
<< ", dst=" << dest << ", value_type=" << onHost::demangledName(elementValue) << ", " << frameSpec
<< " }";
return ss.str();
});
onHost::internal::enqueue(
internalQueue,
executor,
frameSpec,
KernelBundle{SimdFillKernel{}, dest, elementValue});
}
} // namespace alpaka::internal::generic
// ==
// == ./include/alpaka/api/generic.hpp ==
// ============================================================================
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/Event.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/exec/OmpBlocks.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/IdxLayer.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Tag.hpp" // amalgamate: file already inlined
// #include "alpaka/core/util.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace cpu
{
template<typename IndexVecType>
struct OneLayer
{
constexpr OneLayer() = default;
constexpr auto idx() const
{
return IndexVecType::all(0);
}
constexpr auto idx() const requires alpaka::concepts::CVector<IndexVecType>
{
return IndexVecType::template all<0>();
}
constexpr auto count() const
{
return IndexVecType::all(1);
}
constexpr auto count() const requires alpaka::concepts::CVector<IndexVecType>
{
return IndexVecType::template all<1u>();
}
};
template<typename T_Idx, typename T_Count>
struct GenericLayer
{
constexpr GenericLayer(T_Idx idx, T_Count count) : m_idx(idx), m_count(count)
{
}
constexpr decltype(auto) idx() const
{
return unWrapp(m_idx);
}
constexpr decltype(auto) count() const
{
return unWrapp(m_count);
}
T_Idx m_idx;
T_Count m_count;
};
} // namespace cpu
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/api/host/IdxLayer.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/block/mem/SingleThreadStaticShared.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/host/block/mem/SharedStorage.hpp ==
// ==
/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/Assert.hpp" // amalgamate: file already inlined
// #include <algorithm> // amalgamate: file already included
// #include <array> // amalgamate: file already included
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <limits> // amalgamate: file already included
#include <type_traits>
#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
# define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 64u
#endif
namespace alpaka::onAcc::cpu::detail
{
//! Implementation of static block shared memory provider.
//!
//! externally allocated fixed-size memory, likely provided by BlockSharedMemDynMember.
template<std::size_t TMinDataAlignBytes>
class SharedStorage
{
struct alignas(TMinDataAlignBytes) MetaData
{
//! Unique id of the next data chunk.
size_t id = 0u;
//! Offset to the next meta data header, relative to m_mem.
//! To access the meta data header the offset must by aligned first.
std::uint32_t offset = 0u;
};
static constexpr std::uint32_t metaDataSize = sizeof(MetaData);
public:
SharedStorage() = default;
template<typename T>
void alloc(size_t id) const
{
// Add meta data chunk in front of the user data
m_allocdBytes = varChunkEnd<MetaData>(m_allocdBytes, sizeof(MetaData));
ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);
auto* meta = getLatestVarPtr<MetaData>();
// Allocate variable
m_allocdBytes = varChunkEnd<T>(m_allocdBytes, sizeof(T));
ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);
// Update meta data with id and offset for the allocated variable.
meta->id = id;
meta->offset = m_allocdBytes;
}
template<typename T>
void allocDynamic(size_t id, uint32_t numBytes) const
{
// Add meta data chunk in front of the user data
m_allocdBytes = varChunkEnd<MetaData>(m_allocdBytes, sizeof(MetaData));
ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);
auto* meta = getLatestVarPtr<MetaData>();
// Allocate variable
m_allocdBytes = varChunkEnd<T>(m_allocdBytes, numBytes);
ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);
// Update meta data with id and offset for the allocated variable.
meta->id = id;
meta->offset = m_allocdBytes;
}
#if BOOST_COMP_GNUC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored \
"-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
#endif
//! Give the pointer to an existing variable
//!
//! @tparam T type of the variable
//! @param id unique id of the variable
//! @return nullptr if no variable with the given id exists
template<typename T>
auto getVarPtr(size_t id) const -> T*
{
// Offset in bytes to the next unaligned meta data header behind the variable.
std::uint32_t off = 0;
// Iterate over allocated data only
while(off < m_allocdBytes)
{
// Adjust offset to be aligned
std::uint32_t const alignedMetaDataOffset
= varChunkEnd<MetaData>(off, sizeof(MetaData)) - static_cast<std::uint32_t>(sizeof(MetaData));
ALPAKA_ASSERT_ACC(
(alignedMetaDataOffset + static_cast<std::uint32_t>(sizeof(MetaData))) <= m_allocdBytes);
auto* metaDataPtr = reinterpret_cast<MetaData*>(data() + alignedMetaDataOffset);
off = metaDataPtr->offset;
if(metaDataPtr->id == id)
return reinterpret_cast<T*>(&data()[off - sizeof(T)]);
}
// Variable not found.
return nullptr;
}
//! Get last allocated variable.
template<typename T>
auto getLatestVarPtr() const -> T*
{
return reinterpret_cast<T*>(&data()[m_allocdBytes - sizeof(T)]);
}
private:
#if BOOST_COMP_GNUC
# pragma GCC diagnostic pop
#endif
uint8_t* data() const
{
return m_data.data();
}
//! Byte offset to the end of the memory chunk
//!
//! Calculate the bytes required to store a type with an aligned starting address in m_data.
//! The start offset to the origin of the user data chunk can be calculated with `result - sizeof(T)`.
//! The padding is always before the origin of the user data chunk and can be zero bytes.
//!
//! \tparam T type that should fit into the chunk
//! \param byteOffset Current byte offset.
//! \param numBytes Number of bytes to allocate, should be at least sizeof(T).
//! \result Byte offset to the end of the data chunk, relative to m_data.
template<typename T>
auto varChunkEnd(uint32_t byteOffset, uint32_t numBytes) const -> std::uint32_t
{
auto const ptr = reinterpret_cast<std::size_t>(data() + byteOffset);
constexpr size_t align = std::max(TMinDataAlignBytes, alignof(T));
std::size_t const newPtrAdress = ((ptr + align - 1u) / align) * align + numBytes;
return static_cast<uint32_t>(newPtrAdress - reinterpret_cast<std::size_t>(data()));
}
static constexpr std::uint32_t totalSharedBytes
= static_cast<std::uint32_t>(ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB << 10u);
//! Memory layout
//! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
//! Size of padding can be zero if data after padding is already aligned.
mutable std::array<uint8_t, totalSharedBytes> m_data;
//! Offset in bytes relative to m_data to the next free data area.
//! The last aligned chunk before the free area is always a meta data header.
mutable std::uint32_t m_allocdBytes = 0u;
};
} // namespace alpaka::onAcc::cpu::detail
// ==
// == ./include/alpaka/api/host/block/mem/SharedStorage.hpp ==
// ============================================================================
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace cpu
{
template<std::size_t TDataAlignBytes>
struct SingleThreadStaticShared : private detail::SharedStorage<TDataAlignBytes>
{
using Base = detail::SharedStorage<TDataAlignBytes>;
template<typename T, size_t T_unique>
T& allocVar()
{
auto* data = Base::template getVarPtr<T>(T_unique);
if(!data)
{
Base::template alloc<T>(T_unique);
data = Base::template getLatestVarPtr<T>();
}
ALPAKA_ASSERT(data != nullptr);
return *data;
}
template<typename T, size_t T_unique>
T* allocDynamic(uint32_t numBytes)
{
auto* data = Base::template getVarPtr<T>(T_unique);
if(!data)
{
Base::template allocDynamic<T>(T_unique, numBytes);
data = Base::template getLatestVarPtr<T>();
}
ALPAKA_ASSERT(data != nullptr);
return data;
}
void reset()
{
}
};
} // namespace cpu
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/api/host/block/mem/SingleThreadStaticShared.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/block/sync/NoOp.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
namespace alpaka::onAcc
{
namespace cpu
{
struct NoOp
{
constexpr void operator()() const
{
}
};
} // namespace cpu
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/api/host/block/sync/NoOp.hpp ==
// ============================================================================
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/meta/NdLoop.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/Acc.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/ThreadSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
#include <stdexcept>
// #include <tuple> // amalgamate: file already included
#if ALPAKA_OMP
namespace alpaka::onHost
{
namespace cpu
{
template<onHost::concepts::ThreadSpec T_ThreadSpec>
struct OmpBlocks
{
constexpr OmpBlocks(T_ThreadSpec threadBlocking) : m_threadBlocking{std::move(threadBlocking)}
{
}
void operator()(auto const& kernelBundle, auto const& dict) const
{
using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
if(m_threadBlocking.m_numThreads.product() != 1u)
throw std::runtime_error("Thread block extent must be 1.");
# pragma omp parallel
{
// copy from num blocks to derive correct index type
auto blockIdx = m_threadBlocking.m_numBlocks;
constexpr uint32_t simdWidth
= alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
// dynamic shared mem
uint32_t blockDynSharedMemBytes
= onHost::getDynSharedMemBytes(exec::cpuOmpBlocks, m_threadBlocking, kernelBundle);
auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
auto const blockDynSharedMemBytesEntry
= DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
/* Only add dynamic shared memory objects if defined by the user; otherwise we get a clean static
* assert if the kernel tries to access dynamic shared memory */
auto additionalDict = conditionalAppendDict<trait::HasUserDefinedDynSharedMemBytes<
exec::CpuOmpBlocks,
T_ThreadSpec,
ALPAKA_TYPEOF(kernelBundle)>::value>(
dict,
Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
auto blockCount = m_threadBlocking.m_numBlocks;
auto const blockLayerEntry = DictEntry{
layer::block,
onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(blockCount)}};
auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
auto acc = onAcc::Acc(joinDict(
Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry},
additionalDict));
using ThreadIdxType = typename NumThreadsVecType::type;
# pragma omp for nowait
for(ThreadIdxType i = 0; i < blockCount.product(); ++i)
{
blockIdx = mapToND(blockCount, i);
kernelBundle(acc);
blockSharedMem.reset();
}
}
}
T_ThreadSpec m_threadBlocking;
};
} // namespace cpu
inline auto makeAcc(exec::CpuOmpBlocks, auto const& threadBlocking)
{
return cpu::OmpBlocks(threadBlocking);
}
} // namespace alpaka::onHost
#endif
// ==
// == ./include/alpaka/api/host/exec/OmpBlocks.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/exec/Serial.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/IdxLayer.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/block/mem/SingleThreadStaticShared.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/block/sync/NoOp.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/meta/NdLoop.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/Acc.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/ThreadSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
// #include <tuple> // amalgamate: file already included
namespace alpaka::onHost
{
namespace cpu
{
template<onHost::concepts::ThreadSpec T_ThreadSpec>
struct Serial
{
using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
constexpr Serial(T_ThreadSpec threadBlocking) : m_threadBlocking{std::move(threadBlocking)}
{
}
void operator()(auto const& kernelBundle, auto const& dict) const
{
// copy from num blocks to derive correct index type
auto blockIdx = m_threadBlocking.m_numBlocks;
constexpr uint32_t simdWidth
= alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
auto const blockLayerEntry = DictEntry{
layer::block,
onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(m_threadBlocking.m_numBlocks)}};
auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
// dynamic shared mem
uint32_t blockDynSharedMemBytes
= onHost::getDynSharedMemBytes(exec::cpuSerial, m_threadBlocking, kernelBundle);
auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
auto const blockDynSharedMemBytesEntry
= DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
                /* Only add the dynamic shared memory objects if the user defined them; otherwise the kernel
                 * triggers a clean static assert if it tries to access dynamic shared memory. */
auto additionalDict = conditionalAppendDict<trait::HasUserDefinedDynSharedMemBytes<
exec::CpuSerial,
T_ThreadSpec,
ALPAKA_TYPEOF(kernelBundle)>::value>(
dict,
Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
auto acc = onAcc::Acc(joinDict(
Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry},
additionalDict));
meta::ndLoopIncIdx(
blockIdx,
m_threadBlocking.m_numBlocks,
[&](auto const&)
{
kernelBundle(acc);
acc[layer::shared].reset();
});
}
T_ThreadSpec m_threadBlocking;
};
} // namespace cpu
inline auto makeAcc(exec::CpuSerial, auto const& threadBlocking)
{
return cpu::Serial(threadBlocking);
}
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/api/host/exec/Serial.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/exec/TbbBlocks.hpp ==
// ==
/* Copyright 2024 Mehmet Yusufoglu, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/IdxLayer.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/block/mem/SingleThreadStaticShared.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/block/sync/NoOp.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/executor.hpp" // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/Acc.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/ThreadSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <cstddef> // amalgamate: file already included
// #include <stdexcept> // amalgamate: file already included
#if ALPAKA_TBB
# include <oneapi/tbb/blocked_range.h>
# include <oneapi/tbb/parallel_for.h>
# include <oneapi/tbb/task_group.h>
namespace alpaka::onHost
{
namespace cpu
{
template<onHost::concepts::ThreadSpec T_ThreadSpec>
struct TbbBlocks
{
using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
// Construct the executor with the thread blocking configuration chosen by the queue.
constexpr TbbBlocks(T_ThreadSpec threadBlocking) : m_threadBlocking(std::move(threadBlocking))
{
}
void operator()(auto const& kernelBundle, auto const& dict) const
{
if(m_threadBlocking.m_numThreads.product() != 1u)
throw std::runtime_error("Thread block extent must be 1.");
auto blockCount = m_threadBlocking.m_numBlocks;
constexpr uint32_t simdWidth
= alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
oneapi::tbb::this_task_arena::isolate(
[&]
{
using ThreadIdxType = typename NumThreadsVecType::type;
ThreadIdxType const linearNumBlocks = blockCount.product();
oneapi::tbb::parallel_for(
static_cast<ThreadIdxType>(0),
linearNumBlocks,
[&](ThreadIdxType i)
{
auto const blockIdx = mapToND(blockCount, i);
auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
// Compose the accelerator dictionary entries consumed by the kernel.
auto const blockLayerEntry = DictEntry{
layer::block,
onAcc::cpu::GenericLayer{std::cref(blockIdx), blockCount}};
auto const threadLayerEntry
= DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
// dynamic shared mem
uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(
alpaka::exec::CpuTbbBlocks{},
m_threadBlocking,
kernelBundle);
auto const blockDynSharedMemEntry
= DictEntry{layer::dynShared, std::ref(blockSharedMem)};
auto const blockDynSharedMemBytesEntry
= DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
auto additionalDict = conditionalAppendDict<trait::HasUserDefinedDynSharedMemBytes<
alpaka::exec::CpuTbbBlocks,
T_ThreadSpec,
ALPAKA_TYPEOF(kernelBundle)>::value>(
dict,
Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
auto acc = onAcc::Acc(joinDict(
Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry},
additionalDict));
kernelBundle(acc);
});
});
}
T_ThreadSpec m_threadBlocking;
};
} // namespace cpu
inline auto makeAcc(alpaka::exec::CpuTbbBlocks, auto const& threadBlocking)
{
return cpu::TbbBlocks(threadBlocking);
}
} // namespace alpaka::onHost
#endif
// ==
// == ./include/alpaka/api/host/exec/TbbBlocks.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/util.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/CVec.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/DataPitches.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <utility> // amalgamate: file already included
namespace alpaka::api::util
{
namespace detail
{
template<
std::integral auto T_limit,
std::integral auto T_index,
std::integral auto T_increment,
std::integral auto... T_idx>
consteval auto adjustToLimit(concepts::CVector auto const input, std::index_sequence<T_idx...>)
{
if constexpr(input.product() <= T_limit)
return input;
else
{
constexpr uint32_t dim = static_cast<uint32_t>(sizeof...(T_idx));
constexpr auto newValue = CVec<
typename ALPAKA_TYPEOF(input)::type,
(T_idx == T_index ? divExZero(input[T_idx], static_cast<ALPAKA_TYPEOF(T_limit)>(2))
: input[T_idx])...>{};
constexpr auto nextIncrement = dim == 1u ? 0u : T_increment;
constexpr auto nextIdx = T_index + T_increment;
if constexpr(nextIdx == dim)
{
constexpr auto nextIncrement = dim == 1u ? 0u : -1u;
return adjustToLimit < T_limit, dim == 1 ? 0 : dim - 1u,
nextIncrement > (newValue, std::index_sequence<T_idx...>{});
}
else if constexpr(nextIdx == 0u)
{
return adjustToLimit<T_limit, nextIdx, 1u>(newValue, std::index_sequence<T_idx...>{});
}
return adjustToLimit<T_limit, nextIdx, nextIncrement>(newValue, std::index_sequence<T_idx...>{});
}
}
} // namespace detail
    /** Adjust the input vector to a given limit by successively halving its components
     * until the product of all components is below or equal to the limit. */
template<std::integral auto T_limit, std::integral auto T_index, std::integral auto T_increment>
consteval auto adjustToLimit(concepts::CVector auto const input)
{
return detail::adjustToLimit<T_limit, 0u, 1u>(input, std::make_index_sequence<input.dim()>{});
}
/** adjust the input vector to a given limit by halving the largest dimension until the product of all components
* is below or equal to the limit */
inline auto adjustToLimit(concepts::Vector auto input, std::integral auto const limit)
{
using IdxType = typename ALPAKA_TYPEOF(input)::type;
constexpr uint32_t dim = input.dim();
IdxType limitValue = static_cast<IdxType>(limit);
while(input.product() > limitValue)
{
uint32_t maxIdx = 0u;
auto maxValue = input[0];
for(auto i = 0u; i < dim; ++i)
if(maxValue < input[i])
{
maxIdx = i;
maxValue = input[i];
}
if(input.product() > limitValue)
input[maxIdx] = divExZero(input[maxIdx], IdxType{2u});
}
return input;
}
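    // Illustrative sketch (not from the original header): the runtime overload above keeps halving the
    // currently largest component until the product fits the limit. With the hypothetical input
    // Vec{64u, 64u} and limit 1024 the steps are:
    //   {64, 64} -> {32, 64} -> {32, 32}   // product 1024 <= limit, loop stops
    // The exact Vec construction syntax is assumed here for illustration only.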
    /** Provides a memory description to create multidimensional, row-wise aligned memory within a
     * one-dimensional byte area.
     *
     * @param alignmentInByte data alignment in bytes
     * @param extents number of elements per dimension
     * @return tuple with the linearized data blob size in bytes and the multi-dimensional pitches,
     * std::tuple(numBytes, pitchesMd)
     */
template<typename T_ValueType, alpaka::concepts::Vector T_Extents>
inline auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
{
constexpr auto dim = T_Extents::dim();
if constexpr(dim == 1u)
{
size_t memSizeInByte = static_cast<size_t>(extents.x()) * sizeof(T_ValueType);
alpaka::concepts::Vector auto pitches = typename T_Extents::UniVec{sizeof(T_ValueType)};
return std::make_tuple(memSizeInByte, pitches);
}
else
{
using IdxType = typename T_Extents::type;
auto alignment = static_cast<IdxType>(alignmentInByte);
IdxType rowExtentInBytes = extents.x() * static_cast<IdxType>(sizeof(T_ValueType));
IdxType rowPitchInBytes = alpaka::divCeil(rowExtentInBytes, alignment) * alignment;
auto pitches = alpaka::calculatePitches<T_ValueType>(extents, rowPitchInBytes);
size_t memSizeInByte = static_cast<size_t>(pitches[0]) * static_cast<size_t>(extents[0]);
return std::make_tuple(memSizeInByte, pitches);
}
}
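    // Worked example (sketch, values are hypothetical): float elements, extents {4, 100}, alignmentInByte = 64,
    // assuming the fastest-varying extent is extents.x() and calculatePitches returns the row pitch as pitches[0]:
    //   rowExtentInBytes = 100 * sizeof(float) = 400
    //   rowPitchInBytes  = divCeil(400, 64) * 64 = 448
    //   memSizeInByte    = pitches[0] * extents[0] = 448 * 4 = 1792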
consteval uint32_t highestPowerOfTwo(uint32_t value)
{
uint32_t result = 1u;
while((result << 1u) <= value)
{
result <<= 1u;
}
return result;
}
/** Calculate the best alignment for SIMD optimized memory allocation
*
* @param api the API to use
* @param deviceKind the device kind to use
* @return the best alignment in bytes, will be a power of two value
*/
template<typename T_ValueType>
inline constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
{
constexpr uint32_t typeAlignmentBytes = alignof(T_ValueType);
constexpr uint32_t simdPackBytes
= alpaka::getArchSimdWidth<T_ValueType>(api, deviceKind) * sizeof(T_ValueType);
constexpr uint32_t bestSimdPackBytes = highestPowerOfTwo(simdPackBytes);
constexpr uint32_t alignment = std::max(bestSimdPackBytes, typeAlignmentBytes);
return alignment;
}
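    // Numeric sketch (hypothetical values): for a 4 byte element type and an architecture reporting a SIMD
    // width of 12 elements, simdPackBytes = 48, highestPowerOfTwo(48) = 32, so the returned alignment is
    // max(32, alignof(T)) = 32 bytes. The actual SIMD width comes from getArchSimdWidth for the given
    // api/deviceKind.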
} // namespace alpaka::api::util
// ==
// == ./include/alpaka/api/util.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/CallbackThread.hpp ==
// ==
/* Copyright 2022 Antonio Di Pilato
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include <cassert> // amalgamate: file already included
#include <condition_variable>
// #include <functional> // amalgamate: file already included
// #include <future> // amalgamate: file already included
// #include <iostream> // amalgamate: file already included
// #include <mutex> // amalgamate: file already included
#include <queue>
#include <thread>
namespace alpaka::core
{
class CallbackThread
{
#if ALPAKA_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wweak-vtables"
#endif
        // A custom class is used because std::function<F> requires F to be copyable, and std::packaged_task
        // provides a std::future which keeps the task alive, and we cannot control the moment the future is set.
        //! \todo with C++23 std::move_only_function should be used
struct Task
#if ALPAKA_COMP_CLANG
# pragma clang diagnostic pop
#endif
{
virtual ~Task() = default;
virtual void run() = 0;
};
template<typename Function>
struct FunctionHolder : Task
{
Function m_func;
template<typename FunctionFwd>
explicit FunctionHolder(FunctionFwd&& func) : m_func{std::forward<FunctionFwd>(func)}
{
}
void run() override
{
// if m_func throws, let it propagate
m_func();
}
};
using TaskPackage = std::pair<std::unique_ptr<Task>, std::promise<void>>;
public:
~CallbackThread()
{
{
std::unique_lock<std::mutex> lock{m_mutex};
m_stop = true;
m_cond.notify_one();
}
if(m_thread.joinable())
{
if(std::this_thread::get_id() == m_thread.get_id())
{
std::cerr << "ERROR in ~CallbackThread: thread joins itself" << std::endl;
std::abort();
}
m_thread.join();
}
}
//! It is guaranteed that the task is fully destroyed before the future's result is set.
//! @{
template<typename NullaryFunction>
auto submit(NullaryFunction&& nf) -> std::future<void>
{
using DecayedFunction = std::decay_t<NullaryFunction>;
static_assert(
std::is_void_v<std::invoke_result_t<DecayedFunction>>,
"Submitted function must not have any arguments and return void.");
            // FunctionHolder stores a copy of the user's task, but may be constructed from an expiring value to avoid
            // the copy. We do NOT store a reference to the user's task, which could dangle if the user isn't careful.
auto tp = std::pair(
std::unique_ptr<Task>(new FunctionHolder<DecayedFunction>{std::forward<NullaryFunction>(nf)}),
std::promise<void>{});
auto f = tp.second.get_future();
{
std::unique_lock<std::mutex> lock{m_mutex};
m_tasks.emplace(std::move(tp));
if(!m_thread.joinable())
startWorkerThread();
m_cond.notify_one();
}
return f;
}
//! @}
        //! @return True if the queue is empty and no task is being executed, else false.
        //! If only one task is enqueued and currently executing, that task will see the queue as not empty.
        //! During the destruction of this single enqueued task the queue is already accounted as empty.
[[nodiscard]] auto empty()
{
std::unique_lock<std::mutex> lock{m_mutex};
return m_tasks.empty();
}
private:
std::thread m_thread;
std::condition_variable m_cond;
std::mutex m_mutex;
bool m_stop{false};
std::queue<TaskPackage> m_tasks;
auto startWorkerThread() -> void
{
m_thread = std::thread(
[this]
{
while(true)
{
std::promise<void> taskPromise;
std::exception_ptr eptr;
{
// Task is destroyed before promise is updated but after the queue state is up to date.
std::unique_ptr<Task> task = nullptr;
{
std::unique_lock<std::mutex> lock{m_mutex};
m_cond.wait(lock, [this] { return m_stop || !m_tasks.empty(); });
if(m_stop && m_tasks.empty())
break;
task = std::move(m_tasks.front().first);
taskPromise = std::move(m_tasks.front().second);
}
assert(task);
try
{
task->run();
}
catch(...)
{
eptr = std::current_exception();
}
{
std::unique_lock<std::mutex> lock{m_mutex};
                            // Pop the already moved-from entry from the queue; task and promise will be destroyed
                            // later in a well-defined order.
m_tasks.pop();
}
// Task will be destroyed here, the queue status is already updated.
}
                        // In case the executed task is the last task in the queue, waiting threads will see the
                        // queue as empty.
if(eptr)
taskPromise.set_exception(std::move(eptr));
else
taskPromise.set_value();
}
});
}
};
} // namespace alpaka::core
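// Usage sketch (illustrative only, not part of the original header):
//   alpaka::core::CallbackThread worker;
//   std::future<void> done = worker.submit([] { /* nullary task returning void */ });
//   done.get(); // rethrows an exception that escaped the task
// Tasks run in FIFO order on a single worker thread that is started lazily on the first submit.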
// ==
// == ./include/alpaka/core/CallbackThread.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/core/alignedAlloc.hpp ==
// ==
/* Copyright 2022 René Widera, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include <cstddef> // amalgamate: file already included
// #include <new> // amalgamate: file already included
namespace alpaka::core
{
ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void*
{
if(size == 0u)
{
return nullptr;
}
else
{
return ::operator new(size, std::align_val_t{alignment});
}
}
ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, auto ptr)
requires(std::is_pointer_v<ALPAKA_TYPEOF(ptr)>)
{
if(ptr != nullptr)
{
::operator delete(toVoidPtr(ptr), std::align_val_t{alignment});
}
}
} // namespace alpaka::core
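// Usage sketch (illustrative only): the alignment passed to alignedFree must match the one used for
// alignedAlloc, because both forward to the aligned operator new/delete.
//   auto* p = static_cast<float*>(alpaka::core::alignedAlloc(64u, 1024u));
//   alpaka::core::alignedFree(64u, p);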
// ==
// == ./include/alpaka/core/alignedAlloc.hpp ==
// ============================================================================
// #include "alpaka/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/meta/NdLoop.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/FrameSpec.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/mem/SharedBuffer.hpp ==
// ==
/* Copyright 2024 René Widera, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/mem/View.hpp ==
// ==
/* Copyright 2024 Bernhard Manfred Gruber, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/concepts/api.hpp" // amalgamate: file already inlined
// #include "alpaka/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/BoundaryIter.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/MdSpan.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/interface.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
namespace alpaka
{
    /** @brief Non-owning view to data
     *
     * This view only holds a pointer to the real data, so copying the view is cheap.
     * Const-ness of the view instance is propagated to the data region.
     *
     * This satisfies the alpaka::concepts::IView concept and, therefore, also the alpaka::concepts::IMdSpan concept.
     */
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment = Alignment<>>
struct View;
template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
inline constexpr auto makeView(
auto&& anyWithApi,
T_ValueType* pointer,
concepts::Vector auto const& extents,
T_MemAlignment const memAlignment = T_MemAlignment{})
{
auto pitchMd = alpaka::calculatePitchesFromExtents<T_ValueType>(extents);
return View{getApi(ALPAKA_FORWARD(anyWithApi)), pointer, extents, pitchMd, memAlignment};
}
template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
inline constexpr auto makeView(
auto&& anyWithApi,
T_ValueType* pointer,
concepts::Vector auto const& extents,
concepts::Vector auto const& pitches,
T_MemAlignment const memAlignment = T_MemAlignment{})
{
static_assert(std::is_same_v<ALPAKA_TYPEOF(extents), ALPAKA_TYPEOF(pitches)>);
return View{getApi(ALPAKA_FORWARD(anyWithApi)), pointer, extents, pitches, memAlignment};
}
inline constexpr auto makeView(auto&& any)
{
return View{
getApi(ALPAKA_FORWARD(any)),
onHost::data(ALPAKA_FORWARD(any)),
onHost::getExtents(ALPAKA_FORWARD(any)),
onHost::getPitches(ALPAKA_FORWARD(any)),
alpaka::getAlignment(ALPAKA_FORWARD(any))};
}
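    // Usage sketch (illustrative; the Vec construction and passing api::host as the API carrier are
    // assumptions for this example, rows/cols are hypothetical values):
    //   std::vector<float> host(rows * cols);
    //   auto view = alpaka::makeView(api::host, host.data(), alpaka::Vec{rows, cols});
    // The overload without pitches derives compact pitches via calculatePitchesFromExtents.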
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct View : MdSpan<T_Type, typename T_Extents::UniVec, typename T_Extents::UniVec, T_MemAlignment>
{
private:
using BaseMdSpan = MdSpan<T_Type, typename T_Extents::UniVec, typename T_Extents::UniVec, T_MemAlignment>;
public:
/** Creates a view
*
* @param data handle to the physical data
* @param extents n-dimensional extents in elements of the view. Must satisfy `n <= number_of_elements` in the
* data handle.
*/
template<
alpaka::concepts::HasApi T_Any,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches>
constexpr View(
T_Any const& any,
T_Type* data,
T_UserExtents const& extents,
T_UserPitches const& pitches,
T_MemAlignment const memAlignment = T_MemAlignment{})
: BaseMdSpan{
data,
typename T_UserExtents::UniVec{extents},
typename T_UserPitches::UniVec{pitches},
memAlignment}
{
static_assert(
isLosslesslyConvertible_v<typename T_UserPitches::type, typename T_UserExtents::type>,
"extent type and pitch type must be lossless convertible");
}
template<typename T_Type_Other>
requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
constexpr View(View<T_Api, T_Type_Other, T_Extents, T_MemAlignment> const& other)
: BaseMdSpan{static_cast<BaseMdSpan>(other)}
{
}
constexpr View(View const&) = default;
template<typename T_Type_Other>
requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
constexpr View(View<T_Api, T_Type_Other, T_Extents, T_MemAlignment>&& other)
: BaseMdSpan{std::move(static_cast<BaseMdSpan>(other))}
{
}
constexpr View(View&&) = default;
constexpr View& operator=(View const&) = default;
constexpr View& operator=(View&&) = default;
static consteval T_Api getApi()
{
return T_Api{};
}
constexpr alpaka::concepts::MdSpan auto getMdSpan() const
{
return BaseMdSpan::getConstMdSpan();
}
constexpr alpaka::concepts::MdSpan auto getMdSpan()
{
return BaseMdSpan{*this};
}
/** create a read only view */
constexpr auto getConstView() const
{
using ConstValueType = std::add_const_t<typename BaseMdSpan::value_type>;
return View<T_Api, ConstValueType, T_Extents, T_MemAlignment>{
T_Api{},
static_cast<ConstValueType*>(this->data()),
this->getExtents(),
this->getPitches(),
T_MemAlignment{}};
}
/** Creates a sub view to a part of the memory.
*
* @param extents number of elements for each dimension
* @return View which is pointing only to a part of the original view.
*/
constexpr auto getSubView(alpaka::concepts::VectorOrScalar auto const& extents) const
{
Vec extentMd = extents;
assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
return makeView(T_Api{}, this->data(), extentMd, this->getPitches(), T_MemAlignment{});
}
constexpr auto getSubView(alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec extentMd = extents;
assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
return makeView(T_Api{}, this->data(), extentMd, this->getPitches(), T_MemAlignment{});
}
/** Creates a sub view to a part of the memory.
*
* @param offset offset in elements to the original view
* @param extents number of elements for each dimension
         * @return View which points only to a part of the original view with a shifted origin pointer.
         * The alignment of the sub view is reduced to the element alignment.
*/
constexpr auto getSubView(
alpaka::concepts::VectorOrScalar auto const& offset,
alpaka::concepts::VectorOrScalar auto const& extents) const
{
Vec offsetMd = offset;
Vec extentMd = extents;
assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
auto shiftedPtr = &(*this)[offsetMd];
return makeView(T_Api{}, shiftedPtr, extentMd, this->getPitches(), Alignment<>{});
}
constexpr auto getSubView(
alpaka::concepts::VectorOrScalar auto const& offset,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec offsetMd = offset;
Vec extentMd = extents;
assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
auto shiftedPtr = &(*this)[offsetMd];
return makeView(T_Api{}, shiftedPtr, extentMd, this->getPitches(), Alignment<>{});
}
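        // Usage sketch (illustrative; the offset/extents values are hypothetical):
        //   auto inner = view.getSubView(Vec{1u, 1u}, view.getExtents() - Vec{2u, 2u});
        // The origin pointer is shifted by the offset, the parent pitches are kept and the alignment
        // guarantee drops to the plain element alignment.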
template<alpaka::concepts::Vector LowHaloVecType, alpaka::concepts::Vector UpHaloVecType>
constexpr auto getSubView(
alpaka::BoundaryDirection<View::dim(), LowHaloVecType, UpHaloVecType> boundaryDir) const
{
constexpr uint32_t dim = View::dim();
auto offset = alpaka::Vec<uint32_t, dim>{};
auto extents = alpaka::Vec<uint32_t, dim>{};
for(uint32_t i = 0; i < dim; ++i)
{
switch(boundaryDir.data[i])
{
case BoundaryType::LOWER:
offset[i] = 0;
extents[i] = boundaryDir.lowerHaloSize[i];
break;
case BoundaryType::UPPER:
offset[i] = this->getExtents()[i] - boundaryDir.upperHaloSize[i];
extents[i] = boundaryDir.upperHaloSize[i];
break;
case BoundaryType::MIDDLE:
offset[i] = boundaryDir.lowerHaloSize[i];
extents[i] = this->getExtents()[i] - boundaryDir.lowerHaloSize[i] - boundaryDir.upperHaloSize[i];
break;
default:
throw std::invalid_argument("invalid direction");
}
}
return getSubView(offset, extents);
}
};
template<typename T_Api, typename T_Type, concepts::Vector T_Extents, concepts::Alignment T_MemAlignment>
std::ostream& operator<<(std::ostream& s, View<T_Api, T_Type, T_Extents, T_MemAlignment> const& view)
{
return s << "View{ dim=" << ALPAKA_TYPEOF(view)::dim() << ", api= " << onHost::getName(T_Api{})
<< ", extents=" << view.getExtents().toString() << ", pitches=" << view.getPitches().toString()
<< " , alignment=" << T_MemAlignment::template get<T_Type>() << " }";
}
template<
alpaka::concepts::HasApi T_Any,
typename T_Type,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches,
alpaka::concepts::Alignment T_MemAlignment>
ALPAKA_FN_HOST_ACC View(
T_Any const&,
T_Type*,
T_UserExtents const&,
T_UserPitches const&,
T_MemAlignment const memAlignment)
-> View<ALPAKA_TYPEOF(getApi(std::declval<T_Any>())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment>;
template<
alpaka::concepts::HasApi T_Any,
typename T_Type,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches>
ALPAKA_FN_HOST_ACC View(T_Any, T_Type*, T_UserExtents const&, T_UserPitches const&)
-> View<ALPAKA_TYPEOF(getApi(std::declval<T_Any>())), T_Type, typename T_UserPitches::UniVec, Alignment<>>;
namespace trait
{
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, alpaka::View>)
struct IsMdSpan<T> : std::true_type
{
};
} // namespace trait
} // namespace alpaka
namespace alpaka::internal
{
// externally define the API trait to support constexpr evaluation
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct GetApi::Op<alpaka::View<T_Api, T_Type, T_Extents, T_MemAlignment>>
{
inline constexpr auto operator()(auto&& data) const
{
return T_Api{};
}
};
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct CopyConstructableDataSource<View<T_Api, T_Type, T_Extents, T_MemAlignment>> : std::true_type
{
using InnerMutable = View<T_Api, std::remove_const_t<T_Type>, T_Extents, T_MemAlignment>;
using InnerConst = View<T_Api, std::add_const_t<T_Type>, T_Extents, T_MemAlignment>;
};
} // namespace alpaka::internal
// ==
// == ./include/alpaka/mem/View.hpp ==
// ============================================================================
// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp" // amalgamate: file already inlined
// #include "alpaka/mem/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Device.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onHost/mem/MangedDealloc.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <functional> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
namespace alpaka::onHost::internal
{
/** Manage the deallocation of memory
*
* This class is used to manage the deallocation of memory in a shared_ptr.
* It takes a function that will be called when the shared_ptr is destroyed.
* This is useful for managing memory that needs to be deallocated
* when the shared_ptr goes out of scope.
*/
struct ManagedDealloc : std::enable_shared_from_this<ManagedDealloc>
{
/**
* Constructor
* @param freeOp Function to be called when the shared_ptr is destroyed after all actions are executed.
         * All dependencies required to deallocate the memory must be held by freeOp.
*/
ManagedDealloc(std::function<void()> freeOp) : freeOp{std::move(freeOp)}
{
}
~ManagedDealloc()
{
// Execute all actions before freeing the memory
{
std::lock_guard<std::mutex> lock{actionGuard};
for(auto& action : actions)
{
action();
}
}
freeOp();
}
/** Add an action to be executed when the shared_ptr is destroyed.
*
* @param action Callable to execute on destruction.
*/
void addAction(std::function<void()> action)
{
std::lock_guard<std::mutex> lock{actionGuard};
actions.emplace_back(std::move(action));
}
std::shared_ptr<ManagedDealloc> getSharedPtr()
{
return this->shared_from_this();
}
private:
std::function<void()> freeOp;
std::mutex actionGuard;
std::vector<std::function<void()>> actions;
};
} // namespace alpaka::onHost::internal
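// Usage sketch (illustrative only, ptr and queue are hypothetical names): actions registered via addAction
// run before the free operation when the last shared_ptr owner goes away.
//   auto dealloc = std::make_shared<alpaka::onHost::internal::ManagedDealloc>(
//       [ptr] { alpaka::core::alignedFree(64u, ptr); });
//   dealloc->addAction([queue] { alpaka::onHost::wait(queue); });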
// ==
// == ./include/alpaka/onHost/mem/MangedDealloc.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
// #include <functional> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
    /** Lifetime-managed buffer with contiguous data
     *
     * This buffer owns the data and will deallocate it when the last copy is destroyed.
     * Const-ness of the buffer instance is propagated to the data region.
     * A copy of this instance only performs a shallow copy; to perform a deep copy that duplicates the data you
     * should use @c onHost::memcpy.
     */
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment = Alignment<>>
struct SharedBuffer : View<T_Api, T_Type, T_Extents, T_MemAlignment>
{
private:
using BaseView = View<T_Api, T_Type, T_Extents, T_MemAlignment>;
/** Constructor with existing managed deleter */
SharedBuffer(
T_Api const api,
T_Type* data,
T_Extents const& extents,
T_Extents const& pitches,
std::shared_ptr<internal::ManagedDealloc> managedDeleter,
T_MemAlignment const memAlignment)
: BaseView{api, data, extents, pitches, memAlignment}
, m_deleter{std::move(managedDeleter)}
{
}
        // friend declaration is required so that any instantiation of SharedBuffer can access the private constructor
template<
alpaka::concepts::Api T_OtherApi,
typename T_OtherType,
alpaka::concepts::Vector T_OtherExtents,
alpaka::concepts::Alignment T_OtherMemAlignment2>
friend struct SharedBuffer;
template<
alpaka::concepts::Api T_OtherApi,
typename T_OtherType,
alpaka::concepts::Vector T_OtherExtents,
alpaka::concepts::Alignment T_OtherMemAlignment2>
friend std::ostream& operator<<(
std::ostream& s,
SharedBuffer<T_OtherApi, T_OtherType, T_OtherExtents, T_OtherMemAlignment2> const& buffer);
public:
template<
alpaka::concepts::HasApi T_Any,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches>
SharedBuffer(
T_Any const& any,
T_Type* data,
T_UserExtents const& extents,
T_UserPitches const& pitches,
std::invocable<> auto deleter,
T_MemAlignment const memAlignment = Alignment{})
: BaseView{any, data, extents, pitches, memAlignment}
, m_deleter{std::make_shared<internal::ManagedDealloc>(deleter)}
{
static_assert(
isLosslesslyConvertible_v<typename T_UserPitches::type, typename T_UserExtents::type>,
"extent type and pitch type must be lossless convertible");
}
template<typename T_Type_Other>
requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
SharedBuffer(SharedBuffer<T_Api, T_Type_Other, T_Extents, T_MemAlignment> const& other)
: BaseView{static_cast<BaseView>(other)}
, m_deleter(other.m_deleter)
{
}
SharedBuffer(SharedBuffer const&) = default;
SharedBuffer& operator=(SharedBuffer const& otherSharedBuffer)
{
*this = otherSharedBuffer.getConstSharedBuffer();
return *this;
}
template<typename T_Type_Other>
requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
SharedBuffer(SharedBuffer<T_Api, T_Type_Other, T_Extents, T_MemAlignment>&& other)
: BaseView{std::move(static_cast<BaseView>(other))}
, m_deleter(std::move(other.m_deleter))
{
}
SharedBuffer(SharedBuffer&&) = default;
SharedBuffer& operator=(SharedBuffer&&) = default;
auto getView() const
{
return BaseView::getConstView();
}
auto getView()
{
return static_cast<BaseView>(*this);
}
        /** create a read-only shared buffer */
auto getConstSharedBuffer() const
{
using ConstValueType = std::add_const_t<typename BaseView::value_type>;
return SharedBuffer<T_Api, ConstValueType, T_Extents, T_MemAlignment>(
T_Api{},
static_cast<ConstValueType*>(this->data()),
this->getExtents(),
this->getPitches(),
m_deleter,
T_MemAlignment{});
}
/** Creates a buffer pointing to a part of the memory.
*
* @param extents number of elements for each dimension
* @return shared buffer which is pointing only to a part of the original buffer.
*/
auto getSubSharedBuffer(alpaka::concepts::VectorOrScalar auto const& extents) const
{
Vec extentMd = extents;
assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
return SharedBuffer<T_Api, std::remove_pointer_t<ALPAKA_TYPEOF(this->data())>, T_Extents, T_MemAlignment>{
T_Api{},
this->data(),
extentMd,
this->getPitches(),
m_deleter,
T_MemAlignment{}};
}
auto getSubSharedBuffer(alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec extentMd = extents;
assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
return SharedBuffer<T_Api, std::remove_pointer_t<ALPAKA_TYPEOF(this->data())>, T_Extents, T_MemAlignment>{
T_Api{},
this->data(),
extentMd,
this->getPitches(),
m_deleter,
T_MemAlignment{}};
}
/** Creates a shared sub-buffer view to a part of the memory.
*
* @param offsets offset in elements to the original buffer
* @param extents number of elements for each dimension
         * @return Buffer which points only to a part of the original buffer with a shifted origin pointer.
         * The alignment of the sub buffer is reduced to the element alignment.
*/
auto getSubSharedBuffer(
alpaka::concepts::VectorOrScalar auto const& offsets,
alpaka::concepts::VectorOrScalar auto const& extents) const
{
Vec offsetMd = offsets;
Vec extentMd = extents;
assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
auto shiftedPtr = &(*this)[offsetMd];
return SharedBuffer<T_Api, std::remove_pointer_t<ALPAKA_TYPEOF(shiftedPtr)>, T_Extents, Alignment<>>{
T_Api{},
shiftedPtr,
extentMd,
this->getPitches(),
m_deleter,
Alignment<>{}};
}
auto getSubSharedBuffer(
alpaka::concepts::VectorOrScalar auto const& offsets,
alpaka::concepts::VectorOrScalar auto const& extents)
{
Vec offsetMd = offsets;
Vec extentMd = extents;
assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
auto shiftedPtr = &(*this)[offsetMd];
return SharedBuffer<T_Api, std::remove_pointer_t<ALPAKA_TYPEOF(shiftedPtr)>, T_Extents, Alignment<>>{
T_Api{},
shiftedPtr,
extentMd,
this->getPitches(),
m_deleter,
Alignment<>{}};
}
/** Adds a destructor action to the shared buffer
*
* The action will be executed when the buffer is destroyed.
* This can be used to add additional cleanup actions e.g. waiting on a specific queue.
* Actions are executed in FIFO order.
*
* @param action callable to execute on destruction
*/
void addDestructorAction(std::function<void()>&& action)
{
m_deleter->addAction(ALPAKA_FORWARD(action));
}
        /** Register a destructor action that waits for the given object when the buffer is destroyed.
         *
         * @param any waitable object (e.g. a queue or event) passed to @c onHost::wait on destruction
         */
void destructorWaitFor(auto const& any)
{
addDestructorAction([any]() { onHost::wait(any); });
}
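        // Usage sketch (illustrative): keep the memory valid until all work enqueued in a queue has finished,
        // even if the last buffer handle on the host is dropped earlier.
        //   buffer.destructorWaitFor(queue); // shorthand for addDestructorAction + onHost::wait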
        /** Return the number of SharedBuffers which point to the same memory */
[[nodiscard]] constexpr long getUseCount() const noexcept
{
return m_deleter.use_count();
}
/** True if SharedBuffer is pointing to valid memory. */
[[nodiscard]] constexpr explicit operator bool() const noexcept
{
return static_cast<bool>(m_deleter);
}
private:
        /** @todo move this to traits or somewhere else so that it can be used everywhere */
template<alpaka::concepts::Pointer T>
using ConstPtr_t = std::add_pointer_t<std::add_const_t<std::remove_pointer_t<T>>>;
std::shared_ptr<internal::ManagedDealloc> m_deleter;
    }; // struct SharedBuffer
template<
alpaka::concepts::HasApi T_Any,
typename T_Type,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches,
alpaka::concepts::Alignment T_MemAlignment>
SharedBuffer(
T_Any const&,
T_Type*,
T_UserExtents const&,
T_UserPitches const&,
std::invocable<> auto,
T_MemAlignment const)
-> SharedBuffer<
ALPAKA_TYPEOF(getApi(std::declval<T_Any>())),
T_Type,
typename T_UserPitches::UniVec,
T_MemAlignment>;
template<
alpaka::concepts::HasApi T_Any,
typename T_Type,
alpaka::concepts::Vector T_UserExtents,
alpaka::concepts::Vector T_UserPitches>
SharedBuffer(T_Any const&, T_Type*, T_UserExtents const&, T_UserPitches const&, std::invocable<> auto)
-> SharedBuffer<
ALPAKA_TYPEOF(getApi(std::declval<T_Any>())),
T_Type,
typename T_UserPitches::UniVec,
Alignment<>>;
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct MakeAccessibleOnAcc::Op<SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>>
{
auto operator()(auto&& any) const
{
return any.getView();
}
};
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
std::ostream& operator<<(std::ostream& s, SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment> const& buff)
{
return s << "SharedBuffer{ dim=" << ALPAKA_TYPEOF(buff)::dim() << ", api= " << onHost::getName(T_Api{})
<< ", extents=" << buff.getExtents().toString() << ", pitches=" << buff.getPitches().toString()
<< " , alignment=" << T_MemAlignment::template get<T_Type>() << " }";
}
} // namespace alpaka::onHost
namespace alpaka::internal
{
    // externally define the API trait to support constexpr evaluation
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct GetApi::Op<onHost::SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>>
{
inline constexpr auto operator()(auto&& data) const
{
return T_Api{};
}
};
template<
alpaka::concepts::Api T_Api,
typename T_Type,
alpaka::concepts::Vector T_Extents,
alpaka::concepts::Alignment T_MemAlignment>
struct CopyConstructableDataSource<onHost::SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>> : std::true_type
{
using InnerMutable = onHost::SharedBuffer<T_Api, std::remove_const_t<T_Type>, T_Extents, T_MemAlignment>;
using InnerConst = onHost::SharedBuffer<T_Api, std::add_const_t<T_Type>, T_Extents, T_MemAlignment>;
};
} // namespace alpaka::internal
namespace alpaka::trait
{
template<typename T>
requires(isSpecializationOf_v<std::remove_cvref_t<T>, alpaka::onHost::SharedBuffer>)
struct IsMdSpan<T> : std::true_type
{
};
} // namespace alpaka::trait
// ==
// == ./include/alpaka/onHost/mem/SharedBuffer.hpp ==
// ============================================================================
// #include <cstdint> // amalgamate: file already included
// #include <cstring> // amalgamate: file already included
// #include <future> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace cpu
{
template<typename T_Device>
struct Queue : std::enable_shared_from_this<Queue<T_Device>>
{
public:
Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, bool isBlocking)
: m_device(std::move(device))
, m_idx(idx)
, m_isBlocking(isBlocking)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
}
~Queue()
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
internal::wait(*this);
}
Queue(Queue const&) = delete;
Queue& operator=(Queue const&) = delete;
Queue(Queue&&) = delete;
Queue& operator=(Queue&&) = delete;
bool operator==(Queue const& other) const
{
return m_idx == other.m_idx && m_device == other.m_device;
}
bool operator!=(Queue const& other) const
{
return !(*this == other);
}
private:
void _()
{
static_assert(internal::concepts::Queue<Queue>);
}
Handle<T_Device> m_device;
uint32_t m_idx = 0u;
core::CallbackThread m_workerThread;
bool m_isBlocking{false};
            /** Mutex to ensure sequential execution of tasks and operations if the queue is blocking.
             *
             * For a non-blocking queue @c m_workerThread takes care of the execution order.
             */
std::mutex m_mutex;
/** Submit a task to the queue.
*
* Centralizes blocking / non-blocking behavior within the method to keep other code as easy as possible.
             * For a blocking queue this method does NOT give control back to the caller until the operation has
             * been processed.
             * All internal calls should use this method and not enqueue tasks directly into @c m_workerThread.
*/
template<typename T_Fn>
auto submit(T_Fn&& fn)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
if(m_isBlocking)
{
std::lock_guard<std::mutex> lk(m_mutex);
fn();
                    // silence tsan warnings: the promise is fulfilled directly and only an already ready future is
                    // returned, so there cannot be a data race in between.
#if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wtsan"
#endif
// return a ready future-like placeholder; reuse CallbackThread interface minimally
std::promise<void> p;
p.set_value();
#if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic pop
#endif
auto f = p.get_future();
                    // to keep the interface uniform with the non-blocking case,
                    // return f by moving it since std::future is move-only
return f;
}
// enqueue the task into the worker thread, callers can wait/chain later.
return m_workerThread.submit(std::forward<T_Fn>(fn));
}
friend struct alpaka::internal::GetName;
std::string getName() const
{
return std::string("host::Queue id=") + std::to_string(m_idx);
}
friend struct internal::GetNativeHandle;
[[nodiscard]] auto getNativeHandle() const noexcept
{
return m_idx;
}
friend struct internal::Enqueue;
template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
void enqueue(
auto const executor,
ThreadSpec<T_NumBlocks, T_NumThreads> const& threadBlocking,
auto const& kernelBundle)
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
auto deviceKind = alpaka::getDeviceKind(m_device);
submit(
[kernelBundle, executor, threadBlocking, deviceKind]()
{
auto moreLayer = Dict{
DictEntry(object::api, api::host),
DictEntry(object::deviceKind, deviceKind),
DictEntry(object::exec, executor)};
onAcc::Acc acc = makeAcc(executor, threadBlocking);
acc(kernelBundle, moreLayer);
});
}
template<
alpaka::concepts::Executor T_Executor,
alpaka::concepts::Vector T_NumFrames,
alpaka::concepts::Vector T_FrameExtents,
alpaka::concepts::Vector T_ThreadExtents>
void enqueue(
T_Executor const executor,
FrameSpec<T_NumFrames, T_FrameExtents, T_ThreadExtents> const& frameSpec,
auto const& kernelBundle)
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
auto threadBlocking = internal::adjustThreadSpec(*m_device.get(), executor, frameSpec, kernelBundle);
auto deviceKind = alpaka::getDeviceKind(m_device);
submit(
[kernelBundle, executor, threadBlocking, deviceKind, frameSpec]()
{
auto moreLayer = Dict{
DictEntry(frame::count, frameSpec.m_numFrames),
DictEntry(frame::extent, frameSpec.m_frameExtent),
DictEntry(object::api, api::host),
DictEntry(object::deviceKind, deviceKind),
DictEntry(object::exec, executor)};
onAcc::Acc acc = makeAcc(executor, threadBlocking);
acc(kernelBundle, moreLayer);
});
}
/** execute a task in the queue
*
             * @attention Do NOT enqueue a task which internally captures the queue to keep it alive as a
             * dependency. In that case the destructor of the queue would never be called.
*/
void enqueue(auto const& task)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
submit([task]() { task(); });
}
friend struct alpaka::internal::GetDeviceType;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_device.get());
}
auto getDevice() const
{
return m_device;
}
std::shared_ptr<Queue> getSharedPtr()
{
return this->shared_from_this();
}
friend struct onHost::internal::GetDevice;
friend struct internal::Wait;
friend struct internal::WaitFor;
friend struct internal::Memcpy;
friend struct internal::Memset;
friend struct alpaka::internal::GetApi;
friend struct internal::AllocDeferred;
};
} // namespace cpu
namespace internal
{
template<typename T_Device>
struct Wait::Op<cpu::Queue<T_Device>>
{
void operator()(cpu::Queue<T_Device>& queue) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
// enqueue an empty task as marker and wait for the future
queue.submit([]() {}).wait();
}
};
template<typename T_Device, typename T_Event>
struct Enqueue::Event<cpu::Queue<T_Device>, T_Event>
{
void operator()(cpu::Queue<T_Device>& queue, T_Event& event) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
                // open a scope to avoid logging while we hold the lock of this class
{
// Setting the event state (e.g. the future) and enqueuing it has to be atomic.
std::lock_guard<std::mutex> lk(event.m_mutex);
++event.m_enqueueCount;
auto const enqueueCount = event.m_enqueueCount;
                    /* In case the queue is blocking we cannot use queue.submit() because we already hold the lock.
                     * The blocking queue executes the lambda directly, which would create a deadlock.
                     */
if(queue.m_isBlocking)
{
// Nothing to do if it has been re-enqueued to a later position in the queue.
if(enqueueCount == event.m_enqueueCount)
{
event.m_LastReadyEnqueueCount = std::max(enqueueCount, event.m_LastReadyEnqueueCount);
}
                        // assign an already fulfilled future
std::promise<void> p;
p.set_value();
event.m_future = p.get_future();
}
else
{
auto sharedEvent = event.getSharedPtr();
// Enqueue a task that only resets the events flag if it is completed.
event.m_future = queue.submit(
[sharedEvent, enqueueCount]() mutable
{
std::unique_lock<std::mutex> lk2(sharedEvent->m_mutex);
// Nothing to do if it has been re-enqueued to a later position in the queue.
if(enqueueCount == sharedEvent->m_enqueueCount)
{
sharedEvent->m_LastReadyEnqueueCount
= std::max(enqueueCount, sharedEvent->m_LastReadyEnqueueCount);
}
});
}
}
}
};
template<typename T_Device, typename T_Event>
struct WaitFor::Op<cpu::Queue<T_Device>, T_Event>
{
void operator()(cpu::Queue<T_Device>& queue, cpu::Event<T_Device>& event) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
                // open a scope to avoid logging while we hold the lock of this class
{
// Setting the event state and enqueuing it has to be atomic.
std::unique_lock<std::mutex> lk(event.m_mutex);
if(!event.isReady())
{
                        /* In case the queue is blocking we cannot use queue.submit() because we already hold the
                         * lock. The blocking queue executes the lambda directly, which would create a deadlock.
                         */
if(queue.m_isBlocking)
{
std::shared_future sFuture = event.m_future;
lk.unlock();
sFuture.get();
}
else
{
auto sharedEvent = event.getSharedPtr();
auto oldFuture = event.m_future;
                            // unlock here to avoid keeping the lock during the potentially expensive enqueue of the task
lk.unlock();
// Enqueue a task that waits for the given future of the event.
queue.submit([sharedEvent, oldFuture]() { oldFuture.get(); });
}
}
}
}
};
template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
struct Memcpy::Op<cpu::Queue<T_Device>, T_Dest, T_Source, T_Extents>
{
void operator()(cpu::Queue<T_Device>& queue, auto&& dest, T_Source const& source, T_Extents const& extents)
const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
                /* Get all required properties outside the lambda function to not extend the life-time of the data.
                 * The life-time is not extended so that the life-time behaviour is the same with all backends.
                 */
auto* destPtr = toVoidPtr(alpaka::onHost::data(ALPAKA_FORWARD(dest)));
auto const* srcPtr = toVoidPtr(alpaka::onHost::data(source));
if constexpr(dim == 1u)
{
queue.submit(
[extents, destPtr, srcPtr]()
{
std::memcpy(destPtr, srcPtr, extents.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>));
});
}
else
{
                    // memcpy is implemented as a row-wise copy, therefore the last dimension is not required
auto destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
auto sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();
queue.submit(
[extents, destPtr, srcPtr, destPitchBytesWithoutColumn, sourcePitchBytesWithoutColumn]()
{
auto const dstExtentWithoutColumn = extents.eraseBack();
if(static_cast<std::size_t>(extents.product()) != 0u)
{
meta::ndLoopIncIdx(
dstExtentWithoutColumn,
[&](auto const& idx)
{
std::memcpy(
reinterpret_cast<std::uint8_t*>(destPtr)
+ (idx * destPitchBytesWithoutColumn).sum(),
reinterpret_cast<std::uint8_t const*>(srcPtr)
+ (idx * sourcePitchBytesWithoutColumn).sum(),
static_cast<size_t>(extents.back())
* sizeof(alpaka::trait::GetValueType_t<T_Dest>));
});
}
});
}
}
};
template<typename T_Device, typename T_Dest, typename T_Extents>
struct Memset::Op<cpu::Queue<T_Device>, T_Dest, T_Extents>
{
            /** @attention Do not use `requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>` here, otherwise gcc 11.X
             * (tested 11.4 and 11.3) runs into an internal compiler segfault during the evaluation of the
             * constraints */
void operator()(cpu::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
const requires(std::is_same_v<ALPAKA_TYPEOF(dest), T_Dest>)
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
void* destPtr = static_cast<void*>(alpaka::onHost::data(dest));
if constexpr(dim == 1u)
{
queue.submit(
[extents, destPtr, byteValue]()
{
std::memset(
destPtr,
byteValue,
extents.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>));
});
}
else
{
                    // memset is implemented as a row-wise memset, therefore the last dimension is not required
auto destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
queue.submit(
[extents, destPtr, destPitchBytesWithoutColumn, byteValue]()
{
auto const dstExtentWithoutColumn = extents.eraseBack();
if(static_cast<std::size_t>(extents.product()) != 0u)
{
meta::ndLoopIncIdx(
dstExtentWithoutColumn,
[&](auto const& idx)
{
std::memset(
reinterpret_cast<std::uint8_t*>(destPtr)
+ (idx * destPitchBytesWithoutColumn).sum(),
byteValue,
static_cast<size_t>(extents.back())
* sizeof(alpaka::trait::GetValueType_t<T_Dest>));
});
}
});
}
}
};
template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
struct Fill::Op<cpu::Queue<T_Device>, T_Dest, T_Value, T_Extents>
{
void operator()(cpu::Queue<T_Device>& queue, auto&& dest, T_Value elementValue, T_Extents const& extents)
const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
&& std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
auto executors = supportedExecutors(getDevice(queue), exec::allExecutors);
                // avoid passing a SharedBuffer along and convert non-alpaka data views into an alpaka view
alpaka::concepts::MdSpan<T_Value> auto dataView = makeView(dest);
alpaka::internal::generic::fill(
queue,
std::get<0>(executors),
dataView.getSubView(extents),
elementValue);
}
};
        /** The code is a copy of the Alloc::Op with the difference that the deallocation is enqueued into a
         * queue (which is kept alive until the memory is freed)
         */
template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
struct AllocDeferred::Op<T_Type, cpu::Queue<T_Device>, T_Extents>
{
static consteval uint32_t highestPowerOfTwo(uint32_t value)
{
uint32_t result = 1u;
while((result << 1u) <= value)
{
result <<= 1u;
}
return result;
}
auto operator()(cpu::Queue<T_Device>& queue, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
auto device = queue.getDevice();
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
auto queueDependency = queue.getSharedPtr();
T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, memSizeInByte));
                // queueDependency is captured to keep the queue (and thereby its device) alive until the memory is
                // freed
auto deleter = [ptr, queueDep = std::move(queueDependency)]()
{ queueDep.get()->submit([ptr]() { alpaka::core::alignedFree(alignment, ptr); }); };
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
ALPAKA_LOG_INFO(
onHost::logger::memory + onHost::logger::queue,
[&]()
{
std::stringstream ss;
ss << sharedBuffer;
return ss.str();
});
return sharedBuffer;
}
};
} // namespace internal
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Device>
struct GetApi::Op<onHost::cpu::Queue<T_Device>>
{
inline constexpr auto operator()(auto&& queue) const
{
return alpaka::getApi(queue.m_device);
}
};
} // namespace alpaka::internal
// ==
// == ./include/alpaka/api/host/Queue.hpp ==
// ============================================================================
// #include "alpaka/api/util.hpp" // amalgamate: file already inlined
// #include "alpaka/core/alignedAlloc.hpp" // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Device.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/DeviceProperties.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/mem/SharedBuffer.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/utility.hpp" // amalgamate: file already inlined
// #include <cstdint> // amalgamate: file already included
// #include <memory> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace cpu
{
template<typename T_Platform>
struct Device : std::enable_shared_from_this<Device<T_Platform>>
{
public:
Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx)
: m_platform(std::move(platform))
, m_idx(idx)
, m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
{
ALPAKA_LOG_FUNCTION(onHost::logger::device);
m_properties.m_name += " id=" + std::to_string(m_idx);
}
~Device()
{
ALPAKA_LOG_FUNCTION(onHost::logger::device);
}
Device(Device const&) = delete;
Device& operator=(Device const&) = delete;
Device(Device&&) = delete;
Device& operator=(Device&&) = delete;
bool operator==(Device const& other) const
{
return m_idx == other.m_idx;
}
bool operator!=(Device const& other) const
{
return m_idx != other.m_idx;
}
void wait()
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
// Host device synchronization - wait on all queues associated with this device.
// IMPORTANT: Do not hold queuesGuard across potentially long waits; copy weak refs first.
std::vector<std::weak_ptr<cpu::Queue<Device>>> tmpQueues;
{
std::lock_guard<std::mutex> lk{queuesGuard};
tmpQueues = queues; // copy weak_ptr list
}
for(auto& weakQueue : tmpQueues)
{
if(auto queue = weakQueue.lock())
{
internal::wait(*queue);
}
}
}
private:
void _()
{
static_assert(internal::concepts::Device<Device>);
}
Handle<T_Platform> m_platform;
uint32_t m_idx = 0u;
DeviceProperties m_properties;
std::vector<std::weak_ptr<cpu::Queue<Device>>> queues;
std::vector<std::weak_ptr<cpu::Event<Device>>> events;
std::mutex queuesGuard;
std::shared_ptr<Device> getSharedPtr()
{
return this->shared_from_this();
}
friend struct alpaka::internal::GetName;
std::string getName() const
{
return m_properties.m_name;
}
friend struct internal::GetNativeHandle;
[[nodiscard]] uint32_t getNativeHandle() const noexcept
{
return m_idx;
}
friend struct internal::MakeQueue;
Handle<cpu::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
static_assert(
kind == queueKind::blocking || kind == queueKind::nonBlocking,
"Unsupported queue kind.");
auto thisHandle = this->getSharedPtr();
std::lock_guard<std::mutex> lk{queuesGuard};
constexpr bool isBlocking = kind == queueKind::blocking;
auto newQueue = std::make_shared<cpu::Queue<Device>>(std::move(thisHandle), queues.size(), isBlocking);
queues.emplace_back(newQueue);
return newQueue;
}
friend struct internal::MakeEvent;
Handle<cpu::Event<Device>> makeEvent()
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::event);
auto thisHandle = this->getSharedPtr();
std::lock_guard<std::mutex> lk{queuesGuard};
auto newEvent = std::make_shared<cpu::Event<Device>>(std::move(thisHandle), events.size());
events.emplace_back(newEvent);
return newEvent;
}
friend struct alpaka::internal::GetDeviceType;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_platform.get());
}
friend struct internal::Alloc;
friend struct alpaka::internal::GetApi;
friend struct internal::GetDeviceProperties;
friend struct internal::AdjustThreadSpec;
friend struct internal::AllocDeferred;
friend struct internal::AllocUnified;
friend struct internal::AllocMapped;
};
} // namespace cpu
namespace trait
{
template<typename T_Platform>
struct IsExecutorSupportedBy::Op<exec::CpuSerial, cpu::Device<T_Platform>> : std::true_type
{
};
#if ALPAKA_OMP
template<typename T_Platform>
struct IsExecutorSupportedBy::Op<exec::CpuOmpBlocks, cpu::Device<T_Platform>> : std::true_type
{
};
#endif
#if ALPAKA_TBB
template<typename T_Platform>
struct IsExecutorSupportedBy::Op<exec::CpuTbbBlocks, cpu::Device<T_Platform>> : std::true_type
{
};
#endif
} // namespace trait
namespace internal
{
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
{
auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{device.getSharedPtr()};
T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, memSizeInByte));
// deviceDependency is captured to keep the device alive until the memory is deleted
auto deleter = [ptr, deviceDependency]() { alpaka::core::alignedFree(alignment, ptr); };
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
ALPAKA_LOG_INFO(
onHost::logger::memory + onHost::logger::device,
[&]()
{
std::stringstream ss;
ss << sharedBuffer;
return ss.str();
});
return sharedBuffer;
}
};
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct AllocUnified::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
{
auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
}
};
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct AllocMapped::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
{
auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
}
};
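/** Descriptive note (added): host memory is reported as accessible from a CPU device only if the view was
 * allocated via the host API and the device kind is cpu; every other combination returns false.
 */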
template<typename T_Platform, typename T_Any>
struct IsDataAccessible::FirstPath<cpu::Device<T_Platform>, T_Any>
{
bool operator()(cpu::Device<T_Platform>& device, T_Any const& view) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
if constexpr(
ALPAKA_TYPEOF(getApi(view)){} == api::host
&& ALPAKA_TYPEOF(getDeviceKind(device)){} == deviceKind::cpu)
return true;
else
return false;
}
};
/** Set number of thread blocks and threads per block to one
*
* There is no need to emulate blocks if we have only one thread.
*/
template<
typename T_Platform,
onHost::concepts::FrameSpec T_FrameSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct AdjustThreadSpec::Op<cpu::Device<T_Platform>, exec::CpuSerial, T_FrameSpec, T_KernelBundle>
{
using T_NumThreads = T_FrameSpec::ThreadExtentsVecType;
auto operator()(
cpu::Device<T_Platform> const& device,
exec::CpuSerial const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_NumThreads>
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel);
/// @todo add shortcut to create a CVec with equal values
auto const allOne
= ALPAKA_TYPEOF(iotaCVec<typename T_NumThreads::type, T_NumThreads::dim()>())::template all<1u>();
return ThreadSpec{allOne, allOne};
}
auto operator()(
cpu::Device<T_Platform> const& device,
exec::CpuSerial const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel);
/// @todo add shortcut to create a CVec with equal values
auto const allOne
= ALPAKA_TYPEOF(iotaCVec<typename T_NumThreads::type, T_NumThreads::dim()>())::template all<1u>();
return ThreadSpec{allOne, allOne};
}
};
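/** Keep the requested number of thread blocks but collapse each block to a single thread.
 *
 * Sketch of the intent (assumption derived from the executors used in this file): executors for which
 * exec::isSeqExecutor_v is true (e.g. CpuOmpBlocks, CpuTbbBlocks) parallelise over blocks only and run the
 * threads of a block sequentially, so emulating more than one thread per block would add overhead without
 * gaining parallelism.
 */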
template<
typename T_Platform,
alpaka::concepts::Executor T_Executor,
onHost::concepts::FrameSpec T_FrameSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
requires exec::isSeqExecutor_v<T_Executor>
struct AdjustThreadSpec::Op<cpu::Device<T_Platform>, T_Executor, T_FrameSpec, T_KernelBundle>
{
using T_NumThreads = T_FrameSpec::ThreadExtentsVecType;
auto operator()(
cpu::Device<T_Platform> const& device,
T_Executor const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_NumThreads>
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel);
auto numThreadBlocks = dataBlocking.getThreadSpec().m_numBlocks;
return ThreadSpec{numThreadBlocks, T_NumThreads::template all<1u>()};
}
auto operator()(
cpu::Device<T_Platform> const& device,
T_Executor const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::kernel);
auto numThreadBlocks = dataBlocking.getThreadSpec().m_numBlocks;
#if 0
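// Disabled heuristic: repeatedly halve the largest dimension of the block count until the total number of
// blocks no longer exceeds the device's multiprocessor count.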
using IdxType = typename T_NumBlocks::type;
// @todo get this number from device properties
static auto const maxBlocks = device.m_properties.m_multiProcessorCount;
while(numThreadBlocks.product() > maxBlocks)
{
uint32_t maxIdx = 0u;
auto maxValue = numThreadBlocks[0];
for(auto i = 0u; i < T_NumBlocks::dim(); ++i)
if(maxValue < numThreadBlocks[i])
{
maxIdx = i;
maxValue = numThreadBlocks[i];
}
if(numThreadBlocks.product() > maxBlocks)
numThreadBlocks[maxIdx] = divCeil(numThreadBlocks[maxIdx], IdxType{2u});
}
#endif
auto const numThreads = Vec<typename T_NumThreads::type, T_NumThreads::dim()>::all(1);
return ThreadSpec{numThreadBlocks, numThreads};
}
};
template<typename T_Platform>
struct GetDeviceProperties::Op<cpu::Device<T_Platform>>
{
DeviceProperties operator()(cpu::Device<T_Platform> const& device) const
{
return device.m_properties;
}
};
} // namespace internal
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Platform>
struct GetApi::Op<onHost::cpu::Device<T_Platform>>
{
inline constexpr auto operator()(auto&& device) const
{
return alpaka::getApi(device.m_platform);
}
};
} // namespace alpaka::internal
// ==
// == ./include/alpaka/api/host/Device.hpp ==
// ============================================================================
// #include "alpaka/api/host/Event.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/Platform.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/Device.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/sysInfo.hpp ==
// ==
/* Copyright 2022 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera, Bernhard Manfred Gruber, Andrea Bocci
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_OS_WINDOWS || ALPAKA_OS_CYGWIN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
// We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
# include <windows.h>
#elif ALPAKA_OS_LINUX || ALPAKA_OS_MACOS
# include <sys/param.h>
# include <sys/types.h>
# include <unistd.h>
// # include <cstdint> // amalgamate: file already included
# if ALPAKA_OS_BSD || ALPAKA_OS_MACOS
# include <sys/sysctl.h>
# endif
#endif
#if ALPAKA_OS_LINUX
# include <fstream>
#endif
// #include <cstdint> // amalgamate: file already included
// #include <cstring> // amalgamate: file already included
// #include <stdexcept> // amalgamate: file already included
// #include <string> // amalgamate: file already included
#if ALPAKA_ARCH_X86
# if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG || ALPAKA_COMP_PGI
# include <cpuid.h>
# elif ALPAKA_COMP_MSVC || defined(ALPAKA_COMP_MSVC_EMULATED)
// # include <intrin.h> // amalgamate: file already included
# endif
#endif
namespace alpaka::onHost
{
constexpr int NO_CPUID = 0;
constexpr int UNKNOWN_CPU = 0;
constexpr int UNKNOWN_COMPILER = 1;
#if ALPAKA_ARCH_X86
# if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG || ALPAKA_COMP_PGI
inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
{
__cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
}
# elif ALPAKA_COMP_MSVC || defined(ALPAKA_COMP_MSVC_EMULATED)
inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
{
__cpuidex(reinterpret_cast<int*>(ex), level, subfunction);
}
# else
inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
{
ex[0] = ex[2] = ex[3] = NO_CPUID;
ex[1] = UNKNOWN_COMPILER;
}
# endif
#else
inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
{
ex[0] = ex[2] = ex[3] = NO_CPUID;
ex[1] = UNKNOWN_CPU;
}
#endif
//! \return The name of the CPU the code is running on.
inline auto getCpuName() -> std::string
{
// Get extended ids.
std::uint32_t ex[4] = {0};
cpuid(0x8000'0000, 0, ex);
std::uint32_t const nExIds(ex[0]);
if(!nExIds)
{
switch(ex[1])
{
case UNKNOWN_COMPILER:
return "<unknown: compiler>";
case UNKNOWN_CPU:
return "<unknown: CPU>";
default:
return "<unknown>";
}
}
#if ALPAKA_ARCH_X86
// Get the information associated with each extended ID.
char cpuBrandString[0x40] = {0};
for(std::uint32_t i(0x8000'0000); i <= nExIds; ++i)
{
cpuid(i, 0, ex);
// Interpret CPU brand string and cache information.
if(i == 0x8000'0002)
{
std::memcpy(cpuBrandString, ex, sizeof(ex));
}
else if(i == 0x8000'0003)
{
std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
}
else if(i == 0x8000'0004)
{
std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
}
}
return std::string(cpuBrandString);
#else
return std::string("unknown");
#endif
}
//! \return Pagesize in bytes used by the system.
inline size_t getPageSize()
{
#if ALPAKA_OS_WINDOWS || ALPAKA_OS_CYGWIN
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#elif ALPAKA_OS_LINUX || ALPAKA_OS_MACOS
# if defined(_SC_PAGESIZE)
return static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
# else
// this is legacy and only used as fallback
return static_cast<size_t>(getpagesize());
# endif
#else
# error "getPageSize not implemented for this system!"
return 0;
#endif
}
//! \return The total number of bytes of global memory.
//! Adapted from David Robert Nadeau:
//! http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
inline auto getTotalGlobalMemSizeBytes() -> std::size_t
{
#if ALPAKA_OS_WINDOWS
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return static_cast<std::size_t>(status.ullTotalPhys);
#elif ALPAKA_OS_CYGWIN
// New 64-bit MEMORYSTATUSEX isn't available.
MEMORYSTATUS status;
status.dwLength = sizeof(status);
GlobalMemoryStatus(&status);
return static_cast<std::size_t>(status.dwTotalPhys);
#elif ALPAKA_OS_LINUX || ALPAKA_OS_MACOS
// Unix: prefer sysctl() over sysconf(), except sysctl() with HW_REALMEM and HW_PHYSMEM, which are not
// always reliable.
# if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
int mib[2]
= {CTL_HW,
# if defined(HW_MEMSIZE) // OSX
HW_MEMSIZE
# elif defined(HW_PHYSMEM64) // NetBSD, OpenBSD.
HW_PHYSMEM64
# endif
};
std::uint64_t size(0);
std::size_t sizeLen{sizeof(size)};
if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
return static_cast<std::size_t>(size);
# elif defined(_SC_AIX_REALMEM) // AIX.
return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
# elif defined(_SC_PHYS_PAGES) // Linux, FreeBSD, OpenBSD, Solaris.
return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * getPageSize();
# elif defined(CTL_HW) \
&& (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
int mib[2]
= {CTL_HW,
# if defined(HW_REALMEM) // FreeBSD.
HW_REALMEM
# elif defined(HW_PHYSMEM) // Others.
HW_PHYSMEM
# endif
};
std::uint32_t size(0);
std::size_t sizeLen{sizeof(size)};
if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
return static_cast<std::size_t>(size);
# endif
#else
# error "getTotalGlobalMemSizeBytes not implemented for this system!"
#endif
}
//! \return The free number of bytes of global memory.
//! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
inline auto getFreeGlobalMemSizeBytes() -> std::size_t
{
#if ALPAKA_OS_WINDOWS
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
return static_cast<std::size_t>(status.ullAvailPhys);
#elif ALPAKA_OS_LINUX
# if defined(_SC_AVPHYS_PAGES)
return static_cast<std::size_t>(sysconf(_SC_AVPHYS_PAGES)) * getPageSize();
# else
// this is legacy and only used as fallback
return static_cast<std::size_t>(get_avphys_pages()) * getPageSize();
# endif
#elif ALPAKA_OS_MACOS
int free_pages = 0;
std::size_t len = sizeof(free_pages);
if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
{
throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
}
return static_cast<std::size_t>(free_pages) * getPageSize();
#else
# error "getFreeGlobalMemSizeBytes not implemented for this system!"
#endif
}
} // namespace alpaka::onHost
// ==
// == ./include/alpaka/api/host/sysInfo.hpp ==
// ============================================================================
// #include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/Handle.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/interface.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include <memory> // amalgamate: file already included
// #include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace cpu
{
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct Platform : std::enable_shared_from_this<Platform<T_DeviceKind>>
{
public:
Platform() = default;
Platform(Platform const&) = delete;
Platform& operator=(Platform const&) = delete;
Platform(Platform&&) = delete;
Platform& operator=(Platform&&) = delete;
private:
void _()
{
static_assert(internal::concepts::Platform<Platform>);
}
std::weak_ptr<cpu::Device<Platform>> device;
std::shared_ptr<Platform> getSharedPtr()
{
return this->shared_from_this();
}
friend struct alpaka::internal::GetName;
std::string getName() const
{
return "host::Platform";
}
friend struct internal::GetDeviceCount;
uint32_t getDeviceCount() const
{
constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::Op<T_DeviceKind, api::Host>::value;
if constexpr(isSupportedDev)
return 1;
return 0;
}
friend struct internal::MakeDevice;
Handle<cpu::Device<Platform>> makeDevice(uint32_t const& idx)
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
uint32_t const numDevices = getDeviceCount();
if(idx >= numDevices)
{
std::stringstream ssErr;
ssErr << "Unable to return device handle with index " << idx << " because there are only "
<< numDevices << " devices of type '" << alpaka::onHost::getStaticName(T_DeviceKind{})
<< "' !";
throw std::runtime_error(ssErr.str());
}
if(auto sharedPtr = device.lock())
{
return sharedPtr;
}
auto thisHandle = getSharedPtr();
auto newDevice = std::make_shared<cpu::Device<Platform>>(std::move(thisHandle), idx);
device = newDevice;
return newDevice;
}
friend struct internal::GetDeviceProperties;
friend struct alpaka::internal::GetDeviceType;
T_DeviceKind getDeviceKind() const
{
return T_DeviceKind{};
}
};
} // namespace cpu
namespace internal
{
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct MakePlatform::Op<api::Host, T_DeviceKind>
{
auto operator()(api::Host, T_DeviceKind) const
{
return make_sharedSingleton<cpu::Platform<T_DeviceKind>>();
}
};
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct GetDeviceProperties::Op<cpu::Platform<T_DeviceKind>>
{
DeviceProperties operator()(cpu::Platform<T_DeviceKind> const& platform, uint32_t deviceIdx) const
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
auto prop = DeviceProperties{};
prop.m_name = getCpuName();
prop.m_maxThreadsPerBlock = std::numeric_limits<uint32_t>::max();
prop.m_warpSize = 1u;
prop.m_multiProcessorCount = std::thread::hardware_concurrency();
return prop;
}
};
} // namespace internal
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct GetApi::Op<onHost::cpu::Platform<T_DeviceKind>>
{
inline constexpr auto operator()(auto&& platform) const
{
return api::Host{};
}
};
} // namespace alpaka::internal
// ==
// == ./include/alpaka/api/host/Platform.hpp ==
// ============================================================================
// #include "alpaka/api/host/Queue.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/host/atomic.hpp ==
// ==
/* Copyright 2022 Felice Pantaleo, Andrea Bocci, Jan Stephan
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/atomicOp.hpp ==
// ==
/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include <algorithm> // amalgamate: file already included
#include <type_traits>
namespace alpaka::onAcc
{
//! The addition function object.
struct AtomicAdd
{
//! \return The old value of addr.
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
#if ALPAKA_COMP_GNUC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wconversion"
#endif
ref += value;
#if ALPAKA_COMP_GNUC
# pragma GCC diagnostic pop
#endif
return old;
}
};
//! The subtraction function object.
struct AtomicSub
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
#if ALPAKA_COMP_GNUC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wconversion"
#endif
ref -= value;
#if ALPAKA_COMP_GNUC
# pragma GCC diagnostic pop
#endif
return old;
}
};
//! The minimum function object.
struct AtomicMin
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref = std::min(ref, value);
return old;
}
};
//! The maximum function object.
struct AtomicMax
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref = std::max(ref, value);
return old;
}
};
//! The exchange function object.
struct AtomicExch
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref = value;
return old;
}
};
//! The increment function object.
struct AtomicInc
{
//! Increments up to value, then resets to 0.
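//! Example: with value = 3 and an initial *addr of 0 the stored sequence over successive calls is
//! 1, 2, 3, 0, 1, ...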
//!
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref = ((old >= value) ? static_cast<T>(0) : static_cast<T>(old + static_cast<T>(1)));
return old;
}
};
//! The decrement function object.
struct AtomicDec
{
//! Decrements down to 0, then resets to value.
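//! Example: with value = 3 and an initial *addr of 3 the stored sequence over successive calls is
//! 2, 1, 0, 3, 2, ...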
//!
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref = (((old == static_cast<T>(0)) || (old > value)) ? value : static_cast<T>(old - static_cast<T>(1)));
return old;
}
};
//! The and function object.
struct AtomicAnd
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref &= value;
return old;
}
};
//! The or function object.
struct AtomicOr
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref |= value;
return old;
}
};
//! The exclusive or function object.
struct AtomicXor
{
//! \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T>
ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
ref ^= value;
return old;
}
};
//! The compare and swap function object.
struct AtomicCas
{
//! AtomicCas for non floating point values
// \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T, std::enable_if_t<!std::is_floating_point_v<T>, bool> = true>
ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
{
auto const old = *addr;
auto& ref = *addr;
// check if values are bit-wise equal
ref = ((old == compare) ? value : old);
return old;
}
//! AtomicCas for floating point values
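//! Floating point operands are compared via their bit patterns (see BitUnion below) instead of operator==;
//! this avoids -Wfloat-equal warnings and treats a NaN stored at addr as equal to an identical NaN bit
//! pattern, which operator== never would.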
// \return The old value of addr.
ALPAKA_NO_HOST_ACC_WARNING
template<typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
{
static_assert(sizeof(T) == 4u || sizeof(T) == 8u, "AtomicCas supports only 32 bit and 64 bit values!");
// Type to reinterpret to in order to perform the bit-wise comparison
using BitType = std::conditional_t<sizeof(T) == 4u, unsigned int, unsigned long long>;
// type used to have a safe way to reinterpret the data as another type
// std::variant can not be used because clang 8 has issues compiling std::variant
struct BitUnion
{
union
{
T value;
BitType r;
};
};
auto const old = *addr;
auto& ref = *addr;
BitUnion o{old};
BitUnion c{compare};
ref = ((o.r == c.r) ? value : old);
return old;
}
};
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/onAcc/atomicOp.hpp ==
// ============================================================================
// #include "alpaka/onAcc/internal/interface.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/onAcc/scope.hpp ==
// ==
/* Copyright 2025 Mehmet Yusufoglu, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include <string> // amalgamate: file already included
/**
* @brief Provides scopes for atomic and memory fence operations, analogous to NVIDIA CUDA's atomic and fence scopes.
*
* This namespace defines the visibility scopes for atomic operations and memory fences,
* which control the visibility of memory operations across threads, blocks, and devices.
* The provided scopes are:
* - Block: Visibility within a thread block.
* - Device: Visibility across all thread blocks on the same device.
* - System: System-wide visibility, mapped to the strongest available atomic/fence by the backend.
*
* @see alpaka::onAcc::atomicAdd, alpaka::onAcc::memFence
*/
namespace alpaka::onAcc::scope
{
/**
* @brief Scope for atomic and fence operations visible only within the same thread block.
*
* When used with atomic operations (e.g., atomicAdd), only threads within the same block
* will see the updated value. When used with threadFence, it ensures that all writes
* from the current thread are visible to all other threads in the same block.
*
* @note Analogous to CUDA's `atomicAdd_block` and `threadFence_block`.
*/
struct Block
{
static std::string getName()
{
return "Block";
}
};
inline constexpr Block block{};
/**
* @brief Scope for atomic and fence operations visible across all thread blocks on the same device.
*
* When used with atomic operations, all threads on the same device will see the updated value.
* When used with threadFence, it ensures that all writes from the current thread are visible
* to all other threads on the same device.
*
* @note This scope is stronger than Block but weaker than System.
*/
struct Device
{
static std::string getName()
{
return "Device";
}
};
inline constexpr Device device{};
/**
* @brief Scope for atomic and fence operations with system-wide visibility.
*
* When used with atomic operations, all threads in the system (potentially across multiple devices)
* will see the updated value. When used with threadFence, it ensures that all writes from the current
* thread are visible to all other threads in the system.
*
* @attention System operations are only visible to other threads of the same device kind.
* Operations executed on a host compute device will not be visible to threads in, for example, CUDA/HIP or oneAPI
* kernels, and vice versa.
*
* @note This is the strongest scope, analogous to CUDA's `atomicAdd_system` and the strongest fence.
*/
struct System
{
static std::string getName()
{
return "System";
}
};
inline constexpr System system{};
} // namespace alpaka::onAcc::scope
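// Illustrative sketch only: the inline tag objects above are what user code passes to scope-aware
// operations, roughly along the lines of
//   onAcc::memFence(acc, onAcc::scope::block);                // fence visible within the thread block
//   onAcc::atomicAdd(acc, &counter, 1u, onAcc::scope::device); // update visible device-wide
// The exact signatures of atomicAdd/memFence are defined elsewhere in alpaka (see the @see references
// above); only the scope tags themselves are defined here.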
// ==
// == ./include/alpaka/onAcc/scope.hpp ==
// ============================================================================
// #include <array> // amalgamate: file already included
// #include <atomic> // amalgamate: file already included
#include <type_traits>
#ifdef ALPAKA_DISABLE_STD_ATOMIC_REF
# include <boost/atomic.hpp>
#endif
namespace alpaka::onAcc
{
namespace detail
{
#if defined(ALPAKA_DISABLE_STD_ATOMIC_REF)
template<typename T>
using atomic_ref = boost::atomic_ref<T>;
#else
template<typename T>
using atomic_ref = std::atomic_ref<T>;
#endif
} // namespace detail
//! The atomic ops based on atomic_ref for CPU accelerators.
//
// Atomics can be used at the grid, block and thread hierarchy levels.
//
class AtomicAtomicRef
{
};
template<typename T>
void isSupportedByAtomicAtomicRef()
{
static_assert(
std::is_trivially_copyable_v<T> && detail::atomic_ref<T>::required_alignment <= alignof(T),
"Type not supported by AtomicAtomicRef, please recompile defining "
"ALPAKA_DISABLE_ATOMIC_ATOMICREF.");
}
namespace internalCompute
{
//! The CPU accelerators AtomicAdd.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicAdd, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
return ref.fetch_add(value);
}
};
//! The CPU accelerators AtomicSub.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicSub, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
return ref.fetch_sub(value);
}
};
//! The CPU accelerators AtomicMin.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicMin, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result = old;
result = std::min(result, value);
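// compare_exchange_weak updates 'old' with the currently stored value when it fails, so the minimum is
// recomputed against the freshly observed value on every retry; spurious failures only cost an extra
// iteration. The Max/Exch/Inc/Dec/Cas specialisations below use the same retry pattern.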
while(!ref.compare_exchange_weak(old, result))
{
result = old;
result = std::min(result, value);
}
return old;
}
};
//! The CPU accelerators AtomicMax.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicMax, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result = old;
result = std::max(result, value);
while(!ref.compare_exchange_weak(old, result))
{
result = old;
result = std::max(result, value);
}
return old;
}
};
//! The CPU accelerators AtomicExch.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicExch, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result = value;
while(!ref.compare_exchange_weak(old, result))
{
result = value;
}
return old;
}
};
//! The CPU accelerators AtomicInc.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicInc, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result = ((old >= value) ? 0 : static_cast<T>(old + 1));
while(!ref.compare_exchange_weak(old, result))
{
result = ((old >= value) ? 0 : static_cast<T>(old + 1));
}
return old;
}
};
//! The CPU accelerators AtomicDec.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicDec, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result = (((old == static_cast<T>(0)) || (old > value)) ? value : static_cast<T>(old - 1));
while(!ref.compare_exchange_weak(old, result))
{
result = (((old == static_cast<T>(0)) || (old > value)) ? value : static_cast<T>(old - 1));
}
return old;
}
};
//! The CPU accelerators AtomicAnd.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicAnd, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
return ref.fetch_and(value);
}
};
//! The CPU accelerators AtomicOr.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicOr, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
return ref.fetch_or(value);
}
};
//! The CPU accelerators AtomicXor.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicXor, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
return ref.fetch_xor(value);
}
};
//! The CPU accelerators AtomicCas.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicCas, internal::StlAtomic, T, T_Scope>
{
ALPAKA_FN_HOST static auto atomicOp(
internal::StlAtomic const&,
T* const addr,
T const& compare,
T const& value) -> T
{
isSupportedByAtomicAtomicRef<T>();
detail::atomic_ref<T> ref(*addr);
T old = ref;
T result;
do
{
#if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
result = ((old == compare) ? value : old);
#if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
# pragma GCC diagnostic pop
#endif
} while(!ref.compare_exchange_weak(old, result));
return old;
}
};
} // namespace internalCompute
} // namespace alpaka::onAcc
// ==
// == ./include/alpaka/api/host/atomic.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/host/memFence.hpp ==
// ==
/* Copyright 2025 Mehmet Yusufoglu, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/host/Api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/executor.hpp" // amalgamate: file already inlined
// #include "alpaka/api/host/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/Acc.hpp" // amalgamate: file already inlined
// #include "alpaka/onAcc/scope.hpp" // amalgamate: file already inlined
// #include "alpaka/tag.hpp" // amalgamate: file already inlined
// #include <atomic> // amalgamate: file already included
#include <type_traits>
namespace alpaka::onAcc::internalCompute
{
namespace detail
{
// suppress warning: `warning: 'atomic_thread_fence' is not supported with '-fsanitize=thread' [-Wtsan]`
#if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wtsan"
#endif
// Serial executor fence implementation
// Block scope: nothing to do for serial
inline void hostMemoryFenceImpl(exec::CpuSerial const&, scope::Block const)
{
// Block scope: NO-OP since threads within a block
}
inline void hostMemoryFenceImpl(exec::CpuSerial const&, scope::Device const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
inline void hostMemoryFenceImpl(exec::CpuSerial const&, scope::System const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
inline void hostMemoryFenceImpl(exec::CpuOmpBlocks const&, scope::Block const)
{
// Block scope: NO-OP for OMP since single-threaded within a block
}
inline void hostMemoryFenceImpl(exec::CpuOmpBlocks const&, scope::Device const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
inline void hostMemoryFenceImpl(exec::CpuOmpBlocks const&, scope::System const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
// TBB doesn’t have a separate “thread fence”.
inline void hostMemoryFenceImpl(exec::CpuTbbBlocks const&, scope::Block const)
{
// Block scope: NO-OP for TBB since simulated single-thread blocks
}
inline void hostMemoryFenceImpl(exec::CpuTbbBlocks const&, scope::Device const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
inline void hostMemoryFenceImpl(exec::CpuTbbBlocks const&, scope::System const)
{
std::atomic_thread_fence(std::memory_order_acq_rel);
}
#if defined(__GNUC__) && !defined(__clang__)
# pragma GCC diagnostic pop
#endif
} // namespace detail
// Host API: dispatch to executor-specific implementation
template<typename T_Scope>
struct MemoryFence::Op<api::Host, T_Scope>
{
void operator()(onAcc::concepts::Acc<api::Host> auto const& acc, T_Scope const scope) const
{
detail::hostMemoryFenceImpl(acc[object::exec], scope);
}
};
} // namespace alpaka::onAcc::internalCompute
// ==
// == ./include/alpaka/api/host/memFence.hpp ==
// ============================================================================
// ==
// == ./include/alpaka/api/cpu.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/oneApi.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/oneApi/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/oneApi/Device.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// ============================================================================
// == ./include/alpaka/api/syclGeneric/Device.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// ============================================================================
// == ./include/alpaka/api/syclGeneric/Queue.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig, René Widera, Mehmet Yusufoglu, Andrea Bocci
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// ============================================================================
// == ./include/alpaka/api/syclGeneric/Event.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// # include "alpaka/api/util.hpp" // amalgamate: file already inlined
// # include "alpaka/core/CallbackThread.hpp" // amalgamate: file already inlined
// # include "alpaka/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/logger/logger.hpp" // amalgamate: file already inlined
# include <sycl/sycl.hpp>
// # include <algorithm> // amalgamate: file already included
// # include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace syclGeneric
{
template<typename T_Device>
struct Event : std::enable_shared_from_this<Event<T_Device>>
{
private:
friend struct alpaka::internal::GetApi;
public:
Event(internal::concepts::DeviceHandle auto device, uint32_t const idx)
: m_device(std::move(device))
, m_idx(idx)
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
}
Event(Event const&) = delete;
Event& operator=(Event const&) = delete;
Event(Event&&) = delete;
Event& operator=(Event&&) = delete;
~Event()
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
try
{
m_event.wait_and_throw();
}
catch(sycl::exception const& err)
{
std::cerr << "Caught SYCL exception while destructing a SYCL event: " << err.what() << " ("
<< err.code() << ')' << std::endl;
}
catch(std::exception const& err)
{
std::cerr << "The following runtime error(s) occurred while destructing a SYCL event:"
<< err.what() << std::endl;
}
}
std::shared_ptr<Event> getSharedPtr()
{
return this->shared_from_this();
}
[[nodiscard]] auto getNativeHandle() const noexcept
{
return m_event;
}
void wait()
{
ALPAKA_LOG_FUNCTION(onHost::logger::event);
m_event.wait_and_throw();
}
std::string getName() const
{
std::stringstream ss;
ss << "Queue<" << getApi(m_device).getName() << ">";
ss << " id=" << m_idx;
return ss.str();
}
private:
friend struct alpaka::internal::GetDeviceType;
friend struct alpaka::onHost::internal::Enqueue;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_device.get());
}
auto getDevice() const
{
return m_device;
}
friend struct onHost::internal::GetDevice;
friend struct onHost::internal::IsEventComplete;
/** Check if the event is complete.
*
* @return true if the event is complete, false otherwise
*/
bool isEventComplete() noexcept
{
auto const status = m_event.template get_info<sycl::info::event::command_execution_status>();
return (status == sycl::info::event_command_status::complete);
}
friend struct internal::WaitFor;
friend struct internal::Wait;
void setEvent(sycl::event const& event)
{
m_event = event;
}
Handle<T_Device> m_device;
sycl::event m_event{};
uint32_t m_idx = 0u;
};
} // namespace syclGeneric
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Device>
struct GetApi::Op<alpaka::onHost::syclGeneric::Event<T_Device>>
{
inline constexpr auto operator()(auto&& event) const
{
return alpaka::getApi(event.m_device);
}
};
} // namespace alpaka::internal
#endif
// ==
// == ./include/alpaka/api/syclGeneric/Event.hpp ==
// ============================================================================
// # include "alpaka/api/util.hpp" // amalgamate: file already inlined
// # include "alpaka/core/CallbackThread.hpp" // amalgamate: file already inlined
// # include "alpaka/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onAcc/Acc.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/concepts.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/mem/SharedBuffer.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
// # include <algorithm> // amalgamate: file already included
// # include <future> // amalgamate: file already included
// # include <sstream> // amalgamate: file already included
namespace alpaka::onHost
{
namespace syclGeneric
{
template<typename T_Device>
struct Queue : std::enable_shared_from_this<Queue<T_Device>>
{
private:
friend struct alpaka::internal::GetApi;
template<alpaka::concepts::Vector TVec>
static constexpr auto vecToSyclRange(TVec vec)
{
constexpr auto dim = std::decay_t<TVec>::dim();
return [&vec]<auto... I>(std::index_sequence<I...>)
// TODO: check if this is the correct order
{ return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
};
public:
Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, bool isBlocking)
: m_device(std::move(device))
, m_idx(idx)
, m_queue(
m_device->getNativeHandle().second,
m_device->getNativeHandle().first,
{sycl::property::queue::in_order{}})
, m_isBlocking(isBlocking)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
}
[[nodiscard]] bool isBlocking() const noexcept
{
return m_isBlocking;
}
Queue(Queue const&) = delete;
Queue& operator=(Queue const&) = delete;
Queue(Queue&&) = delete;
Queue& operator=(Queue&&) = delete;
~Queue()
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
try
{
m_queue.wait_and_throw();
}
catch(sycl::exception const& err)
{
std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
<< err.code() << ')' << std::endl;
}
catch(std::exception const& err)
{
std::cerr << "The following runtime error(s) occurred while destructing a SYCL queue:"
<< err.what() << std::endl;
}
}
std::shared_ptr<Queue> getSharedPtr()
{
return this->shared_from_this();
}
[[nodiscard]] auto getNativeHandle() const noexcept
{
return m_queue;
}
void wait()
{
m_queue.wait_and_throw();
}
std::string getName() const
{
std::stringstream ss;
ss << "Queue<" << getApi(m_device).getName() << ">";
ss << " id=" << m_idx;
return ss.str();
}
private:
friend struct alpaka::internal::GetDeviceType;
friend struct alpaka::onHost::internal::Enqueue;
friend struct onHost::internal::AllocDeferred;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_device.get());
}
auto getDevice() const
{
return m_device;
}
friend struct onHost::internal::GetDevice;
friend struct alpaka::onHost::internal::WaitFor;
void waitFor(syclGeneric::Event<T_Device>& event)
{
ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
sycl::event sycl_event = event.getNativeHandle();
[[maybe_unused]] sycl::event ev
= m_queue.submit([sycl_event](sycl::handler& cgh) { cgh.depends_on(sycl_event); });
if(isBlocking())
ev.wait_and_throw();
}
Handle<T_Device> m_device;
uint32_t m_idx = 0u;
sycl::queue m_queue;
core::CallbackThread m_callBackThread;
bool m_isBlocking{false};
};
} // namespace syclGeneric
template<typename T_Device, typename T_Task>
struct internal::Enqueue::Task<syclGeneric::Queue<T_Device>, T_Task>
{
/** It is not allowed to execute SYCL methods within a SYCL host_task, therefore we use a callback host
* thread to execute the host function; on that thread SYCL methods may be used.
*/
static void callHostTask(syclGeneric::Queue<T_Device>& queue, T_Task task)
{
auto f = queue.m_callBackThread.submit([t = std::move(task)] { t(); });
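// Waiting on the future keeps the surrounding host_task alive until the callback thread has finished the
// user task, so the in-order SYCL queue observes the correct completion order.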
f.wait();
}
void operator()(syclGeneric::Queue<T_Device>& queue, T_Task const& task) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue);
// using the queue by reference is fine here, because the queue is not destroyed while the task is
// executed.
[[maybe_unused]] sycl::event ev
= queue.m_queue.submit([&queue, task](sycl::handler& cgh)
{ cgh.host_task([&queue, task]() { callHostTask(queue, task); }); });
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<typename T_Device, typename T_Event>
struct internal::Enqueue::Event<syclGeneric::Queue<T_Device>, T_Event>
{
void operator()(syclGeneric::Queue<T_Device>& queue, T_Event& event) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
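// Submit an empty single_task so that the returned sycl::event marks the current position in the in-order
// queue; waiting on it later is equivalent to waiting for all previously enqueued work.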
sycl::event emulatedEvent = queue.m_queue.submit([](sycl::handler& cgh) { cgh.single_task([]() {}); });
event.setEvent(emulatedEvent);
if(queue.isBlocking())
emulatedEvent.wait_and_throw();
}
};
template<typename T_Device, typename T_Dest, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> == 1u)
struct internal::Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
{
void operator()(syclGeneric::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
// TODO: implement generic version for multidimensional memory
sycl::queue sycl_queue = queue.getNativeHandle();
[[maybe_unused]] sycl::event ev = sycl_queue.memset(
internal::Data::data(dest),
byteValue,
extents.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>));
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> == 1u)
struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
auto&& dest,
T_Source const& source,
T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
// TODO: implement generic version for multidimensional memory
sycl::queue sycl_queue = queue.getNativeHandle();
[[maybe_unused]] sycl::event ev = sycl_queue.memcpy(
toVoidPtr(internal::Data::data(dest)),
toVoidPtr(internal::Data::data(source)),
extents.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>));
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> == 1u)
struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
auto&& dest,
T_Value elementValue,
T_Extents const& extents) const
requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
&& std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
sycl::queue sycl_queue = queue.getNativeHandle();
[[maybe_unused]] sycl::event ev = sycl_queue.fill(internal::Data::data(dest), elementValue, extents.x());
if(queue.isBlocking())
ev.wait_and_throw();
}
};
/** The code is a copy of the Alloc::Op with the difference that the memory is allocated and freed
* within a queue
*/
template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
struct internal::AllocDeferred::Op<T_Type, syclGeneric::Queue<T_Device>, T_Extents>
{
auto operator()(syclGeneric::Queue<T_Device>& queue, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
auto device = queue.getDevice();
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
sycl::queue sycl_queue = queue.getNativeHandle();
auto queueDependency = queue.getSharedPtr();
T_Type* ptr = reinterpret_cast<T_Type*>(sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_queue));
// ensure that the allocation behaves as a blocking operation if the queue is blocking.
if(queue.isBlocking())
sycl_queue.wait_and_throw();
auto deleter = [queueDep = std::move(queueDependency), ptr]()
{
sycl::queue sycl_queue = queueDep->getNativeHandle();
/* Always enqueue into a queue, even if the queue is blocking, to track possible in-queue dependencies.
* sycl::free() is safe to be called within a host_task.
*/
[[maybe_unused]] sycl::event ev = sycl_queue.submit(
[&](sycl::handler& cgh) { cgh.host_task([=]() { sycl::free(toVoidPtr(ptr), sycl_queue); }); });
if(queueDep->isBlocking())
ev.wait_and_throw();
};
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
return sharedBuffer;
}
};
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Device>
struct GetApi::Op<alpaka::onHost::syclGeneric::Queue<T_Device>>
{
inline constexpr auto operator()(auto&& queue) const
{
return alpaka::getApi(queue.m_device);
}
};
} // namespace alpaka::internal
#endif
// ==
// == ./include/alpaka/api/syclGeneric/Queue.hpp ==
// ============================================================================
// # include "alpaka/Vec.hpp" // amalgamate: file already inlined
// # include "alpaka/api/syclGeneric/Event.hpp" // amalgamate: file already inlined
// # include "alpaka/api/syclGeneric/Queue.hpp" // amalgamate: file already inlined
// # include "alpaka/api/util.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/mem/SharedBuffer.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
namespace alpaka::onHost
{
namespace syclGeneric
{
template<typename T_Platform>
struct Device : std::enable_shared_from_this<Device<T_Platform>>
{
public:
Device(internal::concepts::PlatformHandle auto platform, auto const& dev, uint32_t const idx)
: m_platform(std::move(platform))
, m_idx(idx)
, m_sycl_dev(dev)
, m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
{
ALPAKA_LOG_FUNCTION(onHost::logger::device);
}
~Device()
{
ALPAKA_LOG_FUNCTION(onHost::logger::device);
}
Device(Device const&) = delete;
Device& operator=(Device const&) = delete;
Device(Device&&) = delete;
Device& operator=(Device&&) = delete;
auto getName() const
{
return m_sycl_dev.get_info<sycl::info::device::name>();
}
std::shared_ptr<Device<T_Platform>> getSharedPtr()
{
return this->shared_from_this();
}
[[nodiscard]] Handle<syclGeneric::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
{
ALPAKA_LOG_FUNCTION(onHost::logger::queue + onHost::logger::device);
static_assert(
kind == queueKind::blocking || kind == queueKind::nonBlocking,
"Unsupported queue kind.");
auto thisHandle = this->getSharedPtr();
std::lock_guard<std::mutex> lk{m_writeGuard};
constexpr bool isBlocking = kind == queueKind::blocking;
auto newQueue
= std::make_shared<syclGeneric::Queue<Device>>(std::move(thisHandle), queues.size(), isBlocking);
queues.emplace_back(newQueue);
return newQueue;
}
[[nodiscard]] std::pair<sycl::device, sycl::context> getNativeHandle() const noexcept
{
return {m_sycl_dev, m_platform->getContext()};
}
void wait()
{
ALPAKA_LOG_FUNCTION(onHost::logger::device);
// Copy queue weak refs under lock then release to avoid blocking other operations while waiting.
std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> tmpQueues;
{
std::lock_guard<std::mutex> lk{m_writeGuard};
tmpQueues = queues;
}
for(auto& weakQueue : tmpQueues)
{
if(auto queue = weakQueue.lock())
{
queue->wait();
}
}
}
private:
friend struct internal::MakeEvent;
Handle<syclGeneric::Event<Device>> makeEvent()
{
ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::device);
auto thisHandle = this->getSharedPtr();
std::lock_guard<std::mutex> lk{m_writeGuard};
auto newEvent = std::make_shared<syclGeneric::Event<Device>>(std::move(thisHandle), events.size());
events.emplace_back(newEvent);
return newEvent;
}
void _()
{
static_assert(internal::concepts::Device<Device>);
}
friend struct alpaka::internal::GetDeviceType;
auto getDeviceKind() const
{
return alpaka::internal::getDeviceKind(*m_platform.get());
}
Handle<T_Platform> m_platform;
uint32_t m_idx = 0u;
sycl::device m_sycl_dev;
std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> queues;
std::vector<std::weak_ptr<syclGeneric::Event<Device>>> events;
std::mutex m_writeGuard;
DeviceProperties m_properties;
friend struct alpaka::internal::GetApi;
friend struct internal::GetDeviceProperties;
friend struct internal::AdjustThreadSpec;
friend struct onHost::internal::AllocDeferred;
friend struct onHost::internal::AllocUnified;
friend struct onHost::internal::AllocMapped;
friend struct onHost::internal::IsDataAccessible;
};
} // namespace syclGeneric
namespace internal
{
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct Alloc::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
{
auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{device.getSharedPtr()};
auto [sycl_device, sycl_context] = device.getNativeHandle();
T_Type* ptr = reinterpret_cast<T_Type*>(
sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_device, sycl_context));
auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
return sharedBuffer;
}
};
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct AllocUnified::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
{
auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{device.getSharedPtr()};
auto [sycl_device, sycl_context] = device.getNativeHandle();
bool isManagedMemorySupported = sycl_device.has(sycl::aspect::usm_shared_allocations);
if(!isManagedMemorySupported)
{
throw std::runtime_error("Sycl device does not support unified memory allocations.");
}
T_Type* ptr = reinterpret_cast<T_Type*>(
sycl::aligned_alloc_shared(alignment, memSizeInByte, sycl_device, sycl_context));
auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
return sharedBuffer;
}
};
template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
struct AllocMapped::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
{
auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
ALPAKA_TYPEOF(getApi(device)){},
ALPAKA_TYPEOF(getDeviceKind(device)){});
auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
auto deviceDependency = onHost::Device{device.getSharedPtr()};
auto [_, sycl_context] = device.getNativeHandle();
T_Type* ptr
= reinterpret_cast<T_Type*>(sycl::aligned_alloc_host(alignment, memSizeInByte, sycl_context));
auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
auto sharedBuffer = onHost::SharedBuffer{
deviceDependency,
ptr,
extents,
pitches,
std::move(deleter),
Alignment<alignment>{}};
return sharedBuffer;
}
};
template<typename T_Platform, typename T_Any>
struct IsDataAccessible::FirstPath<syclGeneric::Device<T_Platform>, T_Any>
{
bool operator()(syclGeneric::Device<T_Platform>& device, T_Any const& view) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
auto [sycl_device, sycl_context] = device.getNativeHandle();
auto sycl_alloc_type = sycl::get_pointer_type(data(view), sycl_context);
if(sycl_alloc_type != sycl::usm::alloc::unknown)
{
try
{
sycl::device deviceAssociatedWithData = sycl::get_pointer_device(data(view), sycl_context);
if(deviceAssociatedWithData == sycl_device)
{
// sycl device allocated the memory
return true;
}
}
catch(...)
{
}
}
if(sycl_alloc_type == sycl::usm::alloc::shared)
{
// is shared within the device context
return true;
}
else if(sycl_alloc_type == sycl::usm::alloc::unknown)
{
// assume that a sycl cpu device can always access host memory
if constexpr(
ALPAKA_TYPEOF(getApi(view)){} == api::host
&& ALPAKA_TYPEOF(getDeviceKind(device)){} == deviceKind::cpu)
return true;
}
return false;
}
};
template<typename T_Platform>
struct GetDeviceProperties::Op<syclGeneric::Device<T_Platform>>
{
DeviceProperties operator()(syclGeneric::Device<T_Platform> const& device) const
{
return device.m_properties;
}
};
template<
typename T_Platform,
alpaka::concepts::Executor T_Executor,
onHost::concepts::FrameSpec T_FrameSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct AdjustThreadSpec::Op<syclGeneric::Device<T_Platform>, T_Executor, T_FrameSpec, T_KernelBundle>
{
using T_NumThreads = T_FrameSpec::ThreadExtentsVecType;
auto operator()(
syclGeneric::Device<T_Platform> const& device,
T_Executor const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_NumThreads>
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::device);
auto numThreads = dataBlocking.getThreadSpec().m_numThreads;
/** This limit is not exact, but typical Intel, NVIDIA and AMD GPUs support at least 1024
* threads per block.
* @todo Check whether this causes issues on FPGAs; in that case the deviceKind should be used and the
* limit should differ per deviceKind.
*/
constexpr typename ALPAKA_TYPEOF(numThreads)::type hardwareLimitThreadsPerBlock = 1024u;
constexpr auto result = api::util::adjustToLimit<hardwareLimitThreadsPerBlock, 0u, 1u>(numThreads);
return ThreadSpec{dataBlocking.getThreadSpec().m_numBlocks, result};
}
auto operator()(
syclGeneric::Device<T_Platform> const& device,
T_Executor const& executor,
T_FrameSpec const& dataBlocking,
T_KernelBundle const& kernelBundle) const
{
ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::device);
auto numThreadsPerBlocks = dataBlocking.getThreadSpec().m_numThreads;
auto const maxThreadsPerBlock = device.m_properties.m_maxThreadsPerBlock;
auto result = api::util::adjustToLimit(numThreadsPerBlocks, maxThreadsPerBlock);
return ThreadSpec{dataBlocking.getThreadSpec().m_numBlocks, result};
}
};
} // namespace internal
} // namespace alpaka::onHost
namespace alpaka::internal
{
template<typename T_Platform>
struct GetApi::Op<onHost::syclGeneric::Device<T_Platform>>
{
decltype(auto) operator()(auto&& device) const
{
return internal::getApi(*device.m_platform.get());
}
};
} // namespace alpaka::internal
#endif
// ==
// == ./include/alpaka/api/syclGeneric/Device.hpp ==
// ============================================================================
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/onHost/trait.hpp" // amalgamate: file already inlined
// #include "executor.hpp" // amalgamate: file already inlined
namespace alpaka::onHost::trait
{
#if ALPAKA_LANG_ONEAPI
template<typename T_Platform>
struct IsExecutorSupportedBy::Op<alpaka::exec::OneApi, alpaka::onHost::syclGeneric::Device<T_Platform>>
: std::true_type
{
};
#endif
} // namespace alpaka::onHost::trait
// ==
// == ./include/alpaka/api/oneApi/Device.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/oneApi/Platform.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_ONEAPI
// # include "alpaka/api/oneApi/Api.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/syclGeneric/Platform.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// # include "Device.hpp" // amalgamate: file already inlined
// # include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/core/Sycl.hpp ==
// ==
/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/Vec.hpp" // amalgamate: file already inlined
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
// #include "alpaka/meta/IntegerSequence.hpp" // amalgamate: file already inlined
// #include <array> // amalgamate: file already included
// #include <cstddef> // amalgamate: file already included
#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
// #include <iostream> // amalgamate: file already included
// #include <stdexcept> // amalgamate: file already included
// #include <string> // amalgamate: file already included
#include <type_traits>
// #include <utility> // amalgamate: file already included
#if ALPAKA_LANG_SYCL
// # include <sycl/sycl.hpp> // amalgamate: file already included
// if SYCL is enabled with the AMD backend, printf is defined away because of missing compiler support
# ifdef __AMDGCN__
# define printf(...)
# else
# ifdef __SYCL_DEVICE_ONLY__
using AlpakaFormat = char const* [[clang::opencl_constant]];
# else
using AlpakaFormat = char const*;
# endif
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
# endif
# define printf(FORMAT, ...) \
do \
{ \
static auto const format = AlpakaFormat{FORMAT}; \
sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \
} while(false)
# if ALPAKA_COMP_CLANG
# pragma clang diagnostic pop
# endif
# endif
// SYCL vector types trait specializations.
namespace alpaka
{
namespace detail
{
// Remove std::is_same boilerplate
template<typename T, typename... Ts>
struct is_any : std::bool_constant<(std::is_same_v<T, Ts> || ...)>
{
};
} // namespace detail
//! In contrast to CUDA, SYCL does not provide 1D vector types. It does
//! support OpenCL's data types which have additional requirements
//! on top of those in the C++ standard. Note that SYCL's equivalent
//! to CUDA's dim3 type is a different class type and thus not used
//! here.
template<typename T>
struct IsSyclBuiltInType
: detail::is_any<
T,
// built-in scalar types - these are the standard C++ built-in types, std::size_t, std::byte and
// sycl::half
sycl::half,
// 2 component vector types
sycl::char2,
sycl::uchar2,
sycl::short2,
sycl::ushort2,
sycl::int2,
sycl::uint2,
sycl::long2,
sycl::ulong2,
sycl::float2,
sycl::double2,
sycl::half2,
// 3 component vector types
sycl::char3,
sycl::uchar3,
sycl::short3,
sycl::ushort3,
sycl::int3,
sycl::uint3,
sycl::long3,
sycl::ulong3,
sycl::float3,
sycl::double3,
sycl::half3,
// 4 component vector types
sycl::char4,
sycl::uchar4,
sycl::short4,
sycl::ushort4,
sycl::int4,
sycl::uint4,
sycl::long4,
sycl::ulong4,
sycl::float4,
sycl::double4,
sycl::half4,
// 8 component vector types
sycl::char8,
sycl::uchar8,
sycl::short8,
sycl::ushort8,
sycl::int8,
sycl::uint8,
sycl::long8,
sycl::ulong8,
sycl::float8,
sycl::double8,
sycl::half8,
// 16 component vector types
sycl::char16,
sycl::uchar16,
sycl::short16,
sycl::ushort16,
sycl::int16,
sycl::uint16,
sycl::long16,
sycl::ulong16,
sycl::float16,
sycl::double16,
sycl::half16>
{
};
} // namespace alpaka
#endif
// ==
// == ./include/alpaka/core/Sycl.hpp ==
// ============================================================================
// # include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
# include <map>
// # include <memory> // amalgamate: file already included
# include <numeric>
# include <optional>
namespace alpaka
{
namespace detail
{
template<typename T_DeviceKind>
struct SYCLDeviceSelector;
struct Context
{
Context() = default;
sycl::platform getPlatformByName(std::string const& platformName)
{
auto platforms = sycl::platform::get_platforms();
for(auto const& platform : platforms)
{
if(platform.get_info<sycl::info::platform::name>() == platformName)
{
return platform;
}
}
throw std::runtime_error("Platform not found");
}
auto getContext(sycl::platform platform)
{
std::string platformName = platform.get_info<sycl::info::platform::name>();
if(contextMap.contains(platformName))
{
return contextMap[platformName];
}
std::vector<sycl::device> devices;
try
{
devices = platform.get_devices();
}
catch(...)
{
devices.clear();
}
if(devices.size())
{
auto context = sycl::context{
platform.get_devices(),
[](sycl::exception_list exceptions)
{
auto ss_err = std::stringstream{};
ss_err << "Caught asynchronous SYCL exception(s):\n";
for(std::exception_ptr e : exceptions)
{
try
{
std::rethrow_exception(e);
}
catch(sycl::exception const& err)
{
ss_err << err.what() << " (" << err.code() << ")\n";
}
}
throw std::runtime_error(ss_err.str());
}};
return contextMap[platformName] = context;
}
return sycl::context{};
}
std::map<std::string, sycl::context> contextMap;
};
} // namespace detail
namespace onHost
{
namespace syclGeneric
{
template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
struct Platform : std::enable_shared_from_this<Platform<T_ApiInterface, T_DeviceKind>>
{
public:
Platform() : contextManager{make_sharedSingleton<detail::Context>()}
{
try
{
syclPlatform = sycl::platform{detail::SYCLDeviceSelector<T_DeviceKind>{}};
syclDevices = syclPlatform->get_devices();
devices.resize(syclDevices.size());
syclContext = contextManager->getContext(syclPlatform.value());
}
catch(...)
{
syclContext.reset();
syclPlatform.reset();
syclDevices.clear();
devices.clear();
}
}
Platform(Platform const&) = delete;
Platform& operator=(Platform const&) = delete;
Platform(Platform&&) = delete;
Platform& operator=(Platform&&) = delete;
std::shared_ptr<Platform<T_ApiInterface, T_DeviceKind>> getSharedPtr()
{
return this->shared_from_this();
}
auto getContext() const
{
if(!syclContext.has_value())
throw std::runtime_error("The underlying SYCL context is invalid.");
return syclContext.value();
}
uint32_t getDeviceCount() const
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::
Op<T_DeviceKind, ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<Platform>()))>::value;
if constexpr(isSupportedDev)
{
auto numDevices = devices.size();
return static_cast<uint32_t>(numDevices);
}
return 0u;
}
Handle<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>> makeDevice(uint32_t const& idx)
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
uint32_t const numDevices = getDeviceCount();
if(idx >= numDevices)
{
std::stringstream ssErr;
ssErr << "Unable to return device handle for SYCL device with index " << idx
<< " because there are only " << numDevices << " devices!";
throw std::runtime_error(ssErr.str());
}
std::lock_guard<std::mutex> lk{deviceGuard};
if(auto sharedPtr = devices[idx].lock())
{
return sharedPtr;
}
auto newDevice = std::make_shared<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>>(
std::move(getSharedPtr()),
syclDevices[idx],
idx);
devices[idx] = newDevice;
return newDevice;
}
static constexpr auto getName()
{
return onHost::demangledName<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>();
}
friend struct internal::GetDeviceProperties::Op<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>;
private:
friend struct onHost::internal::IsDataAccessible;
friend struct GetDeviceProperties;
// The context manager is required so that the same sycl context can be used for different device types
std::shared_ptr<alpaka::detail::Context> contextManager;
std::optional<sycl::context> syclContext;
// native sycl platform for the corresponding device kind this platform is representing
std::optional<sycl::platform> syclPlatform;
// native sycl devices for the corresponding device kind this platform is representing
std::vector<sycl::device> syclDevices;
// alpaka devices for the internal hierarchy
std::vector<std::weak_ptr<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>>> devices;
std::mutex deviceGuard;
void _()
{
static_assert(internal::concepts::Platform<Platform>);
}
};
} // namespace syclGeneric
namespace internal
{
template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
struct GetDeviceProperties::Op<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>
{
DeviceProperties operator()(
syclGeneric::Platform<T_ApiInterface, T_DeviceKind> const& platform,
uint32_t deviceIdx) const
{
ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
if(deviceIdx >= platform.syclDevices.size())
{
std::stringstream ssErr;
ssErr << "Unable to return device properties for SYCL device with index " << deviceIdx
<< " because there are only " << platform.getDeviceCount() << " devices!";
throw std::runtime_error(ssErr.str());
}
sycl::device const dev = platform.syclDevices[deviceIdx];
auto prop = DeviceProperties{};
prop.m_name = dev.get_info<sycl::info::device::name>();
prop.m_maxThreadsPerBlock = dev.get_info<sycl::info::device::max_work_group_size>();
std::vector<std::size_t> warp_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
// @todo do not reduce the warp size to a single value, return all values
prop.m_warpSize = static_cast<uint32_t>(std::reduce(
warp_sizes.begin(),
warp_sizes.end(),
std::size_t{0},
[](std::size_t a, std::size_t b)
{
// The CPU runtime supports a sub-group size of 64, but the SYCL implementation
// currently does not, therefore the reported warp size is capped at 32
return std::max(a, b) <= 32 ? std::max(a, b) : 32;
}));
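/* Worked example (illustration only): for sub-group sizes {8, 16, 32} the reduction yields 32;
* for {16, 32, 64} the intermediate maximum of 64 exceeds the cap, so the reported warp size is 32.
*/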
prop.m_multiProcessorCount = dev.get_info<sycl::info::device::max_compute_units>();
return prop;
}
};
} // namespace internal
} // namespace onHost
} // namespace alpaka
#endif
// ==
// == ./include/alpaka/api/syclGeneric/Platform.hpp ==
// ============================================================================
// # include "alpaka/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/tag.hpp" // amalgamate: file already inlined
namespace alpaka
{
namespace detail
{
template<>
struct SYCLDeviceSelector<deviceKind::Cpu>
{
auto operator()(sycl::device const& dev) const -> int
{
return dev.is_cpu() ? 1 : -1;
}
};
template<>
struct SYCLDeviceSelector<deviceKind::IntelGpu>
{
auto operator()(sycl::device const& dev) const -> int
{
auto const& vendor = dev.get_info<sycl::info::device::vendor>();
auto const is_intel_gpu = dev.is_gpu() && (vendor.find("Intel(R) Corporation") != std::string::npos);
return is_intel_gpu ? 1 : -1;
}
};
template<>
struct SYCLDeviceSelector<deviceKind::NvidiaGpu>
{
auto operator()(sycl::device const& dev) const -> int
{
auto const& vendor = dev.get_info<sycl::info::device::vendor>();
auto const is_nvidia_gpu = dev.is_gpu() && (vendor.find("NVIDIA") != std::string::npos);
return is_nvidia_gpu ? 1 : -1;
}
};
template<>
struct SYCLDeviceSelector<deviceKind::AmdGpu>
{
auto operator()(sycl::device const& dev) const -> int
{
auto const& vendor = dev.get_info<sycl::info::device::vendor>();
auto const is_amd_gpu = dev.is_gpu() && (vendor.find("AMD") != std::string::npos);
return is_amd_gpu ? 1 : -1;
}
};
} // namespace detail
namespace onHost
{
namespace internal
{
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct MakePlatform::Op<api::OneApi, T_DeviceKind>
{
auto operator()(api::OneApi const&, T_DeviceKind) const
{
return onHost::make_sharedSingleton<syclGeneric::Platform<api::OneApi, T_DeviceKind>>();
}
};
} // namespace internal
} // namespace onHost
namespace internal
{
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct GetApi::Op<onHost::syclGeneric::Platform<api::OneApi, T_DeviceKind>>
{
decltype(auto) operator()(auto&& platform) const
{
return api::OneApi{};
}
};
template<alpaka::concepts::DeviceKind T_DeviceKind>
struct GetDeviceType::Op<onHost::syclGeneric::Platform<api::OneApi, T_DeviceKind>>
{
decltype(auto) operator()(auto&& platform) const
{
return T_DeviceKind{};
}
};
} // namespace internal
} // namespace alpaka
namespace alpaka::onHost::internal
{
template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
struct IsDataAccessible::SecondPath<api::OneApi, T_DeviceKind, T_Any>
{
static void getPtrType(auto const& platform, auto& sycl_data_alloc_type, auto const& view)
{
auto sycl_context = platform->getContext();
auto sycl_alloc_type = get_pointer_type(Data::data(view), sycl_context);
if(sycl_alloc_type != sycl::usm::alloc::unknown)
sycl_data_alloc_type = sycl_alloc_type;
}
bool operator()(api::OneApi usedApi, T_DeviceKind deviceKind, T_Any const& view) const
{
auto deviceKindList = onHost::supportedDevices(usedApi);
auto sycl_data_alloc_type = sycl::usm::alloc::unknown;
alpaka::apply(
[&sycl_data_alloc_type, &view](auto... devKind)
{
(getPtrType(
onHost::make_sharedSingleton<syclGeneric::Platform<api::OneApi, ALPAKA_TYPEOF(devKind)>>(),
sycl_data_alloc_type,
view),
...);
},
deviceKindList);
if(deviceKind == deviceKind::cpu)
{
/* If the device kind is not CPU and the usm alloc type is shared, we do not know if the memory is shared
* within the same sycl context. Therefore we mark only shared and host allocated memory as
* accessible in case the device kind is CPU.
*/
if(sycl_data_alloc_type == sycl::usm::alloc::shared || sycl_data_alloc_type == sycl::usm::alloc::host)
return true;
}
return false;
}
};
} // namespace alpaka::onHost::internal
#endif
// ==
// == ./include/alpaka/api/oneApi/Platform.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/oneApi/Queue.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig, René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_ONEAPI
// # include "alpaka/api/generic.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/oneApi/StaticSharedMemory.hpp ==
// ==
/* Copyright 2025 Rene Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_ONEAPI
// # include "alpaka/Vec.hpp" // amalgamate: file already inlined
// # include "alpaka/core/Assert.hpp" // amalgamate: file already inlined
// # include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// # include "alpaka/tag.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
// # include <functional> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace oneApi
{
namespace detail
{
/** Pointer lookup table
*
* Provides a dynamic lookup table to map a unique id to a pointer.
*/
class PtrLookupTable
{
struct MetaData
{
//! pointer to allocated data
std::byte* ptr = nullptr;
//! Unique id of the data chunk.
size_t id = std::numeric_limits<size_t>::max();
};
static constexpr uint32_t metaDataSize = sizeof(MetaData);
public:
# ifndef NDEBUG
PtrLookupTable(std::byte* mem, uint32_t capacity)
: m_mem(reinterpret_cast<MetaData*>(mem))
, m_capacity(capacity / metaDataSize)
{
ALPAKA_ASSERT_ACC((m_mem == nullptr) == (m_capacity == 0u));
}
# else
PtrLookupTable(std::byte* mem, uint32_t) : m_mem(reinterpret_cast<MetaData*>(mem))
{
}
# endif
/** number of bytes required for bookkeeping of maxNumUniqueAllocations unique allocations
*
* @param maxNumUniqueAllocations number of unique allocations a user is allowed to perform
* @return bytes required to store lookup meta data
*/
static consteval uint32_t sizeLookupBufferInBytes(uint32_t maxNumUniqueAllocations)
{
return metaDataSize * maxNumUniqueAllocations;
}
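/* Illustration (assuming a typical 64-bit host where sizeof(std::byte*) == sizeof(size_t) == 8):
* each MetaData entry occupies 16 bytes, so sizeLookupBufferInBytes(32) reserves 512 bytes of
* local memory for the lookup table.
*/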
/* With oneApi 2025.2 the behaviour of shared memory allocation has changed. It behaves like CUDA
* shared memory. Therefore, we need a unique data type to avoid pointer aliasing. Using the helper
* class for data alignment is backward compatible with previous versions. The reason for using std::byte
* is that it guarantees support for data types which are not trivially constructible.
*/
template<typename T, size_t T_id>
struct alignas(T) SharedMemData
{
std::byte data[sizeof(T)];
};
template<typename T, size_t T_id>
T* alloc() const
{
auto group = sycl::ext::oneapi::this_work_item::get_work_group<1>();
SharedMemData<T, T_id>* data
= sycl::ext::oneapi::group_local_memory_for_overwrite<SharedMemData<T, T_id>>(group);
MetaData& metaDataEntry = m_mem[m_numEntries];
++m_numEntries;
ALPAKA_ASSERT_ACC(m_numEntries <= m_capacity);
// Update meta data with id and pointer to the current allocation
if(group.get_local_linear_id() == 0u)
{
// only one thread must update the pointer in shared memory
metaDataEntry.ptr = reinterpret_cast<std::byte*>(data);
}
metaDataEntry.id = T_id;
return reinterpret_cast<T*>(data);
}
//! Get the pointer to an existing variable
//!
//! @tparam T type of the variable
//! @param id unique id of the variable
//! @return nullptr if no variable with the given id exists
template<typename T>
auto getVarPtr(size_t id) const -> T*
{
// Iterate over metadata
for(uint32_t off = 0u; off < m_numEntries; ++off)
{
MetaData& metaDataEntry = m_mem[off];
if(metaDataEntry.id == id)
return reinterpret_cast<T*>(metaDataEntry.ptr);
}
// Variable not found.
return nullptr;
}
private:
//! Number of unique meta data entries stored
mutable uint32_t m_numEntries = 0u;
//! Memory layout
//! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
//! Size of padding can be zero if data after padding is already aligned.
MetaData* const m_mem;
# ifndef NDEBUG
//! max number of meta data entries
uint32_t const m_capacity;
# endif
};
} // namespace detail
class StaticSharedMemory : private detail::PtrLookupTable
{
public:
/** number of bytes required for bookkeeping of maxNumUniqueAllocations unique allocations
*
* @param maxNumUniqueAllocations number of unique allocations a user is allowed to perform
* @return bytes required to store lookup meta data
*/
static consteval uint32_t sizeLookupBufferInBytes(uint32_t maxNumUniqueAllocations)
{
return detail::PtrLookupTable::sizeLookupBufferInBytes(maxNumUniqueAllocations);
}
StaticSharedMemory(StaticSharedMemory const&) = delete;
/** Construct the shared memory allocator
*
* @param accessor local memory accessor used to store the lookup meta data; the number of bytes
* required to store N unique allocations can be calculated with sizeLookupBufferInBytes()
*/
StaticSharedMemory(sycl::local_accessor<std::byte> const& accessor)
: PtrLookupTable(
reinterpret_cast<std::byte*>(accessor.get_multi_ptr<sycl::access::decorated::no>().get()),
static_cast<uint32_t>(accessor.size()))
{
}
using Base = detail::PtrLookupTable;
template<typename T, size_t T_unique>
T& allocVar()
{
T* data = Base::template getVarPtr<T>(T_unique);
if(!data)
{
data = Base::template alloc<T, T_unique>();
}
ALPAKA_ASSERT(data != nullptr);
return *data;
}
};
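/* Minimal usage sketch (hypothetical, for illustration only): inside a kernel each shared variable
* is identified by a compile-time id, e.g.
*
* onAcc::oneApi::StaticSharedMemory ssm{accessor}; // accessor is a sycl::local_accessor<std::byte>
* int& counter = ssm.allocVar<int, 42>(); // the first call with id 42 allocates, later calls
* // with the same id return the same block-shared variable
*
* The number of distinct ids per block is bounded by ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS.
*/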
} // namespace oneApi
} // namespace alpaka::onAcc
#endif
// ==
// == ./include/alpaka/api/oneApi/StaticSharedMemory.hpp ==
// ============================================================================
// # include "alpaka/api/syclGeneric/Queue.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/syclGeneric/onAcc.hpp ==
// ==
/* Copyright 2025 Simeon Ehrig
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// # include "alpaka/Vec.hpp" // amalgamate: file already inlined
// # include "alpaka/core/Assert.hpp" // amalgamate: file already inlined
// # include "alpaka/core/Dict.hpp" // amalgamate: file already inlined
// # include "alpaka/tag.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
// # include <functional> // amalgamate: file already included
namespace alpaka::onAcc
{
namespace syclGeneric
{
template<auto T_syclDim, typename T_OptimizedThreadSpec>
class BlockLayer
{
using IdxType = typename T_OptimizedThreadSpec::NumBlocksVecType::type;
sycl::nd_item<T_syclDim> const& m_item;
T_OptimizedThreadSpec const& m_optimizedThreadSpec;
// dimension of the alpaka objects
static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
public:
BlockLayer(sycl::nd_item<T_syclDim> const& item, T_OptimizedThreadSpec const& optimizedThreadSpec)
: m_item(item)
, m_optimizedThreadSpec(optimizedThreadSpec)
{
}
constexpr auto idx() const -> Vec<IdxType, dim>
{
if constexpr(dim == 1)
{
return Vec<IdxType, 1u>{m_item.get_group(0)};
}
else if constexpr(dim == 2)
{
return Vec<IdxType, 2u>{m_item.get_group(0), m_item.get_group(1)};
}
else if constexpr(dim == 3)
{
return Vec<IdxType, 3u>{m_item.get_group(0), m_item.get_group(1), m_item.get_group(2)};
}
else
{
return mapToND(m_optimizedThreadSpec.m_numBlocks, static_cast<IdxType>(m_item.get_group(0)));
}
}
constexpr auto count() const -> Vec<IdxType, dim>
{
if constexpr(dim == 1)
{
return Vec<IdxType, 1u>{m_item.get_group_range(0)};
}
else if constexpr(dim == 2)
{
return Vec<IdxType, 2u>{m_item.get_group_range(0), m_item.get_group_range(1)};
}
else if constexpr(dim == 3)
{
return Vec<IdxType, 3u>{
m_item.get_group_range(0),
m_item.get_group_range(1),
m_item.get_group_range(2)};
}
else
{
return m_optimizedThreadSpec.m_numBlocks;
}
}
};
template<auto T_syclDim, typename T_OptimizedThreadSpec>
class ThreadLayer
{
using IdxType = typename T_OptimizedThreadSpec::NumThreadsVecType::type;
sycl::nd_item<T_syclDim> const& m_item;
T_OptimizedThreadSpec const& m_optimizedThreadSpec;
// dimension of the alpaka objects
static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
public:
ThreadLayer(sycl::nd_item<T_syclDim> const& item, T_OptimizedThreadSpec const& optimizedThreadSpec)
: m_item(item)
, m_optimizedThreadSpec(optimizedThreadSpec)
{
}
constexpr auto idx() const -> Vec<IdxType, dim>
{
if constexpr(dim == 1)
{
return Vec<IdxType, 1u>{m_item.get_local_id(0)};
}
else if constexpr(dim == 2)
{
return Vec<IdxType, 2u>{m_item.get_local_id(0), m_item.get_local_id(1)};
}
else if constexpr(dim == 3)
{
return Vec<IdxType, 3u>{m_item.get_local_id(0), m_item.get_local_id(1), m_item.get_local_id(2)};
}
else
{
return mapToND(m_optimizedThreadSpec.m_numThreads, static_cast<IdxType>(m_item.get_local_id(0)));
}
}
constexpr auto count() const -> Vec<IdxType, dim>
{
if constexpr(dim == 1)
{
return Vec<IdxType, 1u>{m_item.get_local_range(0)};
}
else if constexpr(dim == 2)
{
return Vec<IdxType, 2u>{m_item.get_local_range(0), m_item.get_local_range(1)};
}
else if constexpr(dim == 3)
{
return Vec<IdxType, 3u>{
m_item.get_local_range(0),
m_item.get_local_range(1),
m_item.get_local_range(2)};
}
else
{
return m_optimizedThreadSpec.m_numThreads;
}
}
constexpr auto count() const
requires alpaka::concepts::CVector<typename T_OptimizedThreadSpec::NumThreadsVecType>
{
return typename T_OptimizedThreadSpec::NumThreadsVecType{};
}
};
template<auto T_syclDim>
class Sync
{
sycl::nd_item<T_syclDim> const& m_item;
public:
Sync(sycl::nd_item<T_syclDim> const& item) : m_item(item)
{
}
void operator()() const
{
m_item.barrier();
}
};
class DynamicSharedMemory
{
sycl::local_accessor<std::byte> const& m_accessor;
public:
DynamicSharedMemory(sycl::local_accessor<std::byte> const& accessor) : m_accessor(accessor)
{
}
template<typename T, size_t>
T* allocDynamic(uint32_t)
{
return reinterpret_cast<T*>(m_accessor.get_multi_ptr<sycl::access::decorated::no>().get());
}
constexpr size_t byte_size() noexcept
{
return m_accessor.byte_size();
}
};
} // namespace syclGeneric
} // namespace alpaka::onAcc
#endif
// ==
// == ./include/alpaka/api/syclGeneric/onAcc.hpp ==
// ============================================================================
// # include "alpaka/onHost/internal/interface.hpp" // amalgamate: file already inlined
# ifndef ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS
# define ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS 32u
# endif
# ifndef SYCL_EXT_ONEAPI_MEMCPY2D
# error \
"SYCL_EXT_ONEAPI_MEMCPY2D is not defined. Extension https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_memcpy2d.asciidoc is required!"
# endif
namespace alpaka::onHost::internal
{
template<typename T_Device, typename T_Dest, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> > 1u)
struct Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
{
void operator()(syclGeneric::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
{
sycl::queue sycl_queue = queue.getNativeHandle();
auto extentMd = pCast<size_t>(extents);
auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
auto* destPtr = data(dest);
constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
[[maybe_unused]] sycl::event ev;
if constexpr(dim == 2u)
{
ev = sycl_queue.ext_oneapi_memset2d(
destPtr,
destPitchBytesWithoutColumn.back(),
byteValue,
extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
extentMd.y());
}
else if constexpr(dim >= 3u)
{
auto const dstExtentWithoutColumn = extentMd.eraseBack();
ev = sycl_queue.ext_oneapi_memset2d(
destPtr,
destPitchBytesWithoutColumn.back(),
byteValue,
extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
dstExtentWithoutColumn.product());
}
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> > 1u)
struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
auto&& dest,
T_Source const& source,
T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
{
sycl::queue sycl_queue = queue.getNativeHandle();
auto extentMd = pCast<size_t>(extents);
auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
auto* destPtr = data(dest);
auto const sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();
auto* sourcePtr = data(source);
constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
[[maybe_unused]] sycl::event ev;
if constexpr(dim == 2u)
{
ev = sycl_queue.ext_oneapi_memcpy2d(
destPtr,
destPitchBytesWithoutColumn.back(),
sourcePtr,
sourcePitchBytesWithoutColumn.back(),
extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
extentMd.y());
}
else if constexpr(dim >= 3u)
{
auto const dstExtentWithoutColumn = extentMd.eraseBack();
ev = sycl_queue.ext_oneapi_memcpy2d(
destPtr,
destPitchBytesWithoutColumn.back(),
sourcePtr,
sourcePitchBytesWithoutColumn.back(),
extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
dstExtentWithoutColumn.product());
}
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
requires(alpaka::trait::getDim_v<T_Extents> > 1u)
struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
auto&& dest,
T_Value elementValue,
T_Extents const& extents) const
requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
&& std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
{
auto executors = supportedExecutors(getDevice(queue), exec::allExecutors);
// avoid passing a SharedBuffer directly and convert non-alpaka data views
auto dataView = makeView(dest);
alpaka::internal::generic::fill(queue, std::get<0>(executors), dataView.getSubView(extents), elementValue);
}
};
namespace detail
{
template<alpaka::concepts::Vector TVec>
inline constexpr auto vecToSyclRange(TVec vec)
{
constexpr auto dim = std::decay_t<TVec>::dim();
return [&vec]<auto... I>(std::index_sequence<I...>)
// TODO: check if this is the correct order
{ return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
};
template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
struct OptimizedThreadSpec
{
using NumBlocksVecType = typename T_NumBlocks::UniVec;
using NumThreadsVecType = T_NumThreads;
static consteval uint32_t dim()
{
return T_NumThreads::dim();
}
constexpr OptimizedThreadSpec(T_NumBlocks const&, T_NumThreads const&)
{
}
};
/** provides the sycl worker description
*
* @return A pair of the sycl nd range and an optimized thread spec. The thread spec does not hold any data
* for dimensions smaller than or equal to 3.
*/
template<onHost::concepts::ThreadSpec T_ThreadSpec>
inline constexpr auto getWorkerDescription(T_ThreadSpec const& threadSpec)
{
constexpr uint32_t dim = T_ThreadSpec::dim();
// dimension of the sycl nd range
constexpr uint32_t syclDim = dim >= 4u ? 1u : dim;
sycl::nd_range<syclDim> gridRange;
if constexpr(T_ThreadSpec::dim() >= 4u)
{
gridRange = sycl::nd_range<syclDim>{
(threadSpec.m_numBlocks * threadSpec.m_numThreads).product(),
threadSpec.m_numThreads.product()};
}
else
{
gridRange = sycl::nd_range<T_ThreadSpec::dim()>{
detail::vecToSyclRange(threadSpec.m_numBlocks * threadSpec.m_numThreads),
detail::vecToSyclRange(threadSpec.m_numThreads)};
}
using ThreadSpecType = std::conditional_t<
dim >= 4u,
ALPAKA_TYPEOF(threadSpec),
detail::OptimizedThreadSpec<
typename ALPAKA_TYPEOF(threadSpec)::NumBlocksVecType,
typename ALPAKA_TYPEOF(threadSpec)::NumThreadsVecType>>;
// thread spec which only holds data if the dimension is larger than 3
auto optimizedThreadSpec = ThreadSpecType(threadSpec.m_numBlocks, threadSpec.m_numThreads);
return std::make_pair(gridRange, optimizedThreadSpec);
}
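/* Illustration (not part of the implementation): for a 2D thread spec with m_numBlocks = {4, 2} and
* m_numThreads = {8, 16} the resulting sycl::nd_range<2> has the global range {32, 32} and the local
* range {8, 16}. For dim >= 4 everything is linearized, e.g. blocks {2, 2, 2, 2} with threads
* {4, 4, 4, 4} become a 1D nd_range with global size 8*8*8*8 = 4096 and work-group size 256.
*/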
} // namespace detail
template<
typename T_Device,
alpaka::concepts::Executor T_Executor,
onHost::concepts::ThreadSpec T_ThreadSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct Enqueue::Kernel<syclGeneric::Queue<T_Device>, T_Executor, T_ThreadSpec, T_KernelBundle>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
T_Executor const executor,
T_ThreadSpec const& threadBlocking,
T_KernelBundle const& kernelBundle) const
{
constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
// allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
uint32_t blockDynSharedMemBytes
= std::max(uint32_t(1), onHost::getDynSharedMemBytes(executor, threadBlocking, kernelBundle));
assert(
st_shared_mem_bytes + blockDynSharedMemBytes
<= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
[[maybe_unused]] sycl::event ev = queue.m_queue.submit(
[threadBlocking, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
{
using ApiType = decltype(getApi(queue));
using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));
auto st_shared_accessor
= sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
auto dyn_shared_accessor
= sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
auto workerDesc = detail::getWorkerDescription(threadBlocking);
auto optimizedThreadSpec = workerDesc.second;
constexpr uint32_t syclDim = workerDesc.first.dimensions;
cgh.parallel_for(
workerDesc.first,
[optimizedThreadSpec, st_shared_accessor, dyn_shared_accessor, kernelBundle](
sycl::nd_item<syclDim> work_item)
{
onAcc::oneApi::StaticSharedMemory ssm(st_shared_accessor);
onAcc::syclGeneric::DynamicSharedMemory dsm(dyn_shared_accessor);
static_assert(syclDim > 0);
static_assert(syclDim <= 3, "more than 3 dimensions are not supported");
auto acc = onAcc::Acc{Dict{
DictEntry(
layer::block,
onAcc::syclGeneric::BlockLayer{work_item, optimizedThreadSpec}),
DictEntry(
layer::thread,
onAcc::syclGeneric::ThreadLayer{work_item, optimizedThreadSpec}),
DictEntry(layer::shared, std::ref(ssm)),
DictEntry(layer::dynShared, std::ref(dsm)),
DictEntry(object::dynSharedMemBytes, dsm.byte_size()),
DictEntry(action::threadBlockSync, onAcc::syclGeneric::Sync{work_item}),
DictEntry(object::api, ApiType{}),
DictEntry(object::deviceKind, DeviceKindType{}),
DictEntry(object::exec, T_Executor{})}};
kernelBundle(acc);
});
});
if(queue.isBlocking())
ev.wait_and_throw();
}
};
template<
typename T_Device,
alpaka::concepts::Executor T_Executor,
onHost::concepts::FrameSpec T_FrameSpec,
alpaka::concepts::KernelBundle T_KernelBundle>
struct Enqueue::Kernel<syclGeneric::Queue<T_Device>, T_Executor, T_FrameSpec, T_KernelBundle>
{
void operator()(
syclGeneric::Queue<T_Device>& queue,
T_Executor const executor,
T_FrameSpec const& frameSpec,
T_KernelBundle const& kernelBundle) const
{
auto const threadBlocking
= internal::adjustThreadSpec(*queue.m_device.get(), executor, frameSpec, kernelBundle);
constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
// allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
uint32_t blockDynSharedMemBytes
= std::max(uint32_t(1), onHost::getDynSharedMemBytes(executor, threadBlocking, kernelBundle));
assert(
st_shared_mem_bytes + blockDynSharedMemBytes
<= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
[[maybe_unused]] sycl::event ev = queue.m_queue.submit(
[threadBlocking, frameSpec, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
{
using ApiType = decltype(getApi(queue));
using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));
auto st_shared_accessor
= sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
auto dyn_shared_accessor
= sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
auto workerDesc = detail::getWorkerDescription(threadBlocking);
auto optimizedThreadSpec = workerDesc.second;
constexpr uint32_t syclDim = workerDesc.first.dimensions;
cgh.parallel_for(
workerDesc.first,
[optimizedThreadSpec, frameSpec, st_shared_accessor, dyn_shared_accessor, kernelBundle](
sycl::nd_item<syclDim> work_item)
{
onAcc::oneApi::StaticSharedMemory ssm(st_shared_accessor);
onAcc::syclGeneric::DynamicSharedMemory dsm(dyn_shared_accessor);
static_assert(syclDim > 0);
static_assert(syclDim <= 3, "more than 3 dimensions are not supported");
auto acc = onAcc::Acc{Dict{
DictEntry(
layer::block,
onAcc::syclGeneric::BlockLayer{work_item, optimizedThreadSpec}),
DictEntry(
layer::thread,
onAcc::syclGeneric::ThreadLayer{work_item, optimizedThreadSpec}),
DictEntry(layer::shared, std::ref(ssm)),
DictEntry(layer::dynShared, std::ref(dsm)),
DictEntry(object::dynSharedMemBytes, dsm.byte_size()),
DictEntry(action::threadBlockSync, onAcc::syclGeneric::Sync{work_item}),
DictEntry(object::api, ApiType{}),
DictEntry(object::deviceKind, DeviceKindType{}),
DictEntry(object::exec, T_Executor{}),
DictEntry(frame::count, frameSpec.m_numFrames),
DictEntry(frame::extent, frameSpec.m_frameExtent)}};
kernelBundle(acc);
});
});
if(queue.isBlocking())
ev.wait_and_throw();
}
};
} // namespace alpaka::onHost::internal
#endif
// ==
// == ./include/alpaka/api/oneApi/Queue.hpp ==
// ============================================================================
// #include "alpaka/api/oneApi/executor.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/oneApi/math.hpp ==
// ==
/* Copyright 2025 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/oneApi/Api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/syclGeneric/tag.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
namespace alpaka::trait
{
template<>
struct GetMathImpl::Op<alpaka::api::OneApi>
{
constexpr decltype(auto) operator()(alpaka::api::OneApi const) const
{
return alpaka::math::internal::syclMath;
}
};
} // namespace alpaka::trait
// ==
// == ./include/alpaka/api/oneApi/math.hpp ==
// ============================================================================
// #include "alpaka/api/syclGeneric/Event.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/api/syclGeneric/atomic.hpp ==
// ==
/* Copyright 2025 Jan Stephan, Andrea Bocci, Luca Ferragina
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// # include "alpaka/api/syclGeneric/tag.hpp" // amalgamate: file already inlined
// # include "alpaka/onAcc/atomicOp.hpp" // amalgamate: file already inlined
// # include "alpaka/onAcc/internal/interface.hpp" // amalgamate: file already inlined
// # include "alpaka/onAcc/scope.hpp" // amalgamate: file already inlined
// # include <sycl/sycl.hpp> // amalgamate: file already included
// # include <cstdint> // amalgamate: file already included
# include <type_traits>
namespace alpaka::detail
{
template<typename T_Scope>
struct SyclMemoryScope
{
};
template<>
struct SyclMemoryScope<alpaka::onAcc::scope::System>
{
static constexpr auto value = sycl::memory_scope::system;
};
template<>
struct SyclMemoryScope<alpaka::onAcc::scope::Device>
{
static constexpr auto value = sycl::memory_scope::device;
};
template<>
struct SyclMemoryScope<alpaka::onAcc::scope::Block>
{
static constexpr auto value = sycl::memory_scope::work_group;
};
template<typename T, typename T_Scope>
using sycl_atomic_ref = sycl::atomic_ref<T, sycl::memory_order::relaxed, SyclMemoryScope<T_Scope>::value>;
template<typename T_Scope, typename T, typename TOp>
inline auto callAtomicOp(T* const addr, TOp&& op)
{
auto ref = sycl_atomic_ref<T, T_Scope>{*addr};
return op(ref);
}
template<typename TRef, typename T, typename TEval>
inline auto casWithCondition(T* const addr, TEval&& eval)
{
auto ref = TRef{*addr};
auto old_val = ref.load();
// prefer compare_exchange_weak when in a loop, assuming that eval is not expensive
while(!ref.compare_exchange_weak(old_val, eval(old_val)))
{
}
return old_val;
}
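/* Sketch of how the CAS loop is used below (illustration only): AtomicInc passes an eval functor
* computing old >= value ? 0 : old + 1, so casWithCondition repeatedly reads the current value,
* computes the wrapped increment and retries compare_exchange_weak until no other thread modified
* the value in between; the value observed just before the successful exchange is returned.
*/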
} // namespace alpaka::detail
namespace alpaka::onAcc::internalCompute
{
// Add.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicAdd, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_add(value); });
}
};
// Sub.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicSub, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_sub(value); });
}
};
// Min.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicMin, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_min(value); });
}
};
// Max.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicMax, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_max(value); });
}
};
// Exch.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicExch, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(
(std::is_integral_v<T> || std::is_floating_point_v<T>) and (sizeof(T) == 4 || sizeof(T) == 8),
"SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.exchange(value); });
}
};
// Inc.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicInc, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(
std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
"SYCL atomics support only 32- and 64-bits unsigned integral types");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
auto inc = [&value](auto old_val)
{ return (old_val >= value) ? static_cast<T>(0) : (old_val + static_cast<T>(1)); };
return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, T_Scope>>(addr, inc);
}
};
// Dec.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicDec, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(
std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
"SYCL atomics support only 32- and 64-bits unsigned integral types");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
auto dec = [&value](auto& old_val)
{ return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast<T>(1)); };
return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, T_Scope>>(addr, dec);
}
};
// And.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicAnd, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_and(value); });
}
};
// Or.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicOr, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_or(value); });
}
};
// Xor.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicXor, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
{
return alpaka::detail::callAtomicOp<T_Scope>(addr, [&value](auto& ref) { return ref.fetch_xor(value); });
}
};
// Cas.
//! The SYCL accelerator atomic operation.
template<typename T, typename T_Scope>
struct Atomic::Op<AtomicCas, onAcc::internal::SyclAtomic, T, T_Scope>
{
static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& expected, T const& desired)
-> T
{
auto cas = [&expected, &desired](auto& ref)
{
auto expected_ = expected;
// Atomically compares the value of `ref` with the value of `expected`.
// If the values are equal, replaces the value of `ref` with `desired`.
// Otherwise updates `expected` with the value of `ref`.
// Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of
// it.
ref.compare_exchange_strong(expected_, desired);
// If the update succeeded, return the previous value of `ref`.
// Otherwise, return the current value of `ref`.
return expected_;
};
return alpaka::detail::callAtomicOp<T_Scope>(addr, cas);
}
};
} // namespace alpaka::onAcc::internalCompute
#endif
// ==
// == ./include/alpaka/api/syclGeneric/atomic.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/api/syclGeneric/math.hpp ==
// ==
/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bert Wesarg, Valentin Gehrke, René Widera,
* Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Jeffrey Kelling, Sergei Bastrakov
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/config.hpp" // amalgamate: file already inlined
#if ALPAKA_LANG_SYCL
// # include "alpaka/api/api.hpp" // amalgamate: file already inlined
// # include "alpaka/api/syclGeneric/tag.hpp" // amalgamate: file already inlined
// # include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/math/internal/Complex.hpp ==
// ==
/* Copyright 2024 Sergei Bastrakov, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/alpaka/math.hpp ==
// ==
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/api/api.hpp" // amalgamate: file already inlined
// #include "alpaka/api/trait.hpp" // amalgamate: file already inlined
// #include "alpaka/math/internal/math.hpp" // amalgamate: file already inlined
// #include <cmath> // amalgamate: file already included
namespace alpaka::math
{
constexpr auto abs(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Abs::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto sin(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Sin::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto acosh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Acosh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto asinh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Asinh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto sinh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Sinh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto atan(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Atan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto trunc(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Trunc::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto isinf(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Isinf::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto isfinite(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Isfinite::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto atanh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Atanh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto tanh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Tanh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto cbrt(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Cbrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto ceil(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Ceil::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from zero,
* regardless of the current rounding mode.
*/
constexpr auto round(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Round::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
* regardless of the current rounding mode.
*/
constexpr auto lround(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Lround::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
* regardless of the current rounding mode.
*/
constexpr auto llround(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Llround::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Creates a value with the magnitude of mag and the sign of sgn. */
constexpr auto copysign(auto const& mag, auto const& sgn)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Copysign::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(mag), ALPAKA_TYPEOF(sgn)>{}(
mathImpl,
mag,
sgn);
}
constexpr auto sincos(auto const& arg, auto& result_sin, auto& result_cos)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::SinCos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(
mathImpl,
arg,
result_sin,
result_cos);
}
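/* Usage sketch (illustration only): the sine and cosine of arg are written to the two output
* references, e.g. float s, c; alpaka::math::sincos(angle, s, c);
*/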
constexpr auto exp(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Exp::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto arg(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Arg::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto atan2(auto const& y, auto const& x)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Atan2::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(y), ALPAKA_TYPEOF(x)>{}(mathImpl, y, x);
}
// Square root function
constexpr auto sqrt(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Sqrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/* Computes the reciprocal square root (rsqrt) of arg.
*
* Valid real arguments are positive. For other values the result
* may depend on the backend and compilation options and will likely
* be NaN.
*/
constexpr auto rsqrt(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Rsqrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
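/* Illustrative relation between sqrt and rsqrt for a positive argument x (comment-only sketch):
 *
 *   alpaka::math::rsqrt(x); // ~ 1.0 / alpaka::math::sqrt(x)
 *
 * Backends may map rsqrt to a dedicated, possibly less precise, hardware instruction.
 */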
// Cosine function
constexpr auto cos(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Cos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto cosh(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Cosh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto erf(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Erf::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto floor(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Floor::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the natural (base e) logarithm of arg.
 *
 * Valid real arguments are non-negative. For other values the result
 * may depend on the backend and compilation options and will likely
 * be NaN.
 */
constexpr auto log(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Log::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the binary (base 2) logarithm of arg.
 *
 * Valid real arguments are non-negative. For other values the result
 * may depend on the backend and compilation options and will likely
 * be NaN.
 */
constexpr auto log2(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Log2::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
/** Computes the common (base 10) logarithm of arg.
 *
 * Valid real arguments are non-negative. For other values the result
 * may depend on the backend and compilation options and will likely
 * be NaN.
 */
constexpr auto log10(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Log10::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
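/* Illustrative values for the logarithm family (comment-only sketch, assuming the usual
 * std::log/std::log2/std::log10 semantics of the underlying backend):
 *
 *   alpaka::math::log(2.718281828459045); // ~1.0 (natural, base e)
 *   alpaka::math::log2(8.0);              //  3.0 (base 2)
 *   alpaka::math::log10(1000.0);          //  3.0 (base 10)
 */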
// Tangent function
constexpr auto tan(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Tan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
// Arc cosine function
constexpr auto acos(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Acos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
// Arc sine function
constexpr auto asin(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Asin::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto isnan(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Isnan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
//! Computes the complex conjugate of arg.
constexpr auto conj(auto const& arg)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Conj::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
}
constexpr auto min(auto const& a, auto const& b)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Min::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(a), ALPAKA_TYPEOF(b)>{}(mathImpl, a, b);
}
constexpr auto max(auto const& a, auto const& b)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Max::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(a), ALPAKA_TYPEOF(b)>{}(mathImpl, a, b);
}
constexpr auto pow(auto const& base, auto const& exp)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Pow::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(base), ALPAKA_TYPEOF(exp)>{}(
mathImpl,
base,
exp);
}
constexpr auto fmod(auto const& x, auto const& y)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Fmod::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y)>{}(mathImpl, x, y);
}
constexpr auto remainder(auto const& x, auto const& y)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Remainder::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y)>{}(mathImpl, x, y);
}
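/* Illustrative difference between fmod and remainder (comment-only sketch, assuming the usual
 * std::fmod/std::remainder semantics of the underlying backend):
 *
 *   alpaka::math::fmod(5.5, 2.0);      //  1.5 (quotient truncated toward zero: 2)
 *   alpaka::math::remainder(5.5, 2.0); // -0.5 (quotient rounded to nearest: 3)
 */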
constexpr auto fma(auto const& x, auto const& y, auto const& z)
{
auto const mathImpl = trait::getMathImpl(thisApi());
return internal::Fma::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y), ALPAKA_TYPEOF(z)>{}(
mathImpl,
x,
y,
z);
}
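/* Illustrative sketch for fma (comment only): computes x * y + z; where the backend supports a
 * fused multiply-add this happens with a single rounding step:
 *
 *   alpaka::math::fma(2.0, 3.0, 1.0); // 7.0
 */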
} // namespace alpaka::math
// ==
// == ./include/alpaka/math.hpp ==
// ============================================================================
// ============================================================================
// == ./include/alpaka/math/floatEqualExact.hpp ==
// ==
/* Copyright 2021 Jiri Vyskocil
* SPDX-License-Identifier: MPL-2.0
*/
// #pragma once
// #include "alpaka/core/common.hpp" // amalgamate: file already inlined
#include <type_traits>
namespace alpaka
{
namespace math
{
/** Compare two floating point numbers for exact equivalence. Use only when necessary, and be aware of the
* implications. Most codes should not use this function and instead implement a correct epsilon-based
* comparison. If you are unfamiliar with the topic, check out
* https://www.geeksforgeeks.org/problem-in-comparing-floating-point-numbers-and-how-to-compare-them-correctly/
* or Goldberg 1991: "What every computer scientist should know about floating-point arithmetic",
* https://dl.acm.org/doi/10.1145/103162.103163
*
* This function calls the == operator for floating point types, but disables the warning issued by the
 * compiler when compiling with the float equality warning checks enabled. This warning is valid and valuable in
* most codes and should be generally enabled, but there are specific instances where a piece of code might
* need to do an exact comparison (e.g. @a CudaVectorArrayWrapperTest.cpp). The verbose name for the function
* is intentional as it should raise a red flag if used while not absolutely needed. Users are advised to add a
* justification whenever they use this function.
*
* @tparam T both operands have to be the same type and conform to std::is_floating_point
* @param a first operand
* @param b second operand
* @return a == b
*/
template<typename T>
ALPAKA_FN_INLINE constexpr auto floatEqualExactNoWarning(T a, T b) -> bool
{
static_assert(std::is_floating_point_v<T>, "floatEqualExactNoWarning is for floating point values only!");
// So far only GCC and Clang check for float comparison and both accept the GCC pragmas.
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
return a == b;
#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
}
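/* Illustrative usage (comment-only sketch); as documented above, prefer an epsilon-based
 * comparison unless an exact comparison is genuinely required:
 *
 *   bool const same = alpaka::math::floatEqualExactNoWarning(1.0f, 1.0f); // true, no -Wfloat-equal warning
 */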
} // namespace math
} // namespace alpaka
// ==
// == ./include/alpaka/math/floatEqualExact.hpp ==
// ============================================================================
// #include "alpaka/trait.hpp" // amalgamate: file already inlined
// #include "math.hpp" // amalgamate: file already inlined
// #include <cmath> // amalgamate: file already included
// #include <complex> // amalgamate: file already included
// #include <iostream> // amalgamate: file already included
#include <type_traits>
namespace alpaka::math
{
namespace internal
{
//! Implementation of a complex number useable on host and device.
//!
//! It follows the layout of std::complex and so array-oriented access.
//! The class template implements all methods and operators as std::complex<T>.
//! Additionally, it provides an implicit conversion to and from std::complex<T>.
//! All methods besides operators << and >> are host-device.
//! It does not provide non-member functions of std::complex besides the operators.
//! Those are provided the same way as alpaka math functions for real numbers.
//!
//! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
//!
//! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
//! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
//! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the
//! common interface for generic code anyways. So it is more clear to have alpaka's interface exactly matching
//! when possible, and not "improving".
//!
//! @tparam T type of the real and imaginary part: float, double, or long double.
template<typename T>
class Complex
{
public:
// Make sure the input type is floating-point
static_assert(std::is_floating_point_v<T>);
//! Type of the real and imaginary parts
using value_type = T;
//! Constructor from the given real and imaginary parts
constexpr Complex(T const& real = T{}, T const& imag = T{}) : m_real(real), m_imag(imag)
{
}
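/* Illustrative construction sketch (comment only), relying on the std::complex-compatible layout
 * and the implicit conversions described above:
 *
 *   alpaka::math::internal::Complex<float> c(1.0f, 2.0f); // real part 1, imaginary part 2
 *   std::complex<float> s = c;                            // implicit conversion to std::complex
 */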
//! Copy constructor
constexpr Complex(Complex const& oth