Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add PARTED kernels #382

Open
wants to merge 39 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
027d6f0
Add TRIAD_PARTED kernel
MrBurmark Oct 24, 2023
51057e3
Add TRIAD_PARTED_FUSED kernel
MrBurmark Oct 25, 2023
08e5c9f
Use direct dispatch in RAJA TRIAD_PARTED_FUSED
MrBurmark Oct 25, 2023
f1dc134
Add Geometric partition
MrBurmark Oct 27, 2023
84e1e6c
Add reuse tuning of TRIAD_PARTED_FUSED
MrBurmark Oct 30, 2023
36fb292
Add openmp TRIAD_PARTED tuning
MrBurmark Nov 1, 2023
8c06b66
Switch default block size of TRIAD_PARTED_FUSED
MrBurmark Nov 1, 2023
7c94b83
Add len to triad_holder and add gpu tuning
MrBurmark Nov 1, 2023
1bbd169
Add a smart memory pool tuning
MrBurmark Nov 1, 2023
b1d4e24
Add SOA reuse tuning
MrBurmark Nov 1, 2023
00fb6cf
Add option to shuffle_partition_sizes
MrBurmark Nov 1, 2023
cf08b3e
fixup part_type
MrBurmark Nov 1, 2023
72fd10c
Change part_size_order to have multiple options
MrBurmark Nov 1, 2023
94bd68f
fixup part_type
MrBurmark Nov 1, 2023
37e4475
fixup part_size_order
MrBurmark Nov 1, 2023
c2de49f
Add scanAOSreuse tuning
MrBurmark Nov 2, 2023
a072b3f
Add block wide search impl to triad_parted_fused_scan_aos
MrBurmark Nov 8, 2023
876a16b
force higher alignment on triad_holder
MrBurmark Nov 9, 2023
431c80e
Use device memory for hip triad parted fused
MrBurmark Nov 9, 2023
9141573
Use cuda managed device preferred host accessed
MrBurmark Nov 9, 2023
bbe8272
Remove block wide search code
MrBurmark Nov 21, 2023
8f884f5
Add some missing includes
MrBurmark Nov 21, 2023
0e567b1
reorder TRIAD_PARTED_FUSED gpu tuning declarations
MrBurmark Nov 27, 2023
ddf9c9d
add TRIAD_PARTED stream (non-omp) tuning
MrBurmark Nov 27, 2023
5926c63
Change res vector in TRIAD_PARTED gpu stream tunings
MrBurmark Nov 27, 2023
54d8094
Add gpu event tunings of TRIAD_PARTED
MrBurmark Nov 27, 2023
5ba0d3b
Rename parted_fused tunings
MrBurmark Jan 22, 2024
9c696d9
Add cuda graph tuning of TRIAD_PARTED_FUSED
MrBurmark Jan 22, 2024
489f23f
Add indirect dispatch tunings to TRIAD_PARTED_FUSED
MrBurmark Jan 23, 2024
18da3c7
Rename LaggedMemPool and add normal MemPool alias
MrBurmark Jan 23, 2024
c75aa00
Add dataspace_allocator
MrBurmark Jan 23, 2024
5337794
Use LaggedMemPool and dataspace_allocator
MrBurmark Jan 23, 2024
b82f291
Use MemPool alias and dataspace_allocator
MrBurmark Jan 23, 2024
1f82e28
fixup dataspace_allocator
MrBurmark Jan 23, 2024
457b829
fixup includes
MrBurmark Jan 23, 2024
2a469a3
Add openmp compile guards
MrBurmark Jan 23, 2024
4f143c3
Get DataSpace for fusers via function
MrBurmark Jan 23, 2024
455baee
Use getFuserDataSpace in TRIAD_PARTED_FUSED
MrBurmark Jan 23, 2024
f1d0120
Use getFuserDataSpace with comm
MrBurmark Jan 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,12 @@ blt_add_executable(
stream/TRIAD.cpp
stream/TRIAD-Seq.cpp
stream/TRIAD-OMPTarget.cpp
stream/TRIAD_PARTED.cpp
stream/TRIAD_PARTED-Seq.cpp
stream/TRIAD_PARTED-OMPTarget.cpp
stream/TRIAD_PARTED_FUSED.cpp
stream/TRIAD_PARTED_FUSED-Seq.cpp
stream/TRIAD_PARTED_FUSED-OMPTarget.cpp
common/DataUtils.cpp
common/Executor.cpp
common/KernelBase.cpp
Expand Down
46 changes: 24 additions & 22 deletions src/comm/HALO_EXCHANGE_FUSED-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_CUDA)

#include "common/CudaDataUtils.hpp"
#include "common/MemPool.hpp"

#include <iostream>

Expand All @@ -21,33 +22,33 @@ namespace rajaperf
namespace comm
{

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA \
#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \
Real_ptr* pack_buffer_ptrs; \
Int_ptr* pack_list_ptrs; \
Real_ptr* pack_var_ptrs; \
Index_type* pack_len_ptrs; \
allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \
Real_ptr* unpack_buffer_ptrs; \
Int_ptr* unpack_list_ptrs; \
Real_ptr* unpack_var_ptrs; \
Index_type* unpack_len_ptrs; \
allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \
deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \
deallocData(DataSpace::CudaPinned, pack_list_ptrs); \
deallocData(DataSpace::CudaPinned, pack_var_ptrs); \
deallocData(DataSpace::CudaPinned, pack_len_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_len_ptrs);
allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \
deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), pack_list_ptrs); \
deallocData(getFuserDataSpace(vid), pack_var_ptrs); \
deallocData(getFuserDataSpace(vid), pack_len_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_len_ptrs);

template < size_t block_size >
__launch_bounds__(block_size)
Expand Down Expand Up @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)

if ( vid == Base_CUDA ) {

HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA;
HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA);

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
Expand Down Expand Up @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runCudaVariantDirect(VariantID vid)
}
stopTimer();

HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA;
HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA);

} else {
getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Cuda variant id = " << vid << std::endl;
Expand All @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runCudaVariantWorkGroup(VariantID vid)

if ( vid == RAJA_CUDA ) {

using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::cuda::pinned_mempool_type>;
using AllocatorHolder = RAJAPoolAllocatorHolder<
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_CUDA)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
46 changes: 24 additions & 22 deletions src/comm/HALO_EXCHANGE_FUSED-Hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_HIP)

#include "common/HipDataUtils.hpp"
#include "common/MemPool.hpp"

#include <iostream>

Expand All @@ -21,33 +22,33 @@ namespace rajaperf
namespace comm
{

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP \
#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(vid) \
Real_ptr* pack_buffer_ptrs; \
Int_ptr* pack_list_ptrs; \
Real_ptr* pack_var_ptrs; \
Index_type* pack_len_ptrs; \
allocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, pack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, pack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, pack_len_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \
Real_ptr* unpack_buffer_ptrs; \
Int_ptr* unpack_list_ptrs; \
Real_ptr* unpack_var_ptrs; \
Index_type* unpack_len_ptrs; \
allocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP \
deallocData(DataSpace::HipPinnedCoarse, pack_buffer_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, pack_list_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, pack_var_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, pack_len_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, unpack_buffer_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, unpack_list_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, unpack_var_ptrs); \
deallocData(DataSpace::HipPinnedCoarse, unpack_len_ptrs);
allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(vid) \
deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), pack_list_ptrs); \
deallocData(getFuserDataSpace(vid), pack_var_ptrs); \
deallocData(getFuserDataSpace(vid), pack_len_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_len_ptrs);

template < size_t block_size >
__launch_bounds__(block_size)
Expand Down Expand Up @@ -99,7 +100,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid)

if ( vid == Base_HIP ) {

HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP;
HALO_EXCHANGE_FUSED_MANUAL_FUSER_SETUP_HIP(Base_HIP);

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
Expand Down Expand Up @@ -198,7 +199,7 @@ void HALO_EXCHANGE_FUSED::runHipVariantDirect(VariantID vid)
}
stopTimer();

HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP;
HALO_EXCHANGE_FUSED_MANUAL_FUSER_TEARDOWN_HIP(Base_HIP);

} else {
getCout() << "\n HALO_EXCHANGE_FUSED : Unknown Hip variant id = " << vid << std::endl;
Expand All @@ -216,7 +217,8 @@ void HALO_EXCHANGE_FUSED::runHipVariantWorkGroup(VariantID vid)

if ( vid == RAJA_HIP ) {

using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::hip::pinned_mempool_type>;
using AllocatorHolder = RAJAPoolAllocatorHolder<
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_HIP)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
4 changes: 3 additions & 1 deletion src/comm/HALO_EXCHANGE_FUSED-OMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

#if defined(RAJA_PERFSUITE_ENABLE_MPI)

#include "common/MemPool.hpp"

#include <iostream>

namespace rajaperf
Expand Down Expand Up @@ -307,7 +309,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPVariantWorkGroup(VariantID vid)
case RAJA_OpenMP : {

using AllocatorHolder = RAJAPoolAllocatorHolder<
RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_OpenMP)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
3 changes: 2 additions & 1 deletion src/comm/HALO_EXCHANGE_FUSED-OMPTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#if defined(RAJA_PERFSUITE_ENABLE_MPI) && defined(RAJA_ENABLE_TARGET_OPENMP)

#include "common/OpenMPTargetDataUtils.hpp"
#include "common/MemPool.hpp"

#include <iostream>

Expand Down Expand Up @@ -197,7 +198,7 @@ void HALO_EXCHANGE_FUSED::runOpenMPTargetVariantWorkGroup(VariantID vid)
if ( vid == RAJA_OpenMPTarget ) {

using AllocatorHolder = RAJAPoolAllocatorHolder<
RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_OpenMPTarget)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
4 changes: 3 additions & 1 deletion src/comm/HALO_EXCHANGE_FUSED-Seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

#if defined(RAJA_PERFSUITE_ENABLE_MPI)

#include "common/MemPool.hpp"

#include <iostream>

namespace rajaperf
Expand Down Expand Up @@ -231,7 +233,7 @@ void HALO_EXCHANGE_FUSED::runSeqVariantWorkGroup(VariantID vid)
case RAJA_Seq : {

using AllocatorHolder = RAJAPoolAllocatorHolder<
RAJA::basic_mempool::MemPool<RAJA::basic_mempool::generic_allocator>>;
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_Seq)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
46 changes: 24 additions & 22 deletions src/comm/HALO_PACKING_FUSED-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#if defined(RAJA_ENABLE_CUDA)

#include "common/CudaDataUtils.hpp"
#include "common/MemPool.hpp"

#include <iostream>

Expand All @@ -21,33 +22,33 @@ namespace rajaperf
namespace comm
{

#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA \
#define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(vid) \
Real_ptr* pack_buffer_ptrs; \
Int_ptr* pack_list_ptrs; \
Real_ptr* pack_var_ptrs; \
Index_type* pack_len_ptrs; \
allocData(DataSpace::CudaPinned, pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, pack_len_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), pack_len_ptrs, num_neighbors * num_vars); \
Real_ptr* unpack_buffer_ptrs; \
Int_ptr* unpack_list_ptrs; \
Real_ptr* unpack_var_ptrs; \
Index_type* unpack_len_ptrs; \
allocData(DataSpace::CudaPinned, unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_list_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_var_ptrs, num_neighbors * num_vars); \
allocData(DataSpace::CudaPinned, unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA \
deallocData(DataSpace::CudaPinned, pack_buffer_ptrs); \
deallocData(DataSpace::CudaPinned, pack_list_ptrs); \
deallocData(DataSpace::CudaPinned, pack_var_ptrs); \
deallocData(DataSpace::CudaPinned, pack_len_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_buffer_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_list_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_var_ptrs); \
deallocData(DataSpace::CudaPinned, unpack_len_ptrs);
allocData(getFuserDataSpace(vid), unpack_buffer_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_list_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_var_ptrs, num_neighbors * num_vars); \
allocData(getFuserDataSpace(vid), unpack_len_ptrs, num_neighbors * num_vars);

#define HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(vid) \
deallocData(getFuserDataSpace(vid), pack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), pack_list_ptrs); \
deallocData(getFuserDataSpace(vid), pack_var_ptrs); \
deallocData(getFuserDataSpace(vid), pack_len_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_buffer_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_list_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_var_ptrs); \
deallocData(getFuserDataSpace(vid), unpack_len_ptrs);

template < size_t block_size >
__launch_bounds__(block_size)
Expand Down Expand Up @@ -103,7 +104,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid)

if ( vid == Base_CUDA ) {

HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA;
HALO_PACKING_FUSED_MANUAL_FUSER_SETUP_CUDA(Base_CUDA);

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
Expand Down Expand Up @@ -189,7 +190,7 @@ void HALO_PACKING_FUSED::runCudaVariantDirect(VariantID vid)
}
stopTimer();

HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA;
HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN_CUDA(Base_CUDA);

} else {
getCout() << "\n HALO_PACKING_FUSED : Unknown Cuda variant id = " << vid << std::endl;
Expand All @@ -207,7 +208,8 @@ void HALO_PACKING_FUSED::runCudaVariantWorkGroup(VariantID vid)

if ( vid == RAJA_CUDA ) {

using AllocatorHolder = RAJAPoolAllocatorHolder<RAJA::cuda::pinned_mempool_type>;
using AllocatorHolder = RAJAPoolAllocatorHolder<
rajaperf::basic_mempool::MemPool<dataspace_allocator<getFuserDataSpace(RAJA_CUDA)>>>;
using Allocator = AllocatorHolder::Allocator<char>;

AllocatorHolder allocatorHolder;
Expand Down
Loading