Skip to content

Commit

Permalink
Merge pull request #15 from LLNL/exercise-pass-1
Browse files Browse the repository at this point in the history
Exercise pass 1
  • Loading branch information
artv3 authored Aug 9, 2023
2 parents 5b2ed52 + 30f99d8 commit cc9a65e
Show file tree
Hide file tree
Showing 33 changed files with 788 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
###############################################################################

blt_add_executable(
NAME fractal-ex1-RAJA-loop
SOURCES fractal-ex1-RAJA-loop.cpp
NAME fractal-ex1-RAJA-seq
SOURCES fractal-ex1-RAJA-seq.cpp
DEPENDS_ON cuda RAJA umpire writeBMP)
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "../tpl/writeBMP.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

// #define COMPILE

int main(int argc, char *argv[])
{
#if defined(COMPILE)

double dx, dy;
int width;
const int maxdepth = 256;
Expand All @@ -27,24 +33,17 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
// pixels of the fractal image.
//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = pool.allocate(width * width * sizeof(unsigned char));
auto allocator = rm.getAllocator("???");
auto pool = ???
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//TODO: Create a RAJA Kernel Policy which uses the seq_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
using KERNEL_POLICY =
RAJA::KernelPolicy<
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::Lambda<0>
>
>
>;


/* start time */
gettimeofday(&start, NULL);
Expand All @@ -69,7 +68,7 @@ int main(int argc, char *argv[])
} while ((depth > 0) && ((x2 + y2) <= 5.0));
cnt[row * width + col] = depth & 255;
});

/* end time */
gettimeofday(&end, NULL);
printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
Expand All @@ -80,6 +79,8 @@ int main(int argc, char *argv[])
}

//TODO: Use the Umpire pooled allocator to deallocate the memory.
pool.deallocate(cnt);

#endif

return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ int main(int argc, char *argv[])
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
//TODO: Create a RAJA Kernel Policy which uses the seq_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
using KERNEL_POLICY =
RAJA::KernelPolicy<
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::For<1, RAJA::seq_exec,
RAJA::statement::For<0, RAJA::seq_exec,
RAJA::statement::Lambda<0>
>
>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,9 @@ before continuing. It is important to note:
Look for the `TODO` comments in the source code. This is where you will need to fill in
what's needed. You will need to create an Umpire pooled allocator (just like you did for
lesson 12 from the Introduction Tutorial) for the fractal and
complete the appropriate `RAJA::Kernel` statement using the `RAJA::loop_exec` execution
complete the appropriate `RAJA::Kernel` statement using the `RAJA::seq_exec` execution
policy.

The `loop_exec` policy is a good first step because it allows us to get a sense of the
The `seq_exec` policy is a good first step because it allows us to get a sense of the
performance using serial nested loops. From here, we have a good baseline to compare against
when transitioning to CUDA, HIP, etc.

The `loop_exec` is better than just the `seq_exec` policy because it allows the compiler to
generate any optimizations that its heuristics deem beneficial.
In fact, in upcoming releases of RAJA, `seq_exec` will become deprecated.
To learn more about the `loop_exec` RAJA execution policy, see `here <https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html?highlight=loop_exec#raja-loop-kernel-execution-policies>`_.
35 changes: 13 additions & 22 deletions Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512

//#define COMPILE

int main(int argc, char *argv[])
{
#if defined(COMPILE)

double dx, dy;
int width;
const int maxdepth = 256;
Expand All @@ -38,36 +41,23 @@ int main(int argc, char *argv[])
printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);


/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));


/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>;


/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
* It should take range segments that go the same range as our for-loops from before.
* The iterators inside the kernel body will describe the row and col of the image.
*/
RAJA::kernel<KERNEL_POLICY>(
RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
RAJA::TypedRangeSegment<int>(0, width)),
[=] RAJA_DEVICE (int row, int col) {

//Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
double x2, y2, x, y, cx, cy;
int depth;
Expand All @@ -86,21 +76,22 @@ int main(int argc, char *argv[])
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});

gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);


/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
cudaFreeHost(cnt);
cudaFree(d_cnt);


#endif
return 0;
}
106 changes: 106 additions & 0 deletions Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <malloc.h>
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512

int main(int argc, char *argv[])
{
double dx, dy;
int width;
const int maxdepth = 256;
struct timeval start, end;
writebmp wbmp;

/* check command line */
if(argc != 2) {fprintf(stderr, "usage: exe <width>\n"); exit(-1);}
width = atoi(argv[1]);
if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);}

dx = (xMax - xMin) / width;
dy = (yMax - yMin) / width;

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);

/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
* It should take range segments that go the same range as our for-loops from before.
* The iterators inside the kernel body will describe the row and col of the image.
*/
RAJA::kernel<KERNEL_POLICY>(
RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
RAJA::TypedRangeSegment<int>(0, width)),
[=] RAJA_DEVICE (int row, int col) {
//Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
double x2, y2, x, y, cx, cy;
int depth;

/* compute fractal */
cy = yMin + row * dy; //compute row #
cx = xMin + col * dx; //compute column #
x = -cx;
y = -cy;
depth = maxdepth;
do {
x2 = x * x;
y2 = y * y;
y = 2 * x * y - cy;
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
cudaFreeHost(cnt);
cudaFree(d_cnt);
return 0;
}
26 changes: 13 additions & 13 deletions Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512
#define THREADS 512

int main(int argc, char *argv[])
{
Expand All @@ -39,25 +39,25 @@ int main(int argc, char *argv[])

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
hipHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), hipHostRegisterDefault);
cudaHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostRegisterDefault);

/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
hipMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::HipKernel<
RAJA::statement::For<1, RAJA::hip_block_x_loop,
RAJA::statement::For<0, RAJA::hip_thread_x_loop,
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
Expand Down Expand Up @@ -92,15 +92,15 @@ int main(int argc, char *argv[])
printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
hipMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), hipMemcpyDeviceToHost);
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
hipHostFree(cnt);
hipFree(d_cnt);
cudaHostFree(cnt);
cudaFree(d_cnt);
return 0;
}
Loading

0 comments on commit cc9a65e

Please sign in to comment.