Skip to content

Commit

Permalink
Merge pull request #15 from LLNL/exercise-pass-1
Browse files Browse the repository at this point in the history
Exercise pass 1
  • Loading branch information
artv3 authored Aug 9, 2023
2 parents 5b2ed52 + 30f99d8 commit cc9a65e
Show file tree
Hide file tree
Showing 33 changed files with 788 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
###############################################################################

blt_add_executable(
NAME fractal-ex1-RAJA-loop
SOURCES fractal-ex1-RAJA-loop.cpp
NAME fractal-ex1-RAJA-seq
SOURCES fractal-ex1-RAJA-seq.cpp
DEPENDS_ON cuda RAJA umpire writeBMP)
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "../tpl/writeBMP.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

// #define COMPILE

int main(int argc, char *argv[])
{
#if defined(COMPILE)

double dx, dy;
int width;
const int maxdepth = 256;
Expand All @@ -27,24 +33,17 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
// pixels of the fractal image.
//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = pool.allocate(width * width * sizeof(unsigned char));
auto allocator = rm.getAllocator("???");
auto pool = ???
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//TODO: Create a RAJA Kernel Policy which uses the seq_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
using KERNEL_POLICY =
RAJA::KernelPolicy<
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::Lambda<0>
>
>
>;


/* start time */
gettimeofday(&start, NULL);
Expand All @@ -69,7 +68,7 @@ int main(int argc, char *argv[])
} while ((depth > 0) && ((x2 + y2) <= 5.0));
cnt[row * width + col] = depth & 255;
});

/* end time */
gettimeofday(&end, NULL);
printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
Expand All @@ -80,6 +79,8 @@ int main(int argc, char *argv[])
}

//TODO: Use the Umpire pooled allocator to deallocate the memory.
pool.deallocate(cnt);

#endif

return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ int main(int argc, char *argv[])
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
//TODO: Create a RAJA Kernel Policy which uses the seq_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.
using KERNEL_POLICY =
RAJA::KernelPolicy<
RAJA::statement::For<1, RAJA::loop_exec,
RAJA::statement::For<0, RAJA::loop_exec,
RAJA::statement::For<1, RAJA::seq_exec,
RAJA::statement::For<0, RAJA::seq_exec,
RAJA::statement::Lambda<0>
>
>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,9 @@ before continuing. It is important to note:
Look for the `TODO` comments in the source code. This is where you will need to fill in
what's needed. You will need to create an Umpire pooled allocator (just like you did for
lesson 12 from the Introduction Tutorial) for the fractal and
complete the appropriate `RAJA::Kernel` statement using the `RAJA::loop_exec` execution
complete the appropriate `RAJA::Kernel` statement using the `RAJA::seq_exec` execution
policy.

The `loop_exec` policy is a good first step because it allows us to get a sense of the
The `seq_exec` policy is a good first step because it allows us to get a sense of the
performance using serial nested loops. From here, we have a good baseline to compare against
when transitioning to CUDA, HIP, etc.

The `loop_exec` is better than just the `seq_exec` policy because it allows the compiler to
generate any optimizations that its heuristics deem beneficial.
In fact, in upcoming releases of RAJA, `seq_exec` will become deprecated.
To learn more about the `loop_exec` RAJA execution policy, see `here <https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html?highlight=loop_exec#raja-loop-kernel-execution-policies>`_.
35 changes: 13 additions & 22 deletions Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512

//#define COMPILE

int main(int argc, char *argv[])
{
#if defined(COMPILE)

double dx, dy;
int width;
const int maxdepth = 256;
Expand All @@ -38,36 +41,23 @@ int main(int argc, char *argv[])
printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);


/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));


/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>;


/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
* It should take range segments that go the same range as our for-loops from before.
* The iterators inside the kernel body will describe the row and col of the image.
*/
RAJA::kernel<KERNEL_POLICY>(
RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
RAJA::TypedRangeSegment<int>(0, width)),
[=] RAJA_DEVICE (int row, int col) {

//Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
double x2, y2, x, y, cx, cy;
int depth;
Expand All @@ -86,21 +76,22 @@ int main(int argc, char *argv[])
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});

gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);


/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
cudaFreeHost(cnt);
cudaFree(d_cnt);


#endif
return 0;
}
106 changes: 106 additions & 0 deletions Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <malloc.h>
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512

int main(int argc, char *argv[])
{
double dx, dy;
int width;
const int maxdepth = 256;
struct timeval start, end;
writebmp wbmp;

/* check command line */
if(argc != 2) {fprintf(stderr, "usage: exe <width>\n"); exit(-1);}
width = atoi(argv[1]);
if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);}

dx = (xMax - xMin) / width;
dy = (yMax - yMin) / width;

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);

/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
* It should take range segments that go the same range as our for-loops from before.
* The iterators inside the kernel body will describe the row and col of the image.
*/
RAJA::kernel<KERNEL_POLICY>(
RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
RAJA::TypedRangeSegment<int>(0, width)),
[=] RAJA_DEVICE (int row, int col) {
//Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
double x2, y2, x, y, cx, cy;
int depth;

/* compute fractal */
cy = yMin + row * dy; //compute row #
cx = xMin + col * dx; //compute column #
x = -cx;
y = -cy;
depth = maxdepth;
do {
x2 = x * x;
y2 = y * y;
y = 2 * x * y - cy;
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
cudaFreeHost(cnt);
cudaFree(d_cnt);
return 0;
}
26 changes: 13 additions & 13 deletions Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512
#define THREADS 512

int main(int argc, char *argv[])
{
Expand All @@ -39,25 +39,25 @@ int main(int argc, char *argv[])

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
hipHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), hipHostRegisterDefault);
cudaHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostRegisterDefault);

/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
hipMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::HipKernel<
RAJA::statement::For<1, RAJA::hip_block_x_loop,
RAJA::statement::For<0, RAJA::hip_thread_x_loop,
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::Lambda<0>
>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
Expand Down Expand Up @@ -92,15 +92,15 @@ int main(int argc, char *argv[])
printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
hipMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), hipMemcpyDeviceToHost);
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
hipHostFree(cnt);
hipFree(d_cnt);
cudaHostFree(cnt);
cudaFree(d_cnt);
return 0;
}
Loading

0 comments on commit cc9a65e

Please sign in to comment.