Add a cli option to enable GPU offload
Add a "--gpu" that allows users to request the workload to be
offloaded to the GPU. This works natively on macOS using Metal and
in containers using Vulkan with llama.cpp's Kompute backend.

Signed-off-by: Sergio Lopez <[email protected]>
slp committed Oct 9, 2024
1 parent 96a0efb commit 659a9d4
Showing 2 changed files with 15 additions and 3 deletions.
9 changes: 8 additions & 1 deletion ramalama/cli.py
@@ -62,6 +62,13 @@ def init_cli():
         action="store_false",
         help="do not run RamaLama in the default container",
     )
+    parser.add_argument(
+        "--gpu",
+        dest="gpu",
+        default=False,
+        action="store_true",
+        help="offload the workload to the GPU",
+    )
     parser.add_argument(
         "--runtime",
         default="llama.cpp",
@@ -517,7 +524,7 @@ def run_container(args):
     if hasattr(args, "port"):
         conman_args += ["-p", f"{args.port}:{args.port}"]

-    if os.path.exists("/dev/dri"):
+    if args.gpu and (os.path.exists("/dev/dri") or sys.platform == "darwin"):
         conman_args += ["--device", "/dev/dri"]

     if os.path.exists("/dev/kfd"):
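For reference, a minimal, self-contained sketch (an editor's illustration, not part of the commit) of how the new flag parses: because it is declared with action="store_true" and default=False, GPU offload stays opt-in and downstream code simply checks args.gpu.

# Illustrative only: a stripped-down parser mirroring the "--gpu" flag added above.
import argparse

parser = argparse.ArgumentParser(prog="ramalama")
parser.add_argument(
    "--gpu",
    dest="gpu",
    default=False,
    action="store_true",
    help="offload the workload to the GPU",
)

print(parser.parse_args([]).gpu)         # False: offload is off unless requested
print(parser.parse_args(["--gpu"]).gpu)  # True: the run/serve paths check args.gpu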
9 changes: 7 additions & 2 deletions ramalama/model.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from pathlib import Path
 from ramalama.common import container_manager, exec_cmd, default_image


@@ -13,8 +14,6 @@ class Model:

     def __init__(self, model):
         self.model = model
-        if sys.platform == "darwin":
-            self.common_params += ["-ngl", "99"]

     def login(self, args):
         raise NotImplementedError(f"ramalama login for {self.type} not implemented")
@@ -100,6 +99,12 @@ def run(self, args):
         if not args.ARGS:
             exec_args.append("-cnv")

+        if args.gpu:
+            if sys.platform == "darwin" or (sys.platform == "linux" and Path("/dev/dri").exists()):
+                exec_args.extend(["-ngl", "99"])
+            else:
+                print("GPU offload was requested but is not available on this system")
+
         exec_cmd(exec_args, False)

     def serve(self, args):
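For context, a self-contained sketch of the offload decision that Model.run() now makes (an editor's illustration, not code from this commit; the base command below is hypothetical): on macOS, or on Linux when /dev/dri is present, llama.cpp is asked to offload layers with "-ngl 99"; otherwise a warning is printed and the workload stays on the CPU.

# Illustrative sketch of the GPU-offload decision introduced in Model.run().
import sys
from pathlib import Path


def gpu_offload_args(gpu_requested):
    """Extra llama.cpp arguments when GPU offload is requested and available."""
    if not gpu_requested:
        return []
    if sys.platform == "darwin" or (sys.platform == "linux" and Path("/dev/dri").exists()):
        # "-ngl 99" asks llama.cpp to offload (up to) 99 layers to the GPU.
        return ["-ngl", "99"]
    print("GPU offload was requested but is not available on this system")
    return []


exec_args = ["llama-cli", "-m", "model.gguf"]  # hypothetical base command
exec_args += gpu_offload_args(gpu_requested=True)
print(exec_args)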
