From 659a9d43c0261dcb37ed471622b57315d5efad8d Mon Sep 17 00:00:00 2001
From: Sergio Lopez
Date: Wed, 9 Oct 2024 15:50:17 +0200
Subject: [PATCH] Add a CLI option to enable GPU offload

Add a "--gpu" flag that allows users to request that the workload be
offloaded to the GPU. This works natively on macOS using Metal, and in
containers using Vulkan with llama.cpp's Kompute backend.

Signed-off-by: Sergio Lopez
---
 ramalama/cli.py   | 9 ++++++++-
 ramalama/model.py | 9 +++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0e85df6..2a52594 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -62,6 +62,13 @@ def init_cli():
         action="store_false",
         help="do not run RamaLama in the default container",
     )
+    parser.add_argument(
+        "--gpu",
+        dest="gpu",
+        default=False,
+        action="store_true",
+        help="offload the workload to the GPU",
+    )
     parser.add_argument(
         "--runtime",
         default="llama.cpp",
@@ -517,7 +524,7 @@ def run_container(args):
     if hasattr(args, "port"):
         conman_args += ["-p", f"{args.port}:{args.port}"]
 
-    if os.path.exists("/dev/dri"):
+    if args.gpu and (os.path.exists("/dev/dri") or sys.platform == "darwin"):
         conman_args += ["--device", "/dev/dri"]
 
     if os.path.exists("/dev/kfd"):
diff --git a/ramalama/model.py b/ramalama/model.py
index 533ce9b..09cdbff 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from pathlib import Path
 
 from ramalama.common import container_manager, exec_cmd, default_image
 
@@ -13,8 +14,6 @@ class Model:
 
     def __init__(self, model):
         self.model = model
-        if sys.platform == "darwin":
-            self.common_params += ["-ngl", "99"]
 
     def login(self, args):
         raise NotImplementedError(f"ramalama login for {self.type} not implemented")
@@ -100,6 +99,12 @@ def run(self, args):
         if not args.ARGS:
             exec_args.append("-cnv")
 
+        if args.gpu:
+            if sys.platform == "darwin" or (sys.platform == "linux" and Path("/dev/dri").exists()):
+                exec_args.extend(["-ngl", "99"])
+            else:
+                print("GPU offload was requested but is not available on this system")
+
         exec_cmd(exec_args, False)
 
     def serve(self, args):
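
Usage note (illustrative, not part of the patch): since "--gpu" is added to
the top-level parser in init_cli(), offload would be requested as, e.g.,
"ramalama --gpu run <model>". The sketch below restates the offload decision
the patch adds to Model.run() as a standalone helper, under the same platform
checks; the "llama-cli" binary name and model path are placeholders, not taken
from the patch.

    # Illustrative sketch of the GPU-offload decision added to Model.run();
    # not part of the patch itself.
    import sys
    from pathlib import Path

    def gpu_offload_args(gpu_requested):
        """Return extra llama.cpp arguments when GPU offload is usable.

        "-ngl 99" asks llama.cpp to offload up to 99 model layers to the
        GPU: Metal natively on macOS, or Vulkan (Kompute backend) through
        a /dev/dri render node on Linux, including inside a container.
        """
        if not gpu_requested:
            return []
        if sys.platform == "darwin" or (sys.platform == "linux" and Path("/dev/dri").exists()):
            return ["-ngl", "99"]
        print("GPU offload was requested but is not available on this system")
        return []

    # Hypothetical invocation; binary name and model path are placeholders.
    exec_args = ["llama-cli", "-m", "/path/to/model.gguf"]
    exec_args += gpu_offload_args(gpu_requested=True)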