
[Kernel] Add GPU kernels and enable LLaMA model. #372

Merged: 36 commits, Jun 14, 2024
The file changes shown below reflect 3 of the 36 commits.
Commits
aff6786
[Kernel] Add GPU kernels.
changqi1 May 7, 2024
0b1be0e
format code
changqi1 May 7, 2024
3ef6143
fix running issue.
changqi1 May 7, 2024
619e788
Add RmsNorm kernel.
changqi1 May 15, 2024
39ec0b4
Fix some issues.
changqi1 May 27, 2024
2ef7f7a
Optimze alloc
changqi1 May 27, 2024
29f01c1
Use unified onednn engine
changqi1 May 27, 2024
532445b
merge from main
changqi1 May 27, 2024
17f224e
Fix compile
changqi1 May 27, 2024
b61fe52
Fix onednn gemm issue
changqi1 May 27, 2024
c5b7ac7
Fix build
changqi1 Jun 3, 2024
0faa9de
Add fp16 rope kernels
changqi1 Jun 4, 2024
3cfa650
Fix attention UT issue.
changqi1 Jun 4, 2024
881bc78
Fix ICX build issue.
changqi1 Jun 4, 2024
c958a1d
Merge branch 'main' into changqing/feature/gpu_rope
changqi1 Jun 4, 2024
9b2af7e
Fix build.
changqi1 Jun 4, 2024
c034849
Add rmsNorm impl and XFT_DEBUG
changqi1 Jun 7, 2024
ec463a3
Update.
changqi1 Jun 7, 2024
69dd33a
update.
changqi1 Jun 7, 2024
5f43897
Add GPU memory to run kernels.
changqi1 Jun 12, 2024
23d2053
Add gpu matmul kernels
changqi1 Jun 12, 2024
b7dc9eb
Fix CPU build issue.
changqi1 Jun 13, 2024
daec9dd
fix
changqi1 Jun 13, 2024
c3e83f2
fix
changqi1 Jun 13, 2024
277de9b
fix
changqi1 Jun 13, 2024
dd1d3fb
fix
changqi1 Jun 13, 2024
15fc202
Fix build issue.
changqi1 Jun 13, 2024
726d356
Fix build issue.
changqi1 Jun 13, 2024
003c46b
Fix LN bug
changqi1 Jun 13, 2024
4cb98cf
Fix final LN
changqi1 Jun 13, 2024
6a85769
Fix 2
changqi1 Jun 13, 2024
f6e6e64
Fix 3
changqi1 Jun 13, 2024
5f93020
Done
changqi1 Jun 13, 2024
175c4dc
Finish
changqi1 Jun 14, 2024
8d35cfc
change macro GPU to XFT_GPU
changqi1 Jun 14, 2024
ea1679d
Add requirements-gpu.txt
changqi1 Jun 14, 2024
26 changes: 25 additions & 1 deletion src/common/allocator.h
Expand Up @@ -18,6 +18,10 @@
#include <sys/mman.h>
#include "environment.h"

#ifdef GPU
#include <CL/sycl.hpp>
#endif

namespace xft {

constexpr size_t g_thp_threshold = (size_t)2 * 1024 * 1024;
Expand All @@ -26,11 +30,18 @@ static inline bool is_thp_alloc(size_t nbytes) {
return (Env::getInstance().getTHPEnabled() && (nbytes >= g_thp_threshold));
}

static inline void *alloc(size_t nbytes, size_t alignment = 64) {
static inline void *alloc(size_t nbytes, size_t alignment = 64, void *device = nullptr) {
if (nbytes == 0) { return nullptr; }

void *data;

#ifdef GPU
if (device != nullptr) {
data = sycl::malloc_device<char>(nbytes, *static_cast<sycl::queue *>(device));
return data;
}
#endif

int err = posix_memalign(&data, alignment, nbytes);
if (err != 0) {
printf("Unable to allocate buffer with size of %zu, err=%d\n", nbytes, err);
Expand All @@ -47,4 +58,17 @@ static inline void *alloc(size_t nbytes, size_t alignment = 64) {

return data;
}

static inline void dealloc(void *data, void *device = nullptr) {
#ifdef GPU
if (device != nullptr) {
sycl::free(data, *static_cast<sycl::queue *>(device));
return;
}
#endif

free(data);
return;
}

} // namespace xft
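
A minimal usage sketch of the extended allocator API above, assuming a GPU build (the GPU macro defined) and a working SYCL toolchain; only xft::alloc and xft::dealloc come from the diff, everything else is illustrative:

#include <CL/sycl.hpp>
#include "allocator.h"   // src/common/allocator.h

void allocator_example() {
    sycl::queue q{sycl::gpu_selector_v};    // assumed: pick any available GPU device
    void *host = xft::alloc(1024);          // device == nullptr, so the posix_memalign path is taken
    void *dev  = xft::alloc(1024, 64, &q);  // device != nullptr, so sycl::malloc_device is used
    // ... fill `host`, copy it to `dev`, run kernels, etc. ...
    xft::dealloc(host);                     // plain free()
    xft::dealloc(dev, &q);                  // sycl::free() on the same queue
}
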
9 changes: 7 additions & 2 deletions src/common/transformer_ctx.h
Expand Up @@ -111,6 +111,7 @@ struct DecoderContext {
hpj::Matrix<float> imOut; // intermediate output

MMHelper *mmHelper;
void *device;

std::string configPath;
INIReader configReader;
Expand Down Expand Up @@ -238,8 +239,12 @@ struct DecoderContext {
bool cached(const std::string &name) { return SimpleMemPool::instance().cached(name); }

template <typename T>
T *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, alignment);
T *getBuffer(const std::string &name, size_t size, void *device = nullptr, size_t alignment = 64) {
return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, device, alignment);
}

void freeBuffer(const std::string &name, void *device = nullptr) {
SimpleMemPool::instance().freeBuffer(name, device);
}

void dump() {
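
A hedged sketch of how the device-aware buffer helpers above might be used from layer code; the buffer name and sizes are invented, and ctx->device is assumed to hold a sycl::queue* on GPU builds (the matching SimpleMemPool change is not visible in this 3-commit view):

void buffer_example(DecoderContext *ctx, int batchSize, int seqLen) {
    // Allocate (or reuse) a named buffer on the device behind ctx->device.
    float *scores = ctx->getBuffer<float>("attn_scores", (size_t)batchSize * seqLen, ctx->device);
    // ... enqueue kernels that read/write `scores` ...
    ctx->freeBuffer("attn_scores", ctx->device);  // release the named buffer on the same device
}
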
5 changes: 5 additions & 0 deletions src/layers/attention.h
Expand Up @@ -294,6 +294,11 @@ class Attention {
std::iota(posIds.begin(), posIds.end(), pastSeqLen);
}
qkpo.forward(query.Data(), key.Data(), query.Stride(), key.Stride(), qkShape, posIds.data());
#ifdef GPU
sycl::queue *q = static_cast<sycl::queue *>(ctx->device);
int64_t size = ctx->batchSize * ctx->inputSeqLen * qkvCols * sizeof(float);
q->memcpy(qkvMatMul.Data(), query.Data(), size).wait();
#endif
}
t3.release();

14 changes: 13 additions & 1 deletion src/layers/dist_linear.h
Expand Up @@ -59,14 +59,26 @@ class DistLinear {

int K = inputSize;
int N = this->splitSize;
weight.Resize(K, N);

scaleWeight.Resize(N);
zeroWeight.Resize(N);

hpj::Matrix<WeiT> quantizedWeight;
ctx->mmHelper->convertWeight(
true, K, N, w + splitOffset * K, nullptr, nullptr, quantizedWeight, scaleWeight, zeroWeight, sumWeight);
#ifdef GPU
hpj::Matrix<WeiT> tWeight;
tWeight.Resize(K, N);
ctx->mmHelper->transposeWeight(true, quantizedWeight, tWeight);

sycl::queue *gpu_queue = static_cast<sycl::queue *>(ctx->device);
WeiT *input_data = sycl::malloc_device<WeiT>(K * N, *gpu_queue);
weight.Assign(input_data, K, N, N);
gpu_queue->memcpy(weight.Data(), tWeight.Data(), tWeight.Rows() * tWeight.Cols() * sizeof(WeiT)).wait();
#else
weight.Resize(K, N);
ctx->mmHelper->packWeight(true, quantizedWeight, weight);
#endif

// Copy Bias
if (b) {
7 changes: 3 additions & 4 deletions src/layers/mlp_llama.h
Expand Up @@ -275,8 +275,7 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
}
}

template <typename T1, typename T2>
void catGateUpProj(DecoderContext *ctx, hpj::Matrix<T1> &input, hpj::Matrix<T2> &output, hpj::Matrix<T2> &siluBuf) {
void catGateUpProj(DecoderContext *ctx, hpj::Matrix<InT> &input, hpj::Matrix<ImT> &output, hpj::Matrix<ImT> &siluBuf) {
TimeLine t("catGateUpProj");

assert(input.Rows() == output.Rows());
Expand All @@ -286,12 +285,12 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
int M = input.Rows(), N = output.Cols(), K = input.Cols();
int lda = input.Stride(), ldc = output.Stride();

const T1 *A = input.Data();
const InT *A = input.Data();
const WeiT *B = catWeights.Data();
const float *scaleB = catWeightsScale.Data();
const float *zeroB = catWeightsZero.Data();
const float *sumB = catWeightsSum.Data();
T2 *C = output.Data();
ImT *C = output.Data();

ctx->mmHelper->compute(false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc);

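
For orientation (not part of the PR): a naive reference of what the mmHelper->compute() call in catGateUpProj is assumed to do in the plain float case, ignoring the quantization metadata (scaleB, zeroB, sumB); the parameter meanings are inferred from how they are used here, and B's layout is an assumption:

// C[M x N] = 1.0f * A[M x K] * B[K x N] + 0.0f * C   (alpha = 1.0f, beta = 0.0f)
void gemm_reference(int M, int N, int K, const float *A, int lda,
                    const float *B, float *C, int ldc) {
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc += A[m * lda + k] * B[k * N + n];  // lda is the row stride of A
            C[m * ldc + n] = acc;  // beta = 0.0f, so prior contents of C are discarded
        }
    }
}
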
80 changes: 78 additions & 2 deletions src/layers/rotary_embedding.cpp
Expand Up @@ -28,8 +28,7 @@ LlamaRotaryEmbedding::LlamaRotaryEmbedding(DecoderContext *ctx) {
ctx->GetAttr("rope_theta", &this->base, 10000);
ctx->GetAttr("rope_type", &this->rope_type, std::to_string(-1));

if (this->rope_type == "linear")
ctx->GetAttr("scaling_factor", &this->scaling_factor, 1.0f);
if (this->rope_type == "linear") ctx->GetAttr("scaling_factor", &this->scaling_factor, 1.0f);

inv_freq_size = (dim + 1) / 2;

Expand All @@ -42,6 +41,19 @@ LlamaRotaryEmbedding::LlamaRotaryEmbedding(DecoderContext *ctx) {
inv_freq[i] = 1.0 / pow(base, float(i * 2) / dim);
}
llamaCalEmb(inv_freq, max_position_embeddings);
#ifdef GPU
if (device != nullptr) {
sycl::queue *gpu_queue = static_cast<sycl::queue *>(device);
float *emb_cos_bak = emb_cos;
float *emb_sin_bak = emb_sin;
emb_cos = ctx->getBuffer<float>(emb_cos_str + "_gpu", max_position_embeddings * inv_freq_size, gpu_queue);
emb_sin = ctx->getBuffer<float>(emb_sin_str + "_gpu", max_position_embeddings * inv_freq_size, gpu_queue);
gpu_queue->memcpy(emb_cos, emb_cos_bak, max_position_embeddings * inv_freq_size * sizeof(float)).wait();
gpu_queue->memcpy(emb_sin, emb_sin_bak, max_position_embeddings * inv_freq_size * sizeof(float)).wait();
ctx->freeBuffer(emb_cos_str);
ctx->freeBuffer(emb_sin_str);
}
#endif
} else if (dim != inv_freq_size * 2) {
printf("Incorrect dim=%d, inv_freq_size=%d\n", dim, inv_freq_size);
exit(-1);
Expand Down Expand Up @@ -112,6 +124,68 @@ void LlamaRotaryEmbedding::llamaCalEmb(const float *inv_freq, const int max_posi
// |_____| |_____|
// head_size/2 head_size/2

#ifdef GPU

void LlamaRotaryEmbedding::forward(
float *query, float *key, int qStride, int kStride, const int *qkShape, const int *positionIds) {
const int batchSize = qkShape[0];
const int seqLen = qkShape[1];
const int qHeads = qkShape[2];
const int kHeads = qkShape[4];
const int head_num = std::max(qHeads, kHeads);
const int head_size = qkShape[3];
const int half_head_size = (head_size + 1) / 2;
using namespace sycl;

auto rope_kernel
= [](sycl::nd_item<3> &item, const float *embCos, const float *embSin, const int qHeads, const int kHeads,
const int seq_size, const int head_size, const int half, float *query, float *key, int qStride,
int kStride, const sycl::accessor<int, 1, sycl::access::mode::read> &positionIds) {
size_t idx_bs_seq = item.get_global_id(0);
size_t idx_head_num = item.get_global_id(1);
size_t idx_half_head_dim = item.get_global_id(2);

size_t pos = positionIds[idx_bs_seq % seq_size];
float cos = embCos[pos * half + idx_half_head_dim];
float sin = embSin[pos * half + idx_half_head_dim];

float *q = query + idx_bs_seq * qStride + idx_head_num * head_size + idx_half_head_dim;
float *k = key + idx_bs_seq * kStride + idx_head_num * head_size + idx_half_head_dim;

if (idx_head_num < qHeads) {
auto q1 = q[0];
q[0] = q1 * cos - q[half] * sin;
q[half] = q[half] * cos + q1 * sin;
}
if (idx_head_num < kHeads) {
auto k1 = k[0];
k[0] = k1 * cos - k[half] * sin;
k[half] = k[half] * cos + k1 * sin;
}
};

// Reorder input
sycl::queue *gpu_queue = static_cast<sycl::queue *>(device);
float *embCos = emb_cos;
float *embSin = emb_sin;

sycl::buffer<int, 1> positionIdsBuf(positionIds, sycl::range<1>(seqLen));
gpu_queue->submit([&](sycl::handler &cgh) {
sycl::accessor position(positionIdsBuf, cgh, sycl::read_only);
sycl::range<3> globalSize(batchSize * seqLen, head_num, half_head_size);
sycl::range<3> workGroupSize(1, 1, 1);

cgh.parallel_for<class kernel_rope>(
sycl::nd_range(globalSize, workGroupSize), [=, this](sycl::nd_item<3> item) {
rope_kernel(item, embCos, embSin, qHeads, kHeads, seqLen, head_size, half_head_size, query, key,
qStride, kStride, position);
});
});
gpu_queue->wait();
}

#else

void LlamaRotaryEmbedding::forward(
float *query, float *key, int qStride, int kStride, const int *qkShape, const int *positionIds) {
int dim = inv_freq_size * 2;
Expand Down Expand Up @@ -214,3 +288,5 @@ void LlamaRotaryEmbedding::forward(
}
}
}

#endif // GPU
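
For reference (not from the diff): the rotation applied by the GPU rope_kernel above, written as plain scalar C++ for a single (token, head, dimension-pair) work item; cos_v and sin_v correspond to embCos/embSin at the token's position, and half is head_size / 2:

inline void rotate_pair(float *q, float *k, int half, float cos_v, float sin_v,
                        bool hasQ, bool hasK) {
    if (hasQ) {  // rotate the query pair (q[0], q[half])
        float q0 = q[0];
        q[0]    = q0 * cos_v - q[half] * sin_v;
        q[half] = q[half] * cos_v + q0 * sin_v;
    }
    if (hasK) {  // rotate the key pair (k[0], k[half])
        float k0 = k[0];
        k[0]    = k0 * cos_v - k[half] * sin_v;
        k[half] = k[half] * cos_v + k0 * sin_v;
    }
}
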
1 change: 1 addition & 0 deletions src/layers/rotary_embedding.h
Expand Up @@ -58,4 +58,5 @@ class LlamaRotaryEmbedding {
float *inv_freq = nullptr;
float *emb_cos = nullptr;
float *emb_sin = nullptr;
void *device = nullptr;
};
11 changes: 9 additions & 2 deletions src/models/common_decoder.h
Expand Up @@ -638,10 +638,17 @@ class CommonDecoder : public AbstractDecoder {
epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, tpRank, tpSize, ppSize,
ppRank, ropeParamsPtr, useLogN, useNTK));

int engineIdx = 0;
if (env.getEngineKind() == xft::DeviceKind::iGPU && env.getEngineIndex() < 0) // Sequential assignment
this->context->mmHelper = new MMHelper(env.getEngineKind(), ppRank * tpSize + tpRank);
engineIdx = ppRank * tpSize + tpRank;
else // assignment through the user
this->context->mmHelper = new MMHelper(env.getEngineKind(), env.getEngineIndex());
engineIdx = env.getEngineIndex();

this->context->mmHelper = new MMHelper(env.getEngineKind(), engineIdx);
#ifdef GPU
auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
this->context->device = new sycl::queue(devices[this->context->mmHelper->getEngineCount() + engineIdx]);
#endif
}

return this->context.get();
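
A hedged sketch of the SYCL device enumeration used above, assuming a oneAPI runtime; the offset mmHelper->getEngineCount() + engineIdx comes from the diff, while the queue construction and name query are illustrative:

#include <CL/sycl.hpp>
#include <cstdio>

void pick_gpu_queue() {
    // Enumerate every GPU visible to the SYCL runtime; the PR indexes into this
    // list with getEngineCount() + engineIdx, here we simply take the first one.
    auto gpus = sycl::device::get_devices(sycl::info::device_type::gpu);
    if (!gpus.empty()) {
        sycl::queue *queue = new sycl::queue(gpus[0]);
        printf("Selected GPU: %s\n",
               queue->get_device().get_info<sycl::info::device::name>().c_str());
    }
}
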
2 changes: 1 addition & 1 deletion src/models/model_factory.h
Expand Up @@ -109,4 +109,4 @@ class DecoderRegister {
MODEL(IMPLEMENT, CLASS, NAME)

#define REGISTER_MODEL(CLASS, NAME) \
MODEL(REGISTER, CLASS, NAME)
MODEL(REGISTER, CLASS, NAME)