
[Kernel] Add GPU kernels and enable LLaMA model. #372

Merged: 36 commits, Jun 14, 2024
The file changes shown below reflect 3 of the 36 commits.
Commits
aff6786
[Kernel] Add GPU kernels.
changqi1 May 7, 2024
0b1be0e
format code
changqi1 May 7, 2024
3ef6143
fix running issue.
changqi1 May 7, 2024
619e788
Add RmsNorm kernel.
changqi1 May 15, 2024
39ec0b4
Fix some issues.
changqi1 May 27, 2024
2ef7f7a
Optimze alloc
changqi1 May 27, 2024
29f01c1
Use unified onednn engine
changqi1 May 27, 2024
532445b
merge from main
changqi1 May 27, 2024
17f224e
Fix compile
changqi1 May 27, 2024
b61fe52
Fix onednn gemm issue
changqi1 May 27, 2024
c5b7ac7
Fix build
changqi1 Jun 3, 2024
0faa9de
Add fp16 rope kernels
changqi1 Jun 4, 2024
3cfa650
Fix attention UT issue.
changqi1 Jun 4, 2024
881bc78
Fix ICX build issue.
changqi1 Jun 4, 2024
c958a1d
Merge branch 'main' into changqing/feature/gpu_rope
changqi1 Jun 4, 2024
9b2af7e
Fix build.
changqi1 Jun 4, 2024
c034849
Add rmsNorm impl and XFT_DEBUG
changqi1 Jun 7, 2024
ec463a3
Update.
changqi1 Jun 7, 2024
69dd33a
update.
changqi1 Jun 7, 2024
5f43897
Add GPU memory to run kernels.
changqi1 Jun 12, 2024
23d2053
Add gpu matmul kernels
changqi1 Jun 12, 2024
b7dc9eb
Fix CPU build issue.
changqi1 Jun 13, 2024
daec9dd
fix
changqi1 Jun 13, 2024
c3e83f2
fix
changqi1 Jun 13, 2024
277de9b
fix
changqi1 Jun 13, 2024
dd1d3fb
fix
changqi1 Jun 13, 2024
15fc202
Fix build issue.
changqi1 Jun 13, 2024
726d356
Fix build issue.
changqi1 Jun 13, 2024
003c46b
Fix LN bug
changqi1 Jun 13, 2024
4cb98cf
Fix final LN
changqi1 Jun 13, 2024
6a85769
Fix 2
changqi1 Jun 13, 2024
f6e6e64
Fix 3
changqi1 Jun 13, 2024
5f93020
Done
changqi1 Jun 13, 2024
175c4dc
Finish
changqi1 Jun 14, 2024
8d35cfc
change macro GPU to XFT_GPU
changqi1 Jun 14, 2024
ea1679d
Add requirements-gpu.txt
changqi1 Jun 14, 2024
26 changes: 25 additions & 1 deletion src/common/allocator.h
Expand Up @@ -18,6 +18,10 @@
#include <sys/mman.h>
#include "environment.h"

#ifdef GPU
#include <CL/sycl.hpp>
#endif

namespace xft {

constexpr size_t g_thp_threshold = (size_t)2 * 1024 * 1024;
Expand All @@ -26,11 +30,18 @@ static inline bool is_thp_alloc(size_t nbytes) {
return (Env::getInstance().getTHPEnabled() && (nbytes >= g_thp_threshold));
}

static inline void *alloc(size_t nbytes, size_t alignment = 64) {
static inline void *alloc(size_t nbytes, size_t alignment = 64, void *device = nullptr) {
if (nbytes == 0) { return nullptr; }

void *data;

#ifdef GPU
if (device != nullptr) {
data = sycl::malloc_device<char>(nbytes, *static_cast<sycl::queue *>(device));
return data;
}
#endif

int err = posix_memalign(&data, alignment, nbytes);
if (err != 0) {
printf("Unable to allocate buffer with size of %zu, err=%d\n", nbytes, err);
Expand All @@ -47,4 +58,17 @@ static inline void *alloc(size_t nbytes, size_t alignment = 64) {

return data;
}

static inline void dealloc(void *data, void *device = nullptr) {
#ifdef GPU
if (device != nullptr) {
sycl::free(data, *static_cast<sycl::queue *>(device));
return;
}
#endif

free(data);
return;
}

} // namespace xft
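
A minimal usage sketch of the extended allocator API above, assuming a GPU build (the GPU macro defined) and a working SYCL toolchain; only xft::alloc and xft::dealloc come from the diff, everything else is illustrative:

#include <CL/sycl.hpp>
#include "allocator.h"   // src/common/allocator.h

void allocator_example() {
    sycl::queue q{sycl::gpu_selector_v};    // assumed: pick any available GPU device
    void *host = xft::alloc(1024);          // device == nullptr, so the posix_memalign path is taken
    void *dev  = xft::alloc(1024, 64, &q);  // device != nullptr, so sycl::malloc_device is used
    // ... fill `host`, copy it to `dev`, run kernels, etc. ...
    xft::dealloc(host);                     // plain free()
    xft::dealloc(dev, &q);                  // sycl::free() on the same queue
}
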
9 changes: 7 additions & 2 deletions src/common/transformer_ctx.h
Expand Up @@ -111,6 +111,7 @@ struct DecoderContext {
hpj::Matrix<float> imOut; // intermediate output

MMHelper *mmHelper;
void *device;

std::string configPath;
INIReader configReader;
Expand Down Expand Up @@ -238,8 +239,12 @@ struct DecoderContext {
bool cached(const std::string &name) { return SimpleMemPool::instance().cached(name); }

template <typename T>
T *getBuffer(const std::string &name, size_t size, size_t alignment = 64) {
return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, alignment);
T *getBuffer(const std::string &name, size_t size, void *device = nullptr, size_t alignment = 64) {
return (T *)SimpleMemPool::instance().getBuffer(name, sizeof(T) * size, device, alignment);
}

void freeBuffer(const std::string &name, void *device = nullptr) {
SimpleMemPool::instance().freeBuffer(name, device);
}

void dump() {
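
A hedged sketch of how the device-aware buffer helpers above might be used from layer code; the buffer name and sizes are invented, and ctx->device is assumed to hold a sycl::queue* on GPU builds (the matching SimpleMemPool change is not visible in this 3-commit view):

void buffer_example(DecoderContext *ctx, int batchSize, int seqLen) {
    // Allocate (or reuse) a named buffer on the device behind ctx->device.
    float *scores = ctx->getBuffer<float>("attn_scores", (size_t)batchSize * seqLen, ctx->device);
    // ... enqueue kernels that read/write `scores` ...
    ctx->freeBuffer("attn_scores", ctx->device);  // release the named buffer on the same device
}
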
5 changes: 5 additions & 0 deletions src/layers/attention.h
Expand Up @@ -294,6 +294,11 @@ class Attention {
std::iota(posIds.begin(), posIds.end(), pastSeqLen);
}
qkpo.forward(query.Data(), key.Data(), query.Stride(), key.Stride(), qkShape, posIds.data());
#ifdef GPU
sycl::queue *q = static_cast<sycl::queue *>(ctx->device);
int64_t size = ctx->batchSize * ctx->inputSeqLen * qkvCols * sizeof(float);
q->memcpy(qkvMatMul.Data(), query.Data(), size).wait();
#endif
}
t3.release();

14 changes: 13 additions & 1 deletion src/layers/dist_linear.h
Expand Up @@ -59,14 +59,26 @@ class DistLinear {

int K = inputSize;
int N = this->splitSize;
weight.Resize(K, N);

scaleWeight.Resize(N);
zeroWeight.Resize(N);

hpj::Matrix<WeiT> quantizedWeight;
ctx->mmHelper->convertWeight(
true, K, N, w + splitOffset * K, nullptr, nullptr, quantizedWeight, scaleWeight, zeroWeight, sumWeight);
#ifdef GPU
hpj::Matrix<WeiT> tWeight;
tWeight.Resize(K, N);
ctx->mmHelper->transposeWeight(true, quantizedWeight, tWeight);

sycl::queue *gpu_queue = static_cast<sycl::queue *>(ctx->device);
WeiT *input_data = sycl::malloc_device<WeiT>(K * N, *gpu_queue);
weight.Assign(input_data, K, N, N);
gpu_queue->memcpy(weight.Data(), tWeight.Data(), tWeight.Rows() * tWeight.Cols() * sizeof(WeiT)).wait();
#else
weight.Resize(K, N);
ctx->mmHelper->packWeight(true, quantizedWeight, weight);
#endif

// Copy Bias
if (b) {
7 changes: 3 additions & 4 deletions src/layers/mlp_llama.h
Expand Up @@ -275,8 +275,7 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
}
}

template <typename T1, typename T2>
void catGateUpProj(DecoderContext *ctx, hpj::Matrix<T1> &input, hpj::Matrix<T2> &output, hpj::Matrix<T2> &siluBuf) {
void catGateUpProj(DecoderContext *ctx, hpj::Matrix<InT> &input, hpj::Matrix<ImT> &output, hpj::Matrix<ImT> &siluBuf) {
TimeLine t("catGateUpProj");

assert(input.Rows() == output.Rows());
Expand All @@ -286,12 +285,12 @@ class LlamaMLP : public SingletonBase<LlamaMLP<WeiT>> {
int M = input.Rows(), N = output.Cols(), K = input.Cols();
int lda = input.Stride(), ldc = output.Stride();

const T1 *A = input.Data();
const InT *A = input.Data();
const WeiT *B = catWeights.Data();
const float *scaleB = catWeightsScale.Data();
const float *zeroB = catWeightsZero.Data();
const float *sumB = catWeightsSum.Data();
T2 *C = output.Data();
ImT *C = output.Data();

ctx->mmHelper->compute(false, M, N, K, 1.0f, A, lda, B, scaleB, zeroB, sumB, 0.0f, C, ldc);

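
For orientation (not part of the PR): a naive reference of what the mmHelper->compute() call in catGateUpProj is assumed to do in the plain float case, ignoring the quantization metadata (scaleB, zeroB, sumB); the parameter meanings are inferred from how they are used here, and B's layout is an assumption:

// C[M x N] = 1.0f * A[M x K] * B[K x N] + 0.0f * C   (alpha = 1.0f, beta = 0.0f)
void gemm_reference(int M, int N, int K, const float *A, int lda,
                    const float *B, float *C, int ldc) {
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc += A[m * lda + k] * B[k * N + n];  // lda is the row stride of A
            C[m * ldc + n] = acc;  // beta = 0.0f, so prior contents of C are discarded
        }
    }
}
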
80 changes: 78 additions & 2 deletions src/layers/rotary_embedding.cpp
Expand Up @@ -28,8 +28,7 @@ LlamaRotaryEmbedding::LlamaRotaryEmbedding(DecoderContext *ctx) {
ctx->GetAttr("rope_theta", &this->base, 10000);
ctx->GetAttr("rope_type", &this->rope_type, std::to_string(-1));

if (this->rope_type == "linear")
ctx->GetAttr("scaling_factor", &this->scaling_factor, 1.0f);
if (this->rope_type == "linear") ctx->GetAttr("scaling_factor", &this->scaling_factor, 1.0f);

inv_freq_size = (dim + 1) / 2;

Expand All @@ -42,6 +41,19 @@ LlamaRotaryEmbedding::LlamaRotaryEmbedding(DecoderContext *ctx) {
inv_freq[i] = 1.0 / pow(base, float(i * 2) / dim);
}
llamaCalEmb(inv_freq, max_position_embeddings);
#ifdef GPU
if (device != nullptr) {
sycl::queue *gpu_queue = static_cast<sycl::queue *>(device);
float *emb_cos_bak = emb_cos;
float *emb_sin_bak = emb_sin;
emb_cos = ctx->getBuffer<float>(emb_cos_str + "_gpu", max_position_embeddings * inv_freq_size, gpu_queue);
emb_sin = ctx->getBuffer<float>(emb_sin_str + "_gpu", max_position_embeddings * inv_freq_size, gpu_queue);
gpu_queue->memcpy(emb_cos, emb_cos_bak, max_position_embeddings * inv_freq_size * sizeof(float)).wait();
gpu_queue->memcpy(emb_sin, emb_sin_bak, max_position_embeddings * inv_freq_size * sizeof(float)).wait();
ctx->freeBuffer(emb_cos_str);
ctx->freeBuffer(emb_sin_str);
}
#endif
} else if (dim != inv_freq_size * 2) {
printf("Incorrect dim=%d, inv_freq_size=%d\n", dim, inv_freq_size);
exit(-1);
Expand Down Expand Up @@ -112,6 +124,68 @@ void LlamaRotaryEmbedding::llamaCalEmb(const float *inv_freq, const int max_posi
// |_____| |_____|
// head_size/2 head_size/2

#ifdef GPU

void LlamaRotaryEmbedding::forward(
float *query, float *key, int qStride, int kStride, const int *qkShape, const int *positionIds) {
const int batchSize = qkShape[0];
const int seqLen = qkShape[1];
const int qHeads = qkShape[2];
const int kHeads = qkShape[4];
const int head_num = std::max(qHeads, kHeads);
const int head_size = qkShape[3];
const int half_head_size = (head_size + 1) / 2;
using namespace sycl;

auto rope_kernel
= [](sycl::nd_item<3> &item, const float *embCos, const float *embSin, const int qHeads, const int kHeads,
const int seq_size, const int head_size, const int half, float *query, float *key, int qStride,
int kStride, const sycl::accessor<int, 1, sycl::access::mode::read> &positionIds) {
size_t idx_bs_seq = item.get_global_id(0);
size_t idx_head_num = item.get_global_id(1);
size_t idx_half_head_dim = item.get_global_id(2);

size_t pos = positionIds[idx_bs_seq % seq_size];
float cos = embCos[pos * half + idx_half_head_dim];
float sin = embSin[pos * half + idx_half_head_dim];

float *q = query + idx_bs_seq * qStride + idx_head_num * head_size + idx_half_head_dim;
float *k = key + idx_bs_seq * kStride + idx_head_num * head_size + idx_half_head_dim;

if (idx_head_num < qHeads) {
auto q1 = q[0];
q[0] = q1 * cos - q[half] * sin;
q[half] = q[half] * cos + q1 * sin;
}
if (idx_head_num < kHeads) {
auto k1 = k[0];
k[0] = k1 * cos - k[half] * sin;
k[half] = k[half] * cos + k1 * sin;
}
};

// Reorder input
sycl::queue *gpu_queue = static_cast<sycl::queue *>(device);
float *embCos = emb_cos;
float *embSin = emb_sin;

sycl::buffer<int, 1> positionIdsBuf(positionIds, sycl::range<1>(seqLen));
gpu_queue->submit([&](sycl::handler &cgh) {
sycl::accessor position(positionIdsBuf, cgh, sycl::read_only);
sycl::range<3> globalSize(batchSize * seqLen, head_num, half_head_size);
sycl::range<3> workGroupSize(1, 1, 1);

cgh.parallel_for<class kernel_rope>(
sycl::nd_range(globalSize, workGroupSize), [=, this](sycl::nd_item<3> item) {
rope_kernel(item, embCos, embSin, qHeads, kHeads, seqLen, head_size, half_head_size, query, key,
qStride, kStride, position);
});
});
gpu_queue->wait();
}

#else

void LlamaRotaryEmbedding::forward(
float *query, float *key, int qStride, int kStride, const int *qkShape, const int *positionIds) {
int dim = inv_freq_size * 2;
Expand Down Expand Up @@ -214,3 +288,5 @@ void LlamaRotaryEmbedding::forward(
}
}
}

#endif // GPU
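
For reference (not from the diff): the rotation applied by the GPU rope_kernel above, written as plain scalar C++ for a single (token, head, dimension-pair) work item; cos_v and sin_v correspond to embCos/embSin at the token's position, and half is head_size / 2:

inline void rotate_pair(float *q, float *k, int half, float cos_v, float sin_v,
                        bool hasQ, bool hasK) {
    if (hasQ) {  // rotate the query pair (q[0], q[half])
        float q0 = q[0];
        q[0]    = q0 * cos_v - q[half] * sin_v;
        q[half] = q[half] * cos_v + q0 * sin_v;
    }
    if (hasK) {  // rotate the key pair (k[0], k[half])
        float k0 = k[0];
        k[0]    = k0 * cos_v - k[half] * sin_v;
        k[half] = k[half] * cos_v + k0 * sin_v;
    }
}
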
1 change: 1 addition & 0 deletions src/layers/rotary_embedding.h
Expand Up @@ -58,4 +58,5 @@ class LlamaRotaryEmbedding {
float *inv_freq = nullptr;
float *emb_cos = nullptr;
float *emb_sin = nullptr;
void *device = nullptr;
};
11 changes: 9 additions & 2 deletions src/models/common_decoder.h
Expand Up @@ -638,10 +638,17 @@ class CommonDecoder : public AbstractDecoder {
epsilon, vocabSize, embeddingSize, maxPositions, maxPosEmbed, maxSeqLength, tpRank, tpSize, ppSize,
ppRank, ropeParamsPtr, useLogN, useNTK));

int engineIdx = 0;
if (env.getEngineKind() == xft::DeviceKind::iGPU && env.getEngineIndex() < 0) // Sequential assignment
this->context->mmHelper = new MMHelper(env.getEngineKind(), ppRank * tpSize + tpRank);
engineIdx = ppRank * tpSize + tpRank;
else // assignment through the user
this->context->mmHelper = new MMHelper(env.getEngineKind(), env.getEngineIndex());
engineIdx = env.getEngineIndex();

this->context->mmHelper = new MMHelper(env.getEngineKind(), engineIdx);
#ifdef GPU
auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
this->context->device = new sycl::queue(devices[this->context->mmHelper->getEngineCount() + engineIdx]);
#endif
}

return this->context.get();
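
A hedged sketch of the SYCL device enumeration used above, assuming a oneAPI runtime; the offset mmHelper->getEngineCount() + engineIdx comes from the diff, while the queue construction and name query are illustrative:

#include <CL/sycl.hpp>
#include <cstdio>

void pick_gpu_queue() {
    // Enumerate every GPU visible to the SYCL runtime; the PR indexes into this
    // list with getEngineCount() + engineIdx, here we simply take the first one.
    auto gpus = sycl::device::get_devices(sycl::info::device_type::gpu);
    if (!gpus.empty()) {
        sycl::queue *queue = new sycl::queue(gpus[0]);
        printf("Selected GPU: %s\n",
               queue->get_device().get_info<sycl::info::device::name>().c_str());
    }
}
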
2 changes: 1 addition & 1 deletion src/models/model_factory.h
Expand Up @@ -109,4 +109,4 @@ class DecoderRegister {
MODEL(IMPLEMENT, CLASS, NAME)

#define REGISTER_MODEL(CLASS, NAME) \
MODEL(REGISTER, CLASS, NAME)
MODEL(REGISTER, CLASS, NAME)