foreach: Format codes for slow path #948

Open · wants to merge 1 commit into base: main
62 changes: 7 additions & 55 deletions src/ATen/native/xpu/ForeachOpList.cpp
@@ -1,4 +1,11 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
@@ -8,29 +15,6 @@
namespace at {
namespace native {

::std::vector<at::Tensor> foreach_tensor_mul_list_kernel_slow(
at::TensorList self,
at::TensorList other);
void foreach_tensor_mul_list_kernel_slow_(
at::TensorList self,
at::TensorList other);

::std::vector<at::Tensor> foreach_tensor_div_list_kernel_slow(
at::TensorList self,
at::TensorList other);
void foreach_tensor_div_list_kernel_slow_(
at::TensorList self,
at::TensorList other);

::std::vector<at::Tensor> foreach_tensor_add_list_kernel_slow(
at::TensorList self,
at::TensorList other,
const at::Scalar& alpha);
void foreach_tensor_add_list_kernel_slow_(
at::TensorList self,
at::TensorList other,
const at::Scalar& alpha);

#define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP) \
void foreach_tensor_##NAME##_list_kernel_xpu_( \
TensorList tensors1, TensorList tensors2) { \
@@ -81,28 +65,6 @@ FOREACH_BINARY_OP_LIST_ALPHA(add);
FOREACH_BINARY_OP_LIST(mul, false);
FOREACH_BINARY_OP_LIST(div, true);

::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcmul_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_POINTWISE_OP_TENSOR(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_xpu( \
TensorList input, \
@@ -142,11 +104,6 @@ void foreach_tensor_addcdiv_scalarlist_slow_(
FOREACH_POINTWISE_OP_TENSOR(addcmul)
FOREACH_POINTWISE_OP_TENSOR(addcdiv)

::std::vector<at::Tensor> foreach_tensor_ternary_lerp_slow(
at::TensorList self,
at::TensorList tensors1,
at::TensorList weights);

std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
TensorList tensors1,
TensorList tensors2,
@@ -166,11 +123,6 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
return vec_res;
}

void foreach_tensor_ternary_lerp_slow_(
at::TensorList self,
at::TensorList tensors1,
at::TensorList weights);

void foreach_tensor_lerp_ternary_xpu_(
TensorList tensors1,
TensorList tensors2,
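What this file's change amounts to: the XPU entry points validate the tensor lists, then either take the fused fast path or fall back to the reference slow kernels. Those slow kernels were previously forward-declared by hand at the top of the file; their declarations now come from the generated ATen/ops/_foreach_*_native.h headers added above. Below is a minimal sketch of the fallback shape, assuming the standard helpers from ATen/native/ForeachUtils.h and an illustrative fast-path kernel name (the real launch lives in ForeachBinaryOpListKernels.h):

// Sketch only: roughly what FOREACH_BINARY_OP_LIST(mul, /*division_op*/ false)
// produces. foreach_tensor_mul_list_kernel_slow is declared by the generated
// <ATen/ops/_foreach_mul_native.h> header, so no local forward declaration
// is needed anymore.
std::vector<Tensor> foreach_tensor_mul_list_kernel_xpu(
    TensorList tensors1,
    TensorList tensors2) {
  check_foreach_api_restrictions(tensors1, tensors2);
  if (!can_use_fast_route(tensors1, tensors2, /*division_op*/ false)) {
    // Slow path: loops over the lists with the per-tensor reference kernel.
    return foreach_tensor_mul_list_kernel_slow(tensors1, tensors2);
  }
  // Fast path: fused SYCL kernel; name assumed for illustration.
  return xpu::foreach_mul_list_kernel(tensors1, tensors2);
}

Relying on the generated headers instead of hand-written prototypes means a signature change in the slow kernels now surfaces as a compile error here rather than a silent mismatch.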
59 changes: 6 additions & 53 deletions src/ATen/native/xpu/ForeachOpScalar.cpp
@@ -1,34 +1,18 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>

namespace at {

namespace native {

::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_add_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_mul_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_div_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \
void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
TensorList tensors, const Scalar& scalar) { \
@@ -54,28 +38,6 @@ FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false);
FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false);
FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true);

::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);
void foreach_tensor_addcmul_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);

::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);
void foreach_tensor_addcdiv_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);

#define FOREACH_POINTWISE_OP_SCALAR(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_scalar_xpu( \
TensorList input, \
@@ -112,15 +74,6 @@ void foreach_tensor_addcdiv_scalar_slow_(
FOREACH_POINTWISE_OP_SCALAR(addcmul)
FOREACH_POINTWISE_OP_SCALAR(addcdiv)

::std::vector<at::Tensor> foreach_tensor_lerp_list_kernel_slow(
at::TensorList self,
at::TensorList tensors1,
const at::Scalar& weight);
void foreach_tensor_lerp_list_kernel_slow_(
at::TensorList self,
at::TensorList tensors1,
const at::Scalar& weight);

std::vector<at::Tensor> foreach_tensor_lerp_list_xpu(
TensorList tensors1,
TensorList tensors2,
84 changes: 5 additions & 79 deletions src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -1,4 +1,9 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
@@ -8,44 +13,6 @@

namespace at {
namespace native {
::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_add_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);
::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_mul_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_add_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_add_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
::std::vector<at::Tensor> foreach_tensor_mul_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_mul_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);

::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_div_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);
::std::vector<at::Tensor> foreach_tensor_div_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_div_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \
void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
@@ -74,47 +41,6 @@ FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false);
FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false);
FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true);

void foreach_tensor_addcmul_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcmul_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_xpu( \
TensorList input, \
5 changes: 2 additions & 3 deletions src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -1,14 +1,13 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_sqrt_native.h>

#include <ATen/native/xpu/sycl/ForeachUnaryKernels.h>

namespace at {
namespace native {
// given a functor and a "dispatch function", creates the outplace and inplace
// operations

::std::vector<at::Tensor> foreach_tensor_sqrt_slow(at::TensorList self);
void foreach_tensor_sqrt_slow_(at::TensorList self);

#define FOREACH_UNARY_OP(op_name) \
std::vector<Tensor> foreach_tensor_##op_name##_xpu(TensorList tensors) { \
check_foreach_api_restrictions(tensors); \
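The comment above ("given a functor and a 'dispatch function', creates the outplace and inplace operations") describes the unary variant of the same pattern. Here is an illustrative expansion of FOREACH_UNARY_OP(sqrt), a sketch with assumed fast-path kernel names rather than the PR's exact macro body; the slow kernels are declared by the ATen/ops/_foreach_sqrt_native.h header included above:

// Sketch: out-of-place and in-place entry points for _foreach_sqrt on XPU.
std::vector<Tensor> foreach_tensor_sqrt_xpu(TensorList tensors) {
  check_foreach_api_restrictions(tensors);
  if (!can_use_fast_route(tensors)) {
    return foreach_tensor_sqrt_slow(tensors); // per-tensor reference path
  }
  return xpu::foreach_sqrt_kernel(tensors); // fused SYCL kernel (name assumed)
}

void foreach_tensor_sqrt_xpu_(TensorList tensors) {
  check_foreach_api_restrictions(tensors);
  if (!can_use_fast_route(tensors)) {
    return foreach_tensor_sqrt_slow_(tensors); // in-place reference path
  }
  xpu::foreach_sqrt_kernel_(tensors); // in-place fused kernel (name assumed)
}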
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/Nonzero.cpp
@@ -5,7 +5,7 @@
#include <ATen/native/xpu/sycl/OffsetCalculator.h>

namespace at {
namespace native{
namespace native {
Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) {
TORCH_CHECK(
self.numel() < std::numeric_limits<int>::max(),
@@ -38,5 +38,5 @@ Tensor nonzero_xpu(const Tensor& self) {
nonzero_out_xpu(self, out);
return out;
}
}
} // namespace native
} // namespace at
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/RangeFactories.cpp
@@ -31,7 +31,8 @@ Tensor& arange_out_xpu(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
@@ -99,7 +100,8 @@ Tensor& range_xpu_out(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
1 change: 0 additions & 1 deletion src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -2,7 +2,6 @@
#include <ATen/OpMathType.h>
#include <ATen/TensorIterator.h>

#include <ATen/native/xpu/sycl/Loops.h>
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/SYCLContext.h>

2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -5,8 +5,8 @@
#include <ATen/native/xpu/sycl/ForeachFunctors.h>
#include <ATen/native/xpu/sycl/MultiTensorApply.h>

#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>

namespace at::native::xpu {

8 changes: 6 additions & 2 deletions src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -4,9 +4,13 @@

namespace at::native::xpu {

TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void addcmul_kernel(
TensorIteratorBase& iter,
const Scalar& value);

TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void addcdiv_kernel(
TensorIteratorBase& iter,
const Scalar& value);

TORCH_XPU_API void mse_backward_kernel(
TensorIterator& iter,
3 changes: 2 additions & 1 deletion src/ATen/native/xpu/sycl/PowKernels.cpp
@@ -38,7 +38,8 @@ static inline c10::complex<T> pow_(c10::complex<T> base, c10::complex<T> exp) {
} // namespace impl

#ifdef _MSC_VER
// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842.
// Divergence for MSVC due to accuracy issue.
// https://github.com/intel/torch-xpu-ops/issues/842.
template <typename scalar_t>
struct PowTensorTensorCastFunctor {
using opmath_t = at::opmath_type<scalar_t>;
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -1,8 +1,8 @@
#include <ATen/Dispatch.h>

#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/ops/imag.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>
#include <ATen/ops/imag.h>

#include <ATen/native/xpu/sycl/ReduceNormKernel.h>
