foreach: Format codes for slow path #948

Open · wants to merge 1 commit into base: main
62 changes: 7 additions & 55 deletions src/ATen/native/xpu/ForeachOpList.cpp
@@ -1,4 +1,11 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
@@ -8,29 +15,6 @@
namespace at {
namespace native {

::std::vector<at::Tensor> foreach_tensor_mul_list_kernel_slow(
at::TensorList self,
at::TensorList other);
void foreach_tensor_mul_list_kernel_slow_(
at::TensorList self,
at::TensorList other);

::std::vector<at::Tensor> foreach_tensor_div_list_kernel_slow(
at::TensorList self,
at::TensorList other);
void foreach_tensor_div_list_kernel_slow_(
at::TensorList self,
at::TensorList other);

::std::vector<at::Tensor> foreach_tensor_add_list_kernel_slow(
at::TensorList self,
at::TensorList other,
const at::Scalar& alpha);
void foreach_tensor_add_list_kernel_slow_(
at::TensorList self,
at::TensorList other,
const at::Scalar& alpha);

#define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP) \
void foreach_tensor_##NAME##_list_kernel_xpu_( \
TensorList tensors1, TensorList tensors2) { \
@@ -81,28 +65,6 @@ FOREACH_BINARY_OP_LIST_ALPHA(add);
FOREACH_BINARY_OP_LIST(mul, false);
FOREACH_BINARY_OP_LIST(div, true);

::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcmul_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_POINTWISE_OP_TENSOR(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_xpu( \
TensorList input, \
@@ -142,11 +104,6 @@ void foreach_tensor_addcdiv_scalarlist_slow_(
FOREACH_POINTWISE_OP_TENSOR(addcmul)
FOREACH_POINTWISE_OP_TENSOR(addcdiv)

::std::vector<at::Tensor> foreach_tensor_ternary_lerp_slow(
at::TensorList self,
at::TensorList tensors1,
at::TensorList weights);

std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
TensorList tensors1,
TensorList tensors2,
@@ -166,11 +123,6 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
return vec_res;
}

void foreach_tensor_ternary_lerp_slow_(
at::TensorList self,
at::TensorList tensors1,
at::TensorList weights);

void foreach_tensor_lerp_ternary_xpu_(
TensorList tensors1,
TensorList tensors2,
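What this file's change amounts to: the XPU entry points validate the tensor lists, then either take the fused fast path or fall back to the reference slow kernels. Those slow kernels were previously forward-declared by hand at the top of the file; their declarations now come from the generated ATen/ops/_foreach_*_native.h headers added above. Below is a minimal sketch of the fallback shape, assuming the standard helpers from ATen/native/ForeachUtils.h and an illustrative fast-path kernel name (the real launch lives in ForeachBinaryOpListKernels.h):

// Sketch only: roughly what FOREACH_BINARY_OP_LIST(mul, /*division_op*/ false)
// produces. foreach_tensor_mul_list_kernel_slow is declared by the generated
// <ATen/ops/_foreach_mul_native.h> header, so no local forward declaration
// is needed anymore.
std::vector<Tensor> foreach_tensor_mul_list_kernel_xpu(
    TensorList tensors1,
    TensorList tensors2) {
  check_foreach_api_restrictions(tensors1, tensors2);
  if (!can_use_fast_route(tensors1, tensors2, /*division_op*/ false)) {
    // Slow path: loops over the lists with the per-tensor reference kernel.
    return foreach_tensor_mul_list_kernel_slow(tensors1, tensors2);
  }
  // Fast path: fused SYCL kernel; name assumed for illustration.
  return xpu::foreach_mul_list_kernel(tensors1, tensors2);
}

Relying on the generated headers instead of hand-written prototypes means a signature change in the slow kernels now surfaces as a compile error here rather than a silent mismatch.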
59 changes: 6 additions & 53 deletions src/ATen/native/xpu/ForeachOpScalar.cpp
@@ -1,34 +1,18 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>

namespace at {

namespace native {

::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_add_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_mul_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_div_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

#define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \
void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
TensorList tensors, const Scalar& scalar) { \
@@ -54,28 +38,6 @@ FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false);
FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false);
FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true);

::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);
void foreach_tensor_addcmul_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);

::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);
void foreach_tensor_addcdiv_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value);

#define FOREACH_POINTWISE_OP_SCALAR(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_scalar_xpu( \
TensorList input, \
@@ -112,15 +74,6 @@ void foreach_tensor_addcdiv_scalar_slow_(
FOREACH_POINTWISE_OP_SCALAR(addcmul)
FOREACH_POINTWISE_OP_SCALAR(addcdiv)

::std::vector<at::Tensor> foreach_tensor_lerp_list_kernel_slow(
at::TensorList self,
at::TensorList tensors1,
const at::Scalar& weight);
void foreach_tensor_lerp_list_kernel_slow_(
at::TensorList self,
at::TensorList tensors1,
const at::Scalar& weight);

std::vector<at::Tensor> foreach_tensor_lerp_list_xpu(
TensorList tensors1,
TensorList tensors2,
84 changes: 5 additions & 79 deletions src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -1,4 +1,9 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_add_native.h>
#include <ATen/ops/_foreach_addcdiv_native.h>
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
@@ -8,44 +13,6 @@

namespace at {
namespace native {
::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_add_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);
::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_mul_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);

::std::vector<at::Tensor> foreach_tensor_add_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_add_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
::std::vector<at::Tensor> foreach_tensor_mul_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_mul_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);

::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
at::TensorList self,
const at::Scalar& scalar);
void foreach_tensor_div_scalar_kernel_slow_(
at::TensorList self,
const at::Scalar& scalar);
::std::vector<at::Tensor> foreach_tensor_div_scalarlist_kernel_slow(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_div_scalarlist_kernel_slow_(
at::TensorList self,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \
void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
@@ -74,47 +41,6 @@ FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false);
FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false);
FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true);

void foreach_tensor_addcmul_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcmul_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalar_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
const at::Scalar& value = 1);
::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);
void foreach_tensor_addcdiv_scalarlist_slow_(
at::TensorList self,
at::TensorList tensor1,
at::TensorList tensor2,
at::ArrayRef<at::Scalar> scalars);

#define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \
std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_xpu( \
TensorList input, \
5 changes: 2 additions & 3 deletions src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -1,14 +1,13 @@
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_sqrt_native.h>

#include <ATen/native/xpu/sycl/ForeachUnaryKernels.h>

namespace at {
namespace native {
// given a functor and a "dispatch function", creates the outplace and inplace
// operations

::std::vector<at::Tensor> foreach_tensor_sqrt_slow(at::TensorList self);
void foreach_tensor_sqrt_slow_(at::TensorList self);

#define FOREACH_UNARY_OP(op_name) \
std::vector<Tensor> foreach_tensor_##op_name##_xpu(TensorList tensors) { \
check_foreach_api_restrictions(tensors); \
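The comment above ("given a functor and a 'dispatch function', creates the outplace and inplace operations") describes the unary variant of the same pattern. Here is an illustrative expansion of FOREACH_UNARY_OP(sqrt), a sketch with assumed fast-path kernel names rather than the PR's exact macro body; the slow kernels are declared by the ATen/ops/_foreach_sqrt_native.h header included above:

// Sketch: out-of-place and in-place entry points for _foreach_sqrt on XPU.
std::vector<Tensor> foreach_tensor_sqrt_xpu(TensorList tensors) {
  check_foreach_api_restrictions(tensors);
  if (!can_use_fast_route(tensors)) {
    return foreach_tensor_sqrt_slow(tensors); // per-tensor reference path
  }
  return xpu::foreach_sqrt_kernel(tensors); // fused SYCL kernel (name assumed)
}

void foreach_tensor_sqrt_xpu_(TensorList tensors) {
  check_foreach_api_restrictions(tensors);
  if (!can_use_fast_route(tensors)) {
    return foreach_tensor_sqrt_slow_(tensors); // in-place reference path
  }
  xpu::foreach_sqrt_kernel_(tensors); // in-place fused kernel (name assumed)
}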
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/Nonzero.cpp
@@ -5,7 +5,7 @@
#include <ATen/native/xpu/sycl/OffsetCalculator.h>

namespace at {
namespace native{
namespace native {
Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) {
TORCH_CHECK(
self.numel() < std::numeric_limits<int>::max(),
@@ -38,5 +38,5 @@ Tensor nonzero_xpu(const Tensor& self) {
nonzero_out_xpu(self, out);
return out;
}
}
} // namespace native
} // namespace at
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/RangeFactories.cpp
@@ -31,7 +31,8 @@ Tensor& arange_out_xpu(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
@@ -99,7 +100,8 @@ Tensor& range_xpu_out(

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
1 change: 0 additions & 1 deletion src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -2,7 +2,6 @@
#include <ATen/OpMathType.h>
#include <ATen/TensorIterator.h>

#include <ATen/native/xpu/sycl/Loops.h>
#include <ATen/native/xpu/sycl/Loops.h>
#include <comm/SYCLContext.h>

2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -5,8 +5,8 @@
#include <ATen/native/xpu/sycl/ForeachFunctors.h>
#include <ATen/native/xpu/sycl/MultiTensorApply.h>

#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>

namespace at::native::xpu {

8 changes: 6 additions & 2 deletions src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -4,9 +4,13 @@

namespace at::native::xpu {

TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void addcmul_kernel(
TensorIteratorBase& iter,
const Scalar& value);

TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value);
TORCH_XPU_API void addcdiv_kernel(
TensorIteratorBase& iter,
const Scalar& value);

TORCH_XPU_API void mse_backward_kernel(
TensorIterator& iter,
3 changes: 2 additions & 1 deletion src/ATen/native/xpu/sycl/PowKernels.cpp
@@ -38,7 +38,8 @@ static inline c10::complex<T> pow_(c10::complex<T> base, c10::complex<T> exp) {
} // namespace impl

#ifdef _MSC_VER
// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842.
// Divergence for MSVC due to accuracy issue.
// https://github.com/intel/torch-xpu-ops/issues/842.
template <typename scalar_t>
struct PowTensorTensorCastFunctor {
using opmath_t = at::opmath_type<scalar_t>;
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -1,8 +1,8 @@
#include <ATen/Dispatch.h>

#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/ops/imag.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>
#include <ATen/ops/imag.h>

#include <ATen/native/xpu/sycl/ReduceNormKernel.h>
