Remove Intel code in common Triton GPU Dialect source code.
chengjunlu committed Dec 17, 2024
1 parent f1a893a commit ede71d8
Showing 4 changed files with 71 additions and 62 deletions.
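At a glance, the refactoring replaces backend-specific branches in the common TritonGPU dialect (such as the intel::DpasEncodingAttr checks deleted below) with a verification hook on the layout dialect interface, so each backend validates its own dot-operand encodings. The stand-alone C++ sketch below illustrates only that dispatch pattern; it is not MLIR code, and every name in it (BackendLayoutInterface, DpasLikeBackend, opsPerChannel) is an illustrative stand-in for DialectInferLayoutInterface::verifyDotOpEncoding and the Intel DPAS kWidth check.

#include <iostream>

// Stand-in for DialectInferLayoutInterface: the common code only knows this
// interface, never a concrete backend encoding.
struct BackendLayoutInterface {
  virtual ~BackendLayoutInterface() = default;
  // Mirrors verifyDotOpEncoding(opIdx, parent, kWidth); returns false on error.
  virtual bool verifyDotOpEncoding(unsigned opIdx, unsigned kWidth) const = 0;
};

// Stand-in for the Intel implementation: kWidth must match opsPerChannel,
// analogous to the DpasEncodingAttr check moved into the Intel dialect below.
struct DpasLikeBackend : BackendLayoutInterface {
  unsigned opsPerChannel;
  explicit DpasLikeBackend(unsigned ops) : opsPerChannel(ops) {}
  bool verifyDotOpEncoding(unsigned /*opIdx*/, unsigned kWidth) const override {
    return kWidth == opsPerChannel;
  }
};

// Stand-in for DotOperandEncodingAttr::verify: dispatch through the interface
// instead of hard-coding every backend.
bool verifyDotOp(const BackendLayoutInterface &backend, unsigned opIdx,
                 unsigned kWidth) {
  if (!backend.verifyDotOpEncoding(opIdx, kWidth)) {
    std::cerr << "ttg.dot_op is invalid with this parent layout\n";
    return false;
  }
  return true;
}

int main() {
  DpasLikeBackend dpas(/*opsPerChannel=*/2);
  std::cout << verifyDotOp(dpas, /*opIdx=*/0, /*kWidth=*/2) << "\n"; // 1 (ok)
  std::cout << verifyDotOp(dpas, /*opIdx=*/0, /*kWidth=*/4) << "\n"; // 0 (rejected)
}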
5 changes: 5 additions & 0 deletions include/triton/Dialect/Triton/IR/Dialect.h
@@ -76,6 +76,11 @@ class DialectInferLayoutInterface
virtual LogicalResult
verifyDotOpEncodingCompatibility(Operation *op, Attribute operandEncodingA,
Attribute operandEncodingB) const = 0;

// Verify that the dotOp layout encoding is legal when it uses a third-party
// Triton GPU dialect attribute as its parent.
virtual LogicalResult verifyDotOpEncoding(unsigned opIdx, Attribute parent,
unsigned kWidth) const = 0;
};

class DialectVerifyTensorLayoutInterface
107 changes: 46 additions & 61 deletions lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -304,16 +304,6 @@ SmallVector<unsigned> getOrder(Attribute layout) {
}
if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
auto rank = dotLayout.getWarpsPerCTA().size();
// FIXME: delete if branch for `DpasEncodingAttr` and provide more
// general solution to make `getOrderForDotOperand` function compatible
// with Intel layouts.
// More details:
// https://github.com/intel/intel-xpu-backend-for-triton/pull/2517
if (dyn_cast<intel::DpasEncodingAttr>(dotLayout.getParent())) {
SmallVector<unsigned> order(rank);
std::iota(order.rbegin(), order.rend(), 0);
return order;
}
return getOrderForDotOperand(dotLayout.getOpIdx(), rank, /*kMajor*/ true);
}
if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
@@ -1093,10 +1083,6 @@ unsigned DotOperandEncodingAttr::getTotalElemsPerThread(ArrayRef<int64_t> shape,
return amdWmmaParent.getTotalElemsPerThreadForOperand(
shape, eltTy, getKWidth(), getOpIdx());
}
if (auto dpasParent = mlir::dyn_cast<intel::DpasEncodingAttr>(mmaParent)) {
return dpasParent.getTotalElemsPerThreadForOperand(
shape, eltTy, getKWidth(), getOpIdx());
}
}
if (auto blockedLayout = mlir::dyn_cast<BlockedEncodingAttr>(getParent())) {
auto shapePerCTA = getShapePerCTA(*this, shape);
@@ -1161,17 +1147,8 @@ SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
return {};
}
SmallVector<unsigned> DotOperandEncodingAttr::getThreadOrder() const {
// FIXME: delete if branch for `DpasEncodingAttr` and provide more
// general solution to make `getOrderForDotOperand` function compatible
// with Intel layouts.
// More details:
// https://github.com/intel/intel-xpu-backend-for-triton/pull/2517
if (mlir::dyn_cast<intel::DpasEncodingAttr>(getParent())) {
return ::getOrder(*this);
} else {
return getOrderForDotOperand(getOpIdx(), getWarpsPerCTA().size(),
/*kMajor*/ true);
}
return getOrderForDotOperand(getOpIdx(), getWarpsPerCTA().size(),
/*kMajor*/ true);
}

LogicalResult DotOperandEncodingAttr::verify(
@@ -1184,42 +1161,6 @@ LogicalResult DotOperandEncodingAttr::verify(
if (!parent) {
return emitError() << "ttg.dot_op parent paramenter cannot be null";
}
if (auto parentAttr = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
if (kWidth != 0 && !(parentAttr.isAmpere() || parentAttr.isHopper()))
return emitError() << "ttg.dot_op kWidth parameter can only be "
"non-zero for Ampere or Hopper MMA parent";
if (kWidth == 0 && (parentAttr.isAmpere() || parentAttr.isHopper()))
return emitError() << "ttg.dot_op kWidth parameter is mandatory for "
"Ampere or Hopper MMA parent";
if (opIdx != 0 && parentAttr.isHopper())
return emitError()
<< "ttg.dot_op opIdx parameter must be 0 for "
"Hopper MMA parent, since Hopper WGMMA only allows first "
"operand to be in registers";
return success();
}

if (auto parentAttr = mlir::dyn_cast<AMDWmmaEncodingAttr>(parent)) {
if (kWidth != 16 && parentAttr.getVersion() == 1 ||
kWidth != 8 && parentAttr.getVersion() == 2)
return emitError() << "ttg.dot_op kWidth parameter must be 16 for "
"gfx11 and 8 for gfx12";
return success();
}

if (auto parentAttr = mlir::dyn_cast<AMDMfmaEncodingAttr>(parent)) {
if (kWidth == 0)
return emitError() << "ttg.dot_op kWidth parameter is mandatory for "
"MFMA parent";
return success();
}

if (auto parentAttr = mlir::dyn_cast<intel::DpasEncodingAttr>(parent)) {
if (kWidth != parentAttr.getOpsPerChannel())
return emitError() << "ttg.dot_op kWidth parameter must match the "
"parent's opsPerChannel";
return success();
}

if (auto parentAttr = mlir::dyn_cast<intel::WarpEncodingAttr>(parent)) {
if (kWidth != 0)
@@ -1228,6 +1169,14 @@ LogicalResult DotOperandEncodingAttr::verify(
return success();
}

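// Backend-specific verification now goes through the dialect that owns the
// parent attribute: the NVIDIA/AMD checks that used to be inlined here move
// into TritonGPUInferLayoutInterface further down, and the Intel DPAS check
// moves into the Intel dialect's interface implementation.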
if (auto parentAttr = mlir::dyn_cast<MmaEncodingTrait>(parent)) {
Dialect &dialect = parentAttr.getDialect();
auto interface = mlir::cast<DialectInferLayoutInterface>(&dialect);
if (interface->verifyDotOpEncoding(opIdx, parent, kWidth).failed())
return emitError() << "ttg.dot_op is invalid with parent layout: "
<< parent;
}

if (auto parentAttr = mlir::dyn_cast<BlockedEncodingAttr>(parent)) {
if (kWidth != 0)
return emitError() << "ttg.dot_op kWidth parameter is not supported "
@@ -2678,6 +2627,42 @@ struct TritonGPUInferLayoutInterface
return success();
}

LogicalResult verifyDotOpEncoding(unsigned opIdx, Attribute parent,
unsigned kWidth) const override {
// The interface receives no location or diagnostic callback, so report
// errors at an unknown location; the caller in DotOperandEncodingAttr::verify
// wraps the failure in its own diagnostic as well.
auto emitError = [&]() {
  return mlir::emitError(mlir::UnknownLoc::get(parent.getContext()));
};
if (auto parentAttr = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
if (kWidth != 0 && !(parentAttr.isAmpere() || parentAttr.isHopper()))
return emitError() << "ttg.dot_op kWidth parameter can only be "
"non-zero for Ampere or Hopper MMA parent";
if (kWidth == 0 && (parentAttr.isAmpere() || parentAttr.isHopper()))
return emitError() << "ttg.dot_op kWidth parameter is mandatory for "
"Ampere or Hopper MMA parent";
if (opIdx != 0 && parentAttr.isHopper())
return emitError()
<< "ttg.dot_op opIdx parameter must be 0 for "
"Hopper MMA parent, since Hopper WGMMA only allows first "
"operand to be in registers";
return success();
}

if (auto parentAttr = mlir::dyn_cast<AMDWmmaEncodingAttr>(parent)) {
if (kWidth != 16 && parentAttr.getVersion() == 1 ||
kWidth != 8 && parentAttr.getVersion() == 2)
return emitError() << "ttg.dot_op kWidth parameter must be 16 for "
"gfx11 and 8 for gfx12";
return success();
}

if (auto parentAttr = mlir::dyn_cast<AMDMfmaEncodingAttr>(parent)) {
if (kWidth == 0)
return emitError() << "ttg.dot_op kWidth parameter is mandatory for "
"MFMA parent";
return success();
}

return emitError() << "ttg.dot_op unknown parent layout: " << parent;
}

// Given a src shape + encoding and a dst shape, our goal is to compute a dst
// encoding that makes the reshape a "nop". That is, if GPU thread [x,y,z]
// contains elements [a,b,c,d] before the reshape, it contains those same
14 changes: 14 additions & 0 deletions third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -639,6 +639,20 @@ struct TritonIntelGPUInferLayoutInterface
return success();
}

LogicalResult verifyDotOpEncoding(unsigned opIdx, Attribute parent,
unsigned kWidth) const override {
// As in the common TritonGPU implementation, report errors at an unknown
// location since the interface provides no location or diagnostic callback.
auto emitError = [&]() {
  return mlir::emitError(mlir::UnknownLoc::get(parent.getContext()));
};

if (auto parentAttr = mlir::dyn_cast<intel::DpasEncodingAttr>(parent)) {
if (kWidth != parentAttr.getOpsPerChannel())
return emitError() << "ttg.dot_op kWidth parameter must match the "
"parent's opsPerChannel";
return success();
}

return emitError() << "ttg.dot_op unknown parent layout: " << parent;
}

LogicalResult
inferReshapeOpNoReorderEncoding(ArrayRef<int64_t> srcShape, Attribute srcEnc,
ArrayRef<int64_t> dstShape, Attribute &dstEnc,
@@ -515,7 +515,12 @@ struct LoadOpConversion
const bool memoryRowMajor = (memoryLayoutInfo == "row_major");

DotOperandEncodingAttr dotLayout = getDotEncoding(tensorType).value();
auto dotOrder = dotLayout.getThreadOrder();
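// With the DpasEncodingAttr special case removed from
// DotOperandEncodingAttr::getThreadOrder above, the Intel load lowering now
// derives the thread order from the layout's LinearLayout form rather than
// calling getThreadOrder on the dot layout directly.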
std::optional<LinearLayout> dotLL =
dotLayout.toLinearLayout(tensorType.getShape());
assert(dotLL.has_value() && "invalid dot layout to linear layout");
LinearEncodingAttr dotLLAttr =
LinearEncodingAttr::get(rewriter.getContext(), *dotLL);
SmallVector<unsigned> dotOrder = dotLLAttr.getThreadOrder();
size_t rank = dotOrder.size();
const bool valueRowMajor =
(dotOrder[rank - 2] == 1 && dotOrder[rank - 1] == 0);