Commit 27ba1a3

add detailed tests & README for samples
1 parent b6b175b commit 27ba1a3

16 files changed: 323 additions & 41 deletions

lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp

Lines changed: 22 additions & 0 deletions
@@ -9,6 +9,7 @@
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "Conversion/ConversionPasses.h"
+#include "llvm/Support/raw_ostream.h"

namespace mlir {
namespace neura {
@@ -62,6 +63,26 @@ struct LlvmFAddToNeuraFAdd : public OpRewritePattern<mlir::LLVM::FAddOp> {
  }
};

+struct LlvmFSubToNeuraFSub : public OpRewritePattern<mlir::LLVM::FSubOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(mlir::LLVM::FSubOp op,
+                                PatternRewriter &rewriter) const override {
+    Value lhs = op->getOperand(0);
+    Value rhs = op.getOperand(1);
+    Type result_type = op->getResult(0).getType();
+
+    // Only matches scalar float.
+    if (!mlir::isa<FloatType>(result_type)) {
+      return failure();
+    }
+
+    // Optional predicate: default to 'none'.
+    rewriter.replaceOpWithNewOp<neura::FSubOp>(op, result_type, lhs, rhs, Value());
+    return success();
+  }
+};
+
struct LlvmOrToNeuraOr : public OpRewritePattern<mlir::LLVM::OrOp> {
  using OpRewritePattern::OpRewritePattern;

@@ -316,6 +337,7 @@ struct LowerLlvmToNeuraPass
    patterns.add<LlvmBrToNeuraBr>(&getContext());
    patterns.add<LlvmReturnToNeuraReturn>(&getContext());
    patterns.add<FuncReturnToNeuraReturn>(&getContext());
+    patterns.add<LlvmFSubToNeuraFSub>(&getContext());

    FrozenRewritePatternSet frozen(std::move(patterns));

lib/Conversion/LlvmToNeura/LlvmToNeuraPatterns.td

Lines changed: 5 additions & 4 deletions
@@ -4,8 +4,9 @@ include "mlir/Dialect/LLVMIR/LLVMOps.td"
include "NeuraDialect/NeuraOps.td"

// Floating point binary operations.
-def : Pat<
-  (LLVM_FSubOp $lhs, $rhs, $_fastmath),
-  (Neura_FSubOp $lhs, $rhs)
->;
+// Deprecated pattern: the predicate bit needs to start out as null, so this lowering is now handled by the C++ LlvmFSubToNeuraFSub pattern instead.
+// def : Pat<
+//   (LLVM_FSubOp $lhs, $rhs, $_fastmath),
+//   (Neura_FSubOp $lhs, $rhs)
+// >;

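For orientation, here is a minimal sketch of the rewrite the new C++ pattern performs. The input function is hypothetical, and the printed form of `neura.fsub` is assumed to mirror the two-operand `neura.fadd`/`neura.fmul` forms checked in the tests below; since the pattern passes an empty `Value()` for the predicate, no predicate operand appears in the result.

```mlir
// Hypothetical input in the LLVM dialect.
llvm.func @fsub_example(%a: f32, %b: f32) -> f32 {
  %0 = llvm.fsub %a, %b : f32
  llvm.return %0 : f32
}

// Expected shape of the op after --lower-llvm-to-neura (sketch only):
//   %0 = "neura.fsub"(%a, %b) : (f32, f32) -> f32
```

The `test/neura/llvm_sub.mlir` file listed in the README tree below presumably covers this case end to end.
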
test/README.md

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# Tests for Neura

The structure of the files in this folder is as follows:

```
.
├── affine2neura
│   └── bert
├── arith2neura
│   ├── add.mlir
│   └── Output
├── c2llvm2mlir
│   ├── kernel.cpp
│   ├── Output
│   └── test.mlir
├── lit.cfg
├── lit.cfg.in
├── neura
│   ├── arith_add.mlir
│   ├── ctrl
│   ├── fadd_fadd.mlir
│   ├── for_loop
│   ├── interpreter
│   ├── llvm_add.mlir
│   ├── llvm_sub.mlir
│   └── Output
├── Output
│   └── test.mlir.script
├── README.md
├── samples
│   ├── bert
│   └── lenet
└── test.mlir
```

All of the above content can be divided into three categories:

## 1 Conversion Tests
We convert other dialects to our `neura` dialect for compilation and optimization. To verify the correctness of these conversions, each conversion pass into the `neura` dialect needs its own tests.

For now, we have:
- `affine2neura`: tests for `--lower-affine-to-neura` [to be provided]
- `arith2neura`: tests for `--lower-arith-to-neura`
- `c2llvm2mlir`: tests for `--lower-llvm-to-neura`

## 2 Neura Compiler Tests
Tests for individual passes and pass pipelines at the `neura` dialect level.

## 3 Samples
A collection of real-world applications used to generate small unit tests.

For now, [BERT](https://github.com/codertimo/BERT-pytorch) and [LeNet](https://github.com/kuangliu/pytorch-cifar/blob/master/models/lenet.py) are included.

We generate the `linalg`-dialect form of these models via [Torch-MLIR](https://github.com/llvm/torch-mlir), which is then lowered to the `affine` dialect for further lowering.

Due to data dependencies between loops in the models, we are currently unable to automatically extract each single loop from the model IR as an individual test.

However, we can manually derive small unit tests from these sample IRs. For example, you can write C++ code for a loop from BERT by mimicking its corresponding `affine.for` operations, then use [Polygeist](https://github.com/llvm/Polygeist) to convert that C++ code into `affine` MLIR for further lowering. That is how the tests in `affine2neura/bert` were generated.
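
For illustration, a conversion test in this layout is a lit/FileCheck `.mlir` file whose RUN lines drive the lowering pipeline and whose CHECK lines pin down the expected `neura` ops. The sketch below is hypothetical (the kernel name and body are invented), but its RUN pipeline mirrors the one used by the `affine2neura/bert` tests updated in this commit:

```mlir
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s

// Hypothetical single-loop kernel of the kind extracted from a sample IR.
func.func @double_elements(%in: memref<128xf32>, %out: memref<128xf32>) {
  affine.for %i = 0 to 128 {
    %v = affine.load %in[%i] : memref<128xf32>
    %d = arith.addf %v, %v : f32
    affine.store %d, %out[%i] : memref<128xf32>
  }
  return
}

// After lowering, the floating-point add should show up as a neura op:
// CHECK: "neura.fadd"
```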

test/affine2neura/bert/bert_node0/bert_node0.mlir

Lines changed: 24 additions & 6 deletions
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-llvm-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s

module attributes {} {
  func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi8>) attributes {} {
@@ -14,8 +14,26 @@ module attributes {} {
  }
}

-// CHECK-LABEL: func.func @_Z10bert_node0PA128_KiPA128_b
-// CHECK-NOT: arith.
-// CHECK-NOT: affine.
-// CHECK-NOT: llvm.
-
+// CHECK: func.func @_Z10bert_node0PA128_KiPA128_b(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi8>) attributes {accelerator = "neura"} {
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb2
+// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %7 = "neura.icmp"(%6, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb3
+// CHECK-NEXT: ^bb2: // pred: ^bb1
+// CHECK-NEXT: %8 = memref.load %arg0[%3, %6] : memref<?x128xi32>
+// CHECK-NEXT: %9 = "neura.icmp"(%8, %2) <{cmpType = "sgt"}> : (i32, i32) -> i1
+// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "extui"}> : (i1) -> i8
+// CHECK-NEXT: memref.store %10, %arg1[%3, %6] : memref<?x128xi8>
+// CHECK-NEXT: %11 = "neura.add"(%6, %0) : (index, index) -> index
+// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %11 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%12 : i64)
+// CHECK-NEXT: ^bb3: // pred: ^bb1
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NEXT: }

test/affine2neura/bert/bert_node1/bert_node1.mlir

Lines changed: 31 additions & 5 deletions
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-llvm-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
module attributes {} {
  func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {} {
    affine.for %arg2 = 0 to 128 {
@@ -12,7 +12,33 @@ module attributes {} {
  }
}

-// CHECK-LABEL: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_
-// CHECK-NOT: arith.
-// CHECK-NOT: affine.
-// CHECK-NOT: llvm.
+// CHECK: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura"} {
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %3 = builtin.unrealized_conversion_cast %2 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%3 : i64)
+// CHECK-NEXT: ^bb1(%4: i64): // 2 preds: ^bb0, ^bb5
+// CHECK-NEXT: %5 = builtin.unrealized_conversion_cast %4 : i64 to index
+// CHECK-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %6, ^bb2, ^bb6
+// CHECK-NEXT: ^bb2: // pred: ^bb1
+// CHECK-NEXT: %7 = builtin.unrealized_conversion_cast %2 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%7 : i64)
+// CHECK-NEXT: ^bb3(%8: i64): // 2 preds: ^bb2, ^bb4
+// CHECK-NEXT: %9 = builtin.unrealized_conversion_cast %8 : i64 to index
+// CHECK-NEXT: %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %10, ^bb4, ^bb5
+// CHECK-NEXT: ^bb4: // pred: ^bb3
+// CHECK-NEXT: %11 = memref.load %arg0[%2, %2, %2, %2, %2, %9] : memref<?x1x1x1x1x128xi8>
+// CHECK-NEXT: memref.store %11, %arg1[%2, %2, %5, %2, %2, %9] : memref<?x1x128x1x1x128xi8>
+// CHECK-NEXT: %12 = "neura.add"(%9, %0) : (index, index) -> index
+// CHECK-NEXT: %13 = builtin.unrealized_conversion_cast %12 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%13 : i64)
+// CHECK-NEXT: ^bb5: // pred: ^bb3
+// CHECK-NEXT: %14 = "neura.add"(%5, %0) : (index, index) -> index
+// CHECK-NEXT: %15 = builtin.unrealized_conversion_cast %14 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%15 : i64)
+// CHECK-NEXT: ^bb6: // pred: ^bb1
+// CHECK-NEXT: return
+// CHECK-NEXT: }

test/affine2neura/bert/bert_node2/bert_node2.mlir

Lines changed: 50 additions & 5 deletions
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-llvm-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
module attributes {} {
  func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref<?x128xi32>, %arg1: memref<?x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
    %false = arith.constant false
@@ -27,7 +27,52 @@ module attributes {} {
  }
}

-// CHECK-LABEL: func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f
-// CHECK-NOT: arith.
-// CHECK-NOT: affine.
-// CHECK-NOT: llvm.
+// CHECK: func.func @_Z10bert_node2PA128_KiPA768_KfPA128_A768_f(%arg0: memref<?x128xi32>, %arg1: memref<?x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %3 = "neura.constant"() <{value = false}> : () -> i1
+// CHECK-NEXT: %4 = "neura.constant"() <{value = 30521 : i32}> : () -> i32
+// CHECK-NEXT: %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT: %6 = "neura.constant"() <{value = 30522 : i32}> : () -> i32
+// CHECK-NEXT: %7 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %7 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%8 : i64)
+// CHECK-NEXT: ^bb1(%9: i64): // 2 preds: ^bb0, ^bb9
+// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %11 = "neura.icmp"(%10, %2) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %11, ^bb2, ^bb10
+// CHECK-NEXT: ^bb2: // pred: ^bb1
+// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %7 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%12 : i64)
+// CHECK-NEXT: ^bb3(%13: i64): // 2 preds: ^bb2, ^bb8
+// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %15, ^bb4, ^bb9
+// CHECK-NEXT: ^bb4: // pred: ^bb3
+// CHECK-NEXT: %16 = memref.load %arg0[%7, %10] : memref<?x128xi32>
+// CHECK-NEXT: %17 = "neura.icmp"(%16, %6) <{cmpType = "sge"}> : (i32, i32) -> i1
+// CHECK-NEXT: %18 = "neura.sel"(%4, %16, %17) : (i32, i32, i1) -> i32
+// CHECK-NEXT: llvm.cond_br %17, ^bb5, ^bb6
+// CHECK-NEXT: ^bb5: // pred: ^bb4
+// CHECK-NEXT: llvm.br ^bb7(%3 : i1)
+// CHECK-NEXT: ^bb6: // pred: ^bb4
+// CHECK-NEXT: %19 = "neura.icmp"(%16, %5) <{cmpType = "slt"}> : (i32, i32) -> i1
+// CHECK-NEXT: llvm.br ^bb7(%19 : i1)
+// CHECK-NEXT: ^bb7(%20: i1): // 2 preds: ^bb5, ^bb6
+// CHECK-NEXT: llvm.br ^bb8
+// CHECK-NEXT: ^bb8: // pred: ^bb7
+// CHECK-NEXT: %21 = "neura.sel"(%5, %18, %20) : (i32, i32, i1) -> i32
+// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "indexCast"}> : (i32) -> index
+// CHECK-NEXT: %23 = memref.load %arg1[%22, %14] : memref<?x768xf32>
+// CHECK-NEXT: memref.store %23, %arg2[%7, %10, %14] : memref<?x128x768xf32>
+// CHECK-NEXT: %24 = "neura.add"(%14, %1) : (index, index) -> index
+// CHECK-NEXT: %25 = builtin.unrealized_conversion_cast %24 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%25 : i64)
+// CHECK-NEXT: ^bb9: // pred: ^bb3
+// CHECK-NEXT: %26 = "neura.add"(%10, %1) : (index, index) -> index
+// CHECK-NEXT: %27 = builtin.unrealized_conversion_cast %26 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%27 : i64)
+// CHECK-NEXT: ^bb10: // pred: ^bb1
+// CHECK-NEXT: return
+// CHECK-NEXT: }

test/affine2neura/bert/bert_node28/bert_node28.mlir

Lines changed: 46 additions & 5 deletions
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-llvm-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
module attributes {} {
  func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
    affine.for %arg3 = 0 to 128 {
@@ -17,7 +17,48 @@ module attributes {} {
    return
  }
}
-// CHECK-LABEL: func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f
-// CHECK-NOT: arith.
-// CHECK-NOT: affine.
-// CHECK-NOT: llvm.
+// CHECK: func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb8
+// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb9
+// CHECK-NEXT: ^bb2: // pred: ^bb1
+// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%8 : i64)
+// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb7
+// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb8
+// CHECK-NEXT: ^bb4: // pred: ^bb3
+// CHECK-NEXT: %12 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb5(%12 : i64)
+// CHECK-NEXT: ^bb5(%13: i64): // 2 preds: ^bb4, ^bb6
+// CHECK-NEXT: %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %15, ^bb6, ^bb7
+// CHECK-NEXT: ^bb6: // pred: ^bb5
+// CHECK-NEXT: %16 = memref.load %arg0[%3, %6, %14] : memref<?x128x768xf32>
+// CHECK-NEXT: %17 = memref.load %arg1[%3, %14, %10] : memref<?x768x768xf32>
+// CHECK-NEXT: %18 = memref.load %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32
+// CHECK-NEXT: %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32
+// CHECK-NEXT: memref.store %20, %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %21 = "neura.add"(%14, %1) : (index, index) -> index
+// CHECK-NEXT: %22 = builtin.unrealized_conversion_cast %21 : index to i64
+// CHECK-NEXT: llvm.br ^bb5(%22 : i64)
+// CHECK-NEXT: ^bb7: // pred: ^bb5
+// CHECK-NEXT: %23 = "neura.add"(%10, %1) : (index, index) -> index
+// CHECK-NEXT: %24 = builtin.unrealized_conversion_cast %23 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%24 : i64)
+// CHECK-NEXT: ^bb8: // pred: ^bb3
+// CHECK-NEXT: %25 = "neura.add"(%6, %1) : (index, index) -> index
+// CHECK-NEXT: %26 = builtin.unrealized_conversion_cast %25 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%26 : i64)
+// CHECK-NEXT: ^bb9: // pred: ^bb1
+// CHECK-NEXT: return

test/affine2neura/bert/bert_node3/bert_node3.mlir

Lines changed: 33 additions & 5 deletions
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm -o %t-llvm.mlir
-// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura --lower-llvm-to-neura | FileCheck %s
+// RUN: mlir-neura-opt %t-llvm.mlir --assign-accelerator --lower-arith-to-neura | FileCheck %s
module attributes {} {
  func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x128x768xf32>, %arg2: memref<?x128x768xf32>) attributes {} {
    affine.for %arg3 = 0 to 128 {
@@ -14,7 +14,35 @@ module attributes {} {
  }
}

-// CHECK-LABEL: func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f
-// CHECK-NOT: arith.
-// CHECK-NOT: affine.
-// CHECK-NOT: llvm.
+// CHECK: func.func @_Z10bert_node3PA128_A768_KfS2_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x128x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
+// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index
+// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT: %4 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%4 : i64)
+// CHECK-NEXT: ^bb1(%5: i64): // 2 preds: ^bb0, ^bb5
+// CHECK-NEXT: %6 = builtin.unrealized_conversion_cast %5 : i64 to index
+// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %7, ^bb2, ^bb6
+// CHECK-NEXT: ^bb2: // pred: ^bb1
+// CHECK-NEXT: %8 = builtin.unrealized_conversion_cast %3 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%8 : i64)
+// CHECK-NEXT: ^bb3(%9: i64): // 2 preds: ^bb2, ^bb4
+// CHECK-NEXT: %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT: llvm.cond_br %11, ^bb4, ^bb5
+// CHECK-NEXT: ^bb4: // pred: ^bb3
+// CHECK-NEXT: %12 = memref.load %arg0[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %13 = memref.load %arg1[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %14 = "neura.fadd"(%12, %13) : (f32, f32) -> f32
+// CHECK-NEXT: memref.store %14, %arg2[%3, %6, %10] : memref<?x128x768xf32>
+// CHECK-NEXT: %15 = "neura.add"(%10, %1) : (index, index) -> index
+// CHECK-NEXT: %16 = builtin.unrealized_conversion_cast %15 : index to i64
+// CHECK-NEXT: llvm.br ^bb3(%16 : i64)
+// CHECK-NEXT: ^bb5: // pred: ^bb3
+// CHECK-NEXT: %17 = "neura.add"(%6, %1) : (index, index) -> index
+// CHECK-NEXT: %18 = builtin.unrealized_conversion_cast %17 : index to i64
+// CHECK-NEXT: llvm.br ^bb1(%18 : i64)
+// CHECK-NEXT: ^bb6: // pred: ^bb1
+// CHECK-NEXT: return
