Skip to content

Commit

Permalink
Convert tosa.bitwise_not to aievec.bneg to compute bitwise not for in…
Browse files Browse the repository at this point in the history
…teger types (#709)
  • Loading branch information
linay-xsj authored Nov 2, 2023
1 parent 0128219 commit 8bf4969
Show file tree
Hide file tree
Showing 17 changed files with 329 additions and 7 deletions.
16 changes: 16 additions & 0 deletions include/aie/Dialect/AIEVec/IR/AIEVecOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -759,4 +759,20 @@ def AIEVec_BxorOp:
let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type($lhs) `,` type($rhs) `,` type($result)";
let hasVerifier = 0;
}

// aievec.bneg: unary bitwise-negation op.
//  - Pure: declared free of side effects.
//  - AllTypesMatch<["source", "result"]>: the operand and the result share one
//    type, which is why the assembly format only spells out type($result).
//  - Operand and result are both constrained to AnyVector.
//  - hasVerifier = 0: no custom C++ verifier is declared for this op.
def AIEVec_BnegOp:
AIEVec_Op<"bneg", [
Pure,
AllTypesMatch<["source", "result"]>
]>,
Arguments<(ins AnyVector:$source)>,
Results<(outs AnyVector:$result)> {
let summary = "AIE vector bitwise negation";
let description = [{
AMD-specific intrinsic that computes bitwise negation of a vector and returns the result.
`$result = bneg(`$source`).
}];
let assemblyFormat = "$source attr-dict `:` type($result)";
let hasVerifier = 0;
}
#endif // AIEVEC_OPS
43 changes: 39 additions & 4 deletions lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2396,9 +2396,30 @@ struct ComputeNegOpPattern : public OpConversionPattern<arith::NegFOp> {
}
};

// Check whether `constOp` holds a dense integer splat whose element value is
// -1 (all bits set) for elements that are `elWidth` bits wide.
//
// Returns false when:
//  - `constOp` is null,
//  - its value is not a DenseIntElementsAttr,
//  - the attribute is not a splat,
//  - `elWidth` is not one of 8, 16 or 32 (the widths this helper has always
//    accepted), or
//  - the attribute's element width differs from `elWidth`.
static bool hasConstNegOneValue(arith::ConstantOp constOp, unsigned elWidth) {
  if (!constOp)
    return false;

  auto cstDense = dyn_cast<DenseIntElementsAttr>(constOp.getValue());
  // getSplatValue() asserts on non-splat attributes, so a constant such as
  // dense<[-1, 0, ...]> has to be rejected before it is queried.
  if (!cstDense || !cstDense.isSplat())
    return false;

  if (elWidth != 8 && elWidth != 16 && elWidth != 32)
    return false;

  // Reading the splat as an APInt is valid for any integer element type, so a
  // mismatch between `elWidth` and the attribute's real element width yields
  // `false` instead of tripping the typed accessor's width assertion.
  const llvm::APInt splat = cstDense.getSplatValue<llvm::APInt>();
  return splat.getBitWidth() == elWidth && splat.isAllOnes();
}

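// Convert arith.xori on integer vectors to aievec.bxor (bitwise xor of two
// vectors). When one operand is a constant splat of -1, the xor is a bitwise
// not and is converted to aievec.bneg instead.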
struct ComputeBxorOpPattern : public OpConversionPattern<arith::XOrIOp> {
struct ComputeBxorAndBnegOpPattern : public OpConversionPattern<arith::XOrIOp> {
using OpConversionPattern<arith::XOrIOp>::OpConversionPattern;

LogicalResult
Expand All @@ -2421,8 +2442,22 @@ struct ComputeBxorOpPattern : public OpConversionPattern<arith::XOrIOp> {
return failure();
}

rewriter.replaceOpWithNewOp<aievec::BxorOp>(
xorOp, srcType, adaptor.getLhs(), adaptor.getRhs());
auto lhsConstOp =
dyn_cast<arith::ConstantOp>(xorOp.getLhs().getDefiningOp());
auto rhsConstOp =
dyn_cast<arith::ConstantOp>(xorOp.getRhs().getDefiningOp());

// If one of operands in xorOp is a constant -1, xorOp will be replaced with
// aievec::BnegOp.
if ((lhsConstOp && hasConstNegOneValue(lhsConstOp, elWidth)) ||
(rhsConstOp && hasConstNegOneValue(rhsConstOp, elWidth))) {
Value val = hasConstNegOneValue(lhsConstOp, elWidth) ? adaptor.getRhs()
: adaptor.getLhs();
rewriter.replaceOpWithNewOp<aievec::BnegOp>(xorOp, srcType, val);
} else {
rewriter.replaceOpWithNewOp<aievec::BxorOp>(
xorOp, srcType, adaptor.getLhs(), adaptor.getRhs());
}
return success();
}
};
Expand Down Expand Up @@ -2468,7 +2503,7 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns,
ComputeCeilOpPattern,
ComputeFloorOpPattern,
ComputeNegOpPattern,
ComputeBxorOpPattern,
ComputeBxorAndBnegOpPattern,
ConvertMulIToAIEVecMulElemOpPattern,
LowerVectorAddFOpToAIEVecAddElemOp,
LowerVectorSubFOpToAIEVecSubElemOp,
Expand Down
23 changes: 22 additions & 1 deletion lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,27 @@ static LogicalResult printOperation(CppEmitter &emitter, aievec::NegOp negOp) {
return success();
}

// Emit the C++ for an aievec.bneg op: the assignment prefix for the op's
// result followed by `bneg(<source>)`. Fails if the source value has not been
// emitted yet or if the assignment prefix cannot be emitted.
static LogicalResult printOperation(CppEmitter &emitter,
                                    aievec::BnegOp bnegOp) {
  auto operand = bnegOp.getSource();

  // Nothing can be printed for an operand that is not in scope.
  if (!emitter.hasValueInScope(operand))
    return failure();

  // Left-hand side of the statement: declaration/assignment of the result.
  if (failed(emitter.emitAssignPrefix(*bnegOp)))
    return failure();

  // Right-hand side: the intrinsic call on the already-named operand.
  raw_indented_ostream &out = emitter.ostream();
  out << "bneg(" << emitter.getOrCreateName(operand) << ")";
  return success();
}

// Generate the Bxor op
static LogicalResult printOperation(CppEmitter &emitter, aievec::BxorOp xorOp) {
auto lhs = xorOp.getLhs();
Expand Down Expand Up @@ -3003,7 +3024,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
aievec::ShiftOp, aievec::ShuffleOp, aievec::CastOp,
aievec::MinOp, aievec::MaxOp, aievec::NegOp, aievec::CmpOp,
aievec::SelOp, aievec::ExtElemOp, aievec::BxorOp,
aievec::UnpackOp>(
aievec::BnegOp, aievec::UnpackOp>(
[&](auto op) { return printOperation(*this, op); })
.Default([&](Operation *) {
return op.emitOpError("unable to find printer for op");
Expand Down
3 changes: 3 additions & 0 deletions test/Integration/Dialect/TOSA/i16_bitwise_not/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
constexpr unsigned const IN0_SIZE = 1024;
constexpr unsigned const OUT0_SIZE = 1024;
12 changes: 12 additions & 0 deletions test/Integration/Dialect/TOSA/i16_bitwise_not/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Bitwise-not kernel for int16 data: reads from v1, writes ~v1 to v2.
// NOTE(review): this looks like the aievec-to-cpp output for the
// tosa.bitwise_not test in this directory -- confirm before hand-editing.
void dut(int16_t *restrict v1, int16_t *restrict v2) {
// Loop bounds: start 0, end 1024 elements, step 32 elements per iteration.
size_t v3 = 0;
size_t v4 = 1024;
size_t v5 = 32;
// 1024 / 32 = 32 iterations; chess_loop_range(32, 32) carries the same number
// (presumably the min/max trip count hint for the chess compiler -- confirm).
for (size_t v6 = v3; v6 < v4; v6 += v5)
chess_prepare_for_pipelining chess_loop_range(32, 32) {
// Load one v32int16 vector from the input at element offset v6.
v32int16 v7 = *(v32int16 *)(v1 + v6);
// Bitwise negation of the whole vector.
v32int16 v8 = bneg(v7);
// Store the result at the same element offset in the output.
*(v32int16 *)(v2 + v6) = v8;
}
return;
}
13 changes: 13 additions & 0 deletions test/Integration/Dialect/TOSA/i16_bitwise_not/i16_bitwise_not.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// REQUIRES: valid_xchess_license
// RUN: aie-opt %s %tosa-to-linalg% | aie-opt %linalg-to-vector-v32% --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED
// Cycle count: 86

// Test input: applies tosa.bitwise_not to a 1024-element i16 tensor and
// returns the result tensor of the same shape and element type.
func.func @dut(%arg0: tensor<1024xi16>) -> (tensor<1024xi16>) {
%0 = "tosa.bitwise_not"(%arg0) : (tensor<1024xi16>) -> tensor<1024xi16>
return %0 : tensor<1024xi16>
}
56 changes: 56 additions & 0 deletions test/Integration/Dialect/TOSA/i16_bitwise_not/testbench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Kernel under test (defined in dut.cc) and the scalar reference defined at
// the bottom of this file.
void dut(int16_t *restrict in0, int16_t *restrict out0);
void dut_ref(int16_t *in0, int16_t *out0);

// Statically allocated, 32-byte aligned buffers: kernel input, kernel output,
// and reference output. Sizes come from defines.h.
alignas(32) int16_t g_in0[IN0_SIZE];
alignas(32) int16_t g_out0[OUT0_SIZE];
alignas(32) int16_t g_out0Ref[OUT0_SIZE];

// Testbench entry point: generates input, runs and times dut() and dut_ref(),
// dumps inputs/outputs under DATA_DIR, compares the two outputs, and prints
// "TEST PASSED" or "TEST FAILED". Returns 0 on a match and 1 otherwise.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so every run uses the same input data.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_integer<int16_t>(); });

writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");

// Time the kernel under test; the cycle-counter reads sit between two
// chess_memory_fence() calls.
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
// Time the scalar reference. Unlike the kernel measurement above, no fence is
// issued immediately before this first counter read.
cyclesBegin = chess_cycle_count();
dut_ref(g_in0, g_out0Ref);
cyclesEnd = chess_cycle_count();
chess_memory_fence();
cycleCount = (int)(cyclesEnd - cyclesBegin);
// NOTE(review): this reports to the same cycle_count.txt path as the kernel
// measurement; whether it appends or overwrites depends on reportCycleCount
// -- confirm.
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
// The three trailing zeros are presumably tolerance settings, i.e. an exact
// match is required -- confirm against checkData in ../common/testbench.h.
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 0, 0);

if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: writes the bitwise complement of each of the first
// OUT0_SIZE input elements to the matching output element.
void dut_ref(int16_t *in0, int16_t *out0) {
  std::transform(in0, in0 + OUT0_SIZE, out0,
                 [](int16_t value) { return static_cast<int16_t>(~value); });
}
3 changes: 3 additions & 0 deletions test/Integration/Dialect/TOSA/i32_bitwise_not/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
constexpr unsigned const IN0_SIZE = 1024;
constexpr unsigned const OUT0_SIZE = 1024;
12 changes: 12 additions & 0 deletions test/Integration/Dialect/TOSA/i32_bitwise_not/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Bitwise-not kernel for int32 data: reads from v1, writes ~v1 to v2.
// NOTE(review): this looks like the aievec-to-cpp output for the
// tosa.bitwise_not test in this directory -- confirm before hand-editing.
void dut(int32_t *restrict v1, int32_t *restrict v2) {
// Loop bounds: start 0, end 1024 elements, step 16 elements per iteration.
size_t v3 = 0;
size_t v4 = 1024;
size_t v5 = 16;
// 1024 / 16 = 64 iterations; chess_loop_range(64, 64) carries the same number
// (presumably the min/max trip count hint for the chess compiler -- confirm).
for (size_t v6 = v3; v6 < v4; v6 += v5)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
// Load one v16int32 vector from the input at element offset v6.
v16int32 v7 = *(v16int32 *)(v1 + v6);
// Bitwise negation of the whole vector.
v16int32 v8 = bneg(v7);
// Store the result at the same element offset in the output.
*(v16int32 *)(v2 + v6) = v8;
}
return;
}
13 changes: 13 additions & 0 deletions test/Integration/Dialect/TOSA/i32_bitwise_not/i32_bitwise_not.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// REQUIRES: valid_xchess_license
// RUN: aie-opt %s %tosa-to-linalg% | aie-opt %linalg-to-vector-v16% --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED
// Cycle count: 150

// Test input: applies tosa.bitwise_not to a 1024-element i32 tensor and
// returns the result tensor of the same shape and element type.
func.func @dut(%arg0: tensor<1024xi32>) -> (tensor<1024xi32>) {
%0 = "tosa.bitwise_not"(%arg0) : (tensor<1024xi32>) -> tensor<1024xi32>
return %0 : tensor<1024xi32>
}
56 changes: 56 additions & 0 deletions test/Integration/Dialect/TOSA/i32_bitwise_not/testbench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Kernel under test (defined in dut.cc) and the scalar reference defined at
// the bottom of this file.
void dut(int32_t *restrict in0, int32_t *restrict out0);
void dut_ref(int32_t *in0, int32_t *out0);

// Statically allocated, 32-byte aligned buffers: kernel input, kernel output,
// and reference output. Sizes come from defines.h.
alignas(32) int32_t g_in0[IN0_SIZE];
alignas(32) int32_t g_out0[OUT0_SIZE];
alignas(32) int32_t g_out0Ref[OUT0_SIZE];

// Testbench entry point: generates input, runs and times dut() and dut_ref(),
// dumps inputs/outputs under DATA_DIR, compares the two outputs, and prints
// "TEST PASSED" or "TEST FAILED". Returns 0 on a match and 1 otherwise.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so every run uses the same input data.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_integer<int32_t>(); });

writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");

// Time the kernel under test; the cycle-counter reads sit between two
// chess_memory_fence() calls.
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
// Time the scalar reference. Unlike the kernel measurement above, no fence is
// issued immediately before this first counter read.
cyclesBegin = chess_cycle_count();
dut_ref(g_in0, g_out0Ref);
cyclesEnd = chess_cycle_count();
chess_memory_fence();
cycleCount = (int)(cyclesEnd - cyclesBegin);
// NOTE(review): this reports to the same cycle_count.txt path as the kernel
// measurement; whether it appends or overwrites depends on reportCycleCount
// -- confirm.
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
// The three trailing zeros are presumably tolerance settings, i.e. an exact
// match is required -- confirm against checkData in ../common/testbench.h.
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 0, 0);

if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: writes the bitwise complement of each of the first
// OUT0_SIZE input elements to the matching output element.
void dut_ref(int32_t *in0, int32_t *out0) {
  std::transform(in0, in0 + OUT0_SIZE, out0,
                 [](int32_t value) { return static_cast<int32_t>(~value); });
}
3 changes: 3 additions & 0 deletions test/Integration/Dialect/TOSA/i8_bitwise_not/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
constexpr unsigned const IN0_SIZE = 1024;
constexpr unsigned const OUT0_SIZE = 1024;
12 changes: 12 additions & 0 deletions test/Integration/Dialect/TOSA/i8_bitwise_not/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Bitwise-not kernel for int8 data: reads from v1, writes ~v1 to v2.
// NOTE(review): this looks like the aievec-to-cpp output for the
// tosa.bitwise_not test in this directory -- confirm before hand-editing.
void dut(int8_t *restrict v1, int8_t *restrict v2) {
// Loop bounds: start 0, end 1024 elements, step 64 elements per iteration.
size_t v3 = 0;
size_t v4 = 1024;
size_t v5 = 64;
// 1024 / 64 = 16 iterations; chess_loop_range(16, 16) carries the same number
// (presumably the min/max trip count hint for the chess compiler -- confirm).
for (size_t v6 = v3; v6 < v4; v6 += v5)
chess_prepare_for_pipelining chess_loop_range(16, 16) {
// Load one v64int8 vector from the input at element offset v6.
v64int8 v7 = *(v64int8 *)(v1 + v6);
// Bitwise negation of the whole vector.
v64int8 v8 = bneg(v7);
// Store the result at the same element offset in the output.
*(v64int8 *)(v2 + v6) = v8;
}
return;
}
13 changes: 13 additions & 0 deletions test/Integration/Dialect/TOSA/i8_bitwise_not/i8_bitwise_not.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// REQUIRES: valid_xchess_license
// RUN: aie-opt %s %tosa-to-linalg% | aie-opt %linalg-to-vector-v64% --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED
// Cycle count: 54

// Test input: applies tosa.bitwise_not to a 1024-element i8 tensor and
// returns the result tensor of the same shape and element type.
func.func @dut(%arg0: tensor<1024xi8>) -> (tensor<1024xi8>) {
%0 = "tosa.bitwise_not"(%arg0) : (tensor<1024xi8>) -> tensor<1024xi8>
return %0 : tensor<1024xi8>
}
56 changes: 56 additions & 0 deletions test/Integration/Dialect/TOSA/i8_bitwise_not/testbench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Kernel under test (defined in dut.cc) and the scalar reference defined at
// the bottom of this file.
void dut(int8_t *restrict in0, int8_t *restrict out0);
void dut_ref(int8_t *in0, int8_t *out0);

// Statically allocated, 32-byte aligned buffers: kernel input, kernel output,
// and reference output. Sizes come from defines.h.
alignas(32) int8_t g_in0[IN0_SIZE];
alignas(32) int8_t g_out0[OUT0_SIZE];
alignas(32) int8_t g_out0Ref[OUT0_SIZE];

// Testbench entry point: generates input, runs and times dut() and dut_ref(),
// dumps inputs/outputs under DATA_DIR, compares the two outputs, and prints
// "TEST PASSED" or "TEST FAILED". Returns 0 on a match and 1 otherwise.
int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
// Fixed seed so every run uses the same input data.
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_integer<int8_t>(); });

writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");

// Time the kernel under test; the cycle-counter reads sit between two
// chess_memory_fence() calls.
chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
// Time the scalar reference. Unlike the kernel measurement above, no fence is
// issued immediately before this first counter read.
cyclesBegin = chess_cycle_count();
dut_ref(g_in0, g_out0Ref);
cyclesEnd = chess_cycle_count();
chess_memory_fence();
cycleCount = (int)(cyclesEnd - cyclesBegin);
// NOTE(review): this reports to the same cycle_count.txt path as the kernel
// measurement; whether it appends or overwrites depends on reportCycleCount
// -- confirm.
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
// The three trailing zeros are presumably tolerance settings, i.e. an exact
// match is required -- confirm against checkData in ../common/testbench.h.
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 0, 0);

if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference: writes the bitwise complement of each of the first
// OUT0_SIZE input elements to the matching output element.
void dut_ref(int8_t *in0, int8_t *out0) {
  std::transform(in0, in0 + OUT0_SIZE, out0,
                 [](int8_t value) { return static_cast<int8_t>(~value); });
}
1 change: 0 additions & 1 deletion test/unit_tests/aievec_tests/bf16_erf_v16/bf16_erf.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
// XFAIL: *
// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs-from-loops allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir
Expand Down
1 change: 0 additions & 1 deletion test/unit_tests/aievec_tests/bf16_erf_v32/bf16_erf.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
// XFAIL: *
// REQUIRES: valid_xchess_license
// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs-from-loops allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=32" -o affine.mlir
Expand Down

0 comments on commit 8bf4969

Please sign in to comment.