Skip to content

Commit

Permalink
Add comments to the tests.
Browse files Browse the repository at this point in the history
Signed-off-by: hanhanW <[email protected]>
  • Loading branch information
hanhanW committed Jan 8, 2025
1 parent 92a9908 commit 931032c
Show file tree
Hide file tree
Showing 2 changed files with 229 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
Expand Down Expand Up @@ -52,6 +53,175 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
return results;
}

/// Returns the affinities of the `dispatchOp`'s resource operands. An empty
/// array attribute indicates that the resource operand affinity is not found.
/// Usually, it happens when it fails on affinity analysis.
/// Note that the size of the result might not equal to the number of resource
/// operands. If a resource operand type is not AffinityType, it is skipped.
/// Returns the affinities of the `dispatchOp`'s resource operands. An empty
/// array attribute indicates that the resource operand affinity is not found.
/// Usually, it happens when it fails on affinity analysis.
/// Note that the size of the result might not equal to the number of resource
/// operands. If a resource operand type is not AffinityType, it is skipped.
static SmallVector<Attribute>
getResourceOperandsAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
                              IREE::Stream::AsyncDispatchOp dispatchOp) {
  Builder builder(dispatchOp.getContext());
  ArrayAttr unknownAffinities = builder.getArrayAttr({});
  SmallVector<Attribute> affinityAttrs;
  for (auto resource : dispatchOp.getResourceOperands()) {
    // Only operands carrying an affinity type participate in the analysis;
    // all other operands are skipped entirely (not even a placeholder).
    if (!isa<IREE::Stream::AffinityTypeInterface>(resource.getType())) {
      continue;
    }
    SmallVector<IREE::Stream::AffinityAttr> resolved;
    if (affinityAnalysis.tryLookupResourceAffinity(resource, resolved)) {
      affinityAttrs.push_back(
          builder.getArrayAttr(llvm::to_vector_of<Attribute>(resolved)));
    } else {
      // Analysis failure is recorded as an empty array attribute.
      affinityAttrs.push_back(unknownAffinities);
    }
  }
  return affinityAttrs;
}

/// Duplicates stream.executables based on the affinity analysis of
/// stream.async.dispatch ops. Some executables can be launched by different
/// devices. It can produce wrong codegen artifacts when bindings types are
/// encoded (i.e., the tensor type has an encoding attribute). Because they can
/// result in different layouts, especially when multi-device is involved. E.g.,
/// say that device_a and device_b interpret a tensor type with encodings in
/// different layouts, and there is an executable that can be launch with
/// resources from either device_a or device_b. It is confusing what the input
/// layouts for the executable because there are two possibilities. In this
/// case, we have to duplicate the executable with updated encoding, and modify
/// the dispatch to launch proper executable based on device analysis.
/// Duplicates stream.executables based on the affinity analysis of
/// stream.async.dispatch ops. Some executables can be launched by different
/// devices. It can produce wrong codegen artifacts when bindings types are
/// encoded (i.e., the tensor type has an encoding attribute). Because they can
/// result in different layouts, especially when multi-device is involved. E.g.,
/// say that device_a and device_b interpret a tensor type with encodings in
/// different layouts, and there is an executable that can be launched with
/// resources from either device_a or device_b. It is ambiguous what the input
/// layouts for the executable are because there are two possibilities. In this
/// case, we have to duplicate the executable with updated encoding, and modify
/// the dispatch to launch proper executable based on device analysis.
static LogicalResult duplicateExecutablesPerAffinityVariant(
    ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp,
    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
  MLIRContext *ctx = moduleOp.getContext();
  IRRewriter rewriter(ctx);

  // 1. Gather per-export [execution affinity -> [resource affinities]] map.
  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
  if (failed(affinityAnalysis.run())) {
    return moduleOp.emitError("failed on running affinity analysis");
  }
  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
  funcOp.walk(
      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });

  // export -> [affinity -> array per resource of affinities PVS].
  DenseMap<IREE::Stream::ExecutableExportOp,
           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
      exportToDispatchSites;

  // Per-dispatch resource operand affinities, cached so step 3 does not have
  // to re-run the (potentially failing) lookup.
  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
      resourceAffinities;
  for (auto dispatchOp : candidates) {
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return dispatchOp.emitError("failed on execution affinity lookup");
    }
    assert(execAffinities.size() == 1 &&
           "We should only have a single execution "
           "affinity when running the pass.");

    SmallVector<Attribute> operandAffinityAttrs =
        getResourceOperandsAffinities(affinityAnalysis, dispatchOp);
    resourceAffinities[dispatchOp] = operandAffinityAttrs;

    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      exportToDispatchSites[exportOp].insert(std::make_pair(
          execAffinities[0], rewriter.getArrayAttr(operandAffinityAttrs)));
    });
  }

  LLVM_DEBUG({
    llvm::dbgs() << "Dump of exportToDispatchSites\n";
    for (auto [exportOp, affinities] : exportToDispatchSites) {
      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
      // NOTE: the binding is named `operandAffinities` to avoid shadowing the
      // `resourceAffinities` map declared above.
      for (auto [execAffinity, operandAffinities] : affinities) {
        llvm::dbgs() << "    execution affinity: " << execAffinity << "\n";
        llvm::dbgs() << "    resource affinities: " << operandAffinities
                     << "\n";
      }
    }
  });

  // 2. Duplicate executables for each unique resource affinities.

  // Mapping from [execution affinity, resource operands affinities, export] to
  // the executable op.
  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
                                      IREE::Stream::ExecutableExportOp>;
  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
      dispatchSiteToExecutableOp;
  for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) {
    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
    // No need to duplicate the executable if all the uses have the same
    // affinities.
    // TODO(hanchung): Do not duplicate the executables if bindings are not
    // encoded. I.e., all the tensor types do not have encodings.
    if (execAndResourceAffinities.size() == 1) {
      auto [execAffinity, resourceAffinity] = execAndResourceAffinities[0];
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinity, exportOp)] = executableOp;
      continue;
    }

    // The first variant (dupId == -1) reuses the original executable; each
    // subsequent variant is cloned with a "_dupN" suffix on its symbol name.
    int64_t dupId = -1;
    for (auto [execAffinity, resourceAffinity] : execAndResourceAffinities) {
      rewriter.setInsertionPointAfter(executableOp);
      IREE::Stream::ExecutableOp dupOp = executableOp;
      if (dupId != -1) {
        auto symName = std::string(executableOp.getSymName());
        symName += "_dup" + std::to_string(dupId);
        dupOp = rewriter.cloneWithoutRegions(executableOp);
        rewriter.modifyOpInPlace(dupOp, [&] {
          dupOp.setSymName(symName);
          IRMapping mapping;
          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
        });
      }
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinity, exportOp)] = dupOp;
      dupId++;
    }
  }

  // 3. Update dispatch sites, i.e., point dispatch entry points to
  // corresponding cloned executables.
  for (auto dispatchOp : candidates) {
    SmallVector<Attribute> newEntryPoints;
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    // The lookup must run unconditionally (not inside `assert`): with NDEBUG
    // the assert expression is compiled away, which would leave
    // `execAffinities` empty and make `execAffinities[0]` below UB. The
    // lookup already succeeded in step 1, so only the result is asserted.
    [[maybe_unused]] bool hasExecAffinity =
        affinityAnalysis.tryLookupExecutionAffinity(dispatchOp, execAffinities);
    assert(hasExecAffinity && "lookup must succeed; it was checked in step 1");
    assert(execAffinities.size() == 1);
    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      auto info = DispatchSiteInfo(
          execAffinities[0], rewriter.getArrayAttr(operandAttrs), exportOp);
      assert(dispatchSiteToExecutableOp.count(info));

      // Re-root the entry point at the (possibly duplicated) executable while
      // keeping the nested references (export symbol) intact.
      auto executableOp = dispatchSiteToExecutableOp[info];
      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
                                           SymbolTable::getSymbolAttrName()),
                                       entryPoint.getNestedReferences());
      newEntryPoints.push_back(newSym);
    });

    rewriter.modifyOpInPlace(dispatchOp, [&] {
      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
    });
  }

  // TODO(hanchung): Update encodings in executables.

  return success();
}

// TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
static RankedTensorType cloneWithEncoding(RankedTensorType type,
Attribute encodingAttr) {
Expand Down Expand Up @@ -149,6 +319,7 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

SymbolTable symbolTable(moduleOp);
llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
executableOps[executableOp.getName()] = executableOp;
Expand All @@ -164,7 +335,11 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

// TODO(hanchung): Duplicate executables and update dispatch ops.
if (failed(duplicateExecutablesPerAffinityVariant(
moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
funcOp.emitError("failed on executable duplication");
return signalPassFailure();
}
}
}
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,69 @@
// RUN: iree-opt --split-input-file --iree-stream-specialize-encodings %s | FileCheck %s

#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {encoding_layout = #iree_cpu.vmvx_encoding_layout<>, ukernels = "all"}>
//------------------------------------------------------------------------------
// Stream ops that have TensorPhaseOp trait. This test suite tests that the
// encoding is updated that carries resolved layouts.
//------------------------------------------------------------------------------

#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {encoding_layout = #iree_cpu.vmvx_encoding_layout<>}>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32]>
module {
util.global private @device_a = #device_target_local_0_

util.func public @main(%d0: index, %d1: index) -> index {
util.func public @tensor_sizeof(%d0: index, %d1: index) -> index {
%size = stream.tensor.sizeof on(#hal.device.affinity<@device_a>) tensor<?x?xf32, #encoding>{%d0, %d1} : index
util.return %size : index
}
}
// CHECK: #[[EXECUTABLE:.+]] = #hal.executable.target<"vmvx",
// CHECK: #[[$ENCODING:.+]] = #iree_encoding.encoding
// CHECK-SAME: layouts = [#[[EXECUTABLE]]]
// CHECK-LABEL: util.func public @main
// CHECK-LABEL: util.func public @tensor_sizeof
// CHECK: %[[RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING]]>
// CHECK: return %[[RES]]

// -----

#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
util.global private @device_a = #device_target_local_0_
util.global private @device_b = #device_target_local_1_
stream.executable private @ex {
stream.executable.export public @dispatch
}
util.func public @multi_device(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
%0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
%1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
%2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
%4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
%6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
%7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
%result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
%8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
%9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
util.return %9 : !hal.buffer_view
}
}

// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
// CHECK: stream.executable private @[[$EX0:.+]] {
// CHECK: stream.executable private @[[$EX1:.+]] {
// CHECK-LABEL: util.func public @multi_device
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch

0 comments on commit 931032c

Please sign in to comment.