diff --git a/proposals/0035-linalg-matrix.md b/proposals/0035-linalg-matrix.md index f7700149..2bd7cd55 100644 --- a/proposals/0035-linalg-matrix.md +++ b/proposals/0035-linalg-matrix.md @@ -57,6 +57,18 @@ template class Matrix { using ElementType = typename __detail::ComponentTypeTraits::Type; + // If this isn't a native scalar, we have an 8-bit type, so we have 4 elements + // packed in each scalar value. + static const uint ElementsPerScalar = + __detail::ComponentTypeTraits::IsNativeScalar ? 1 : 4; + // Computes the number of scalars actually stored in the matrix M dimension + // accounting for packing. + static const uint MScalars = + (M + (ElementsPerScalar - 1)) / ElementsPerScalar; + // Computes the number of scalars actually stored in the matrix N dimension + // accounting for packing. + static const uint NScalars = + (N + (ElementsPerScalar - 1)) / ElementsPerScalar; template Matrix cast(); @@ -90,6 +102,18 @@ class Matrix { static typename hlsl::enable_if::value, Matrix>::type Load(/*groupshared*/ T Arr[], uint StartIdx, uint Stride, bool ColMajor); + template + typename hlsl::enable_if::type + FromThreadVectors(vector); + + template + typename hlsl::enable_if::type + FromThreadVectors(vector); + void Store(RWByteAddressBuffer Res, uint StartOffset, uint Stride, bool ColMajor, uint Align = sizeof(ElementType)); @@ -97,18 +121,18 @@ class Matrix { typename hlsl::enable_if::value, void>::type Store(/*groupshared*/ T Arr[], uint StartIdx, uint Stride, bool ColMajor); - // Row accesses - vector GetRow(uint Index); - void SetRow(vector V, uint Index); + // Extract the thread-specific vector. + template + typename hlsl::enable_if>::type + GetThreadVector(uint Index = 0); - // Element access - typename hlsl::enable_if< - __detail::ComponentTypeTraits::IsNativeScalar, - ElementType>::type - Get(uint2 Index); - typename hlsl::enable_if< - __detail::ComponentTypeTraits::IsNativeScalar, void>::type - Set(ElementType V, uint2 Index); + template + typename hlsl::enable_if>::type + GetThreadVector(uint Index = 0); template @@ -133,6 +157,8 @@ class Matrix { OuterProductAccumulate(const vector, const vector); }; +MatrixUse AccumulatorLayout(); + template Matrix @@ -177,12 +203,11 @@ void WaveMatrixExample() { using MatrixAccumTy = Matrix; using MatrixAccum32Ty = Matrix; + MatrixUse::Accumulator, MatrixScope::Wave>; + + MatrixATy MatA = MatrixATy::Load(B, 0, 8 * 4, false); + MatrixBTy MatB = MatrixBTy::Load(B, 0, 32 * 4, false); - MatrixATy MatA = Matrix::Load(B, 0, 8 * 4, false); - MatrixBTy MatB = Matrix::Load(B, 0, 32 * 4, false); MatrixAccumTy Accum = Multiply(MatA, MatB); MatrixAccum32Ty Accum32 = Multiply(MatA, MatB); } @@ -262,6 +287,26 @@ Throughout this document a matrix may be described as having a scope as specified by the `Scope` parameter (e.g. a matrix with `Scope == Thread` is a _matrix with thread scope_). +Matrix storage is always opaque, the `Scope` does not directly restrict how the +matrix is stored, it merely denotes allowed scopes of allowed data divergence. +A matrix with thread scope must behave as if each thread has a unique copy of +the matrix. An implementation may coalesce identical matrices across threads. + +#### Matrix Storage + +In HLSL, matrix objects are intangible objects so they do not have defined size +or memory layout. When in use, implementations are expected to distribute the +storage of matrices across the thread-local storage for all threads in a SIMD +unit. An implementation may also utilize caches or other memory regions as +appropriate. 
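As a worked illustration of the `ElementsPerScalar` / `MScalars` / `NScalars` bookkeeping introduced in the class definition above, the sketch below uses an assumed 8-bit packed component type and an illustrative dimension; it is not part of the proposed API surface.

```c++
// Worked example of the packing arithmetic (values are illustrative).
// For a packed 8-bit component type, four elements share one 32-bit scalar.
static const uint ElementsPerScalar = 4; // non-native 8-bit component type
// A matrix with M == 17 therefore needs ceil(17 / 4) == 5 scalars in that
// dimension; rounding up is what the (ElementsPerScalar - 1) term provides.
static const uint MScalars =
    (17 + (ElementsPerScalar - 1)) / ElementsPerScalar; // == 5
// A native scalar type (ElementsPerScalar == 1) degenerates to MScalars == M.
```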
At the DXIL level a matrix is represented as a handle object. + +An A matrix is a collection of per-thread vectors representing matrix rows, +while a B matrix is a collection of per-thread vectors representing matrix +columns. + +An Accumulator matrix may be either an A matrix, or a B matrix, and it varies by +hardware implementation. + ### HLSL API Documentation #### HLSL Enumerations @@ -487,6 +532,10 @@ to: Matrix::Splat(WaveReadLaneFirst(Val)); ``` +This operation may be called in divergent control flow when creating a thread +scope matrix, and must be called in uniform control flow when creating a wave +scope matrix. + #### Matrix::Load ```c++ @@ -513,6 +562,36 @@ expected target data format. When read from `groupshared` memory, the data may be in any arithmetic or packed data type. If the type mismatches the target data type of the matrix a data conversion is applied on load. +This operation may be called in divergent control flow when loading a thread +scope matrix, and must be called in uniform control flow when loading a wave +scope matrix. + +#### Matrix::FromThreadVectors + +```c++ +template +typename hlsl::enable_if::type + FromThreadVectors(vector); + +template +typename hlsl::enable_if::type + FromThreadVectors(vector); +``` + +Produces a matrix from per-thread vectors. An A matrix is produced from +per-thread column vectors, while a B matrix is produced from per-thread row +vectors. The `FromThreadVectors` construction method is not available for +accumulator matrices which vary by hardware implementation. + +When creating an A matrix, the N dimension must be less than or equal to the +wave size. When creating a B matrix, the M dimension must be less than or equal +to the wave size. Threads outside the matrix size are discarded. + +Must be called from wave-uniform control flow. #### Matrix::Store @@ -534,53 +613,41 @@ matrix object. When storing to `groupshared` memory, the matrix component data is converted to the target arithmetic or packed data type if the data types do not match. -#### Matrix::GetRow(uint) +This operation may be called in divergent control flow when storing a thread +scope matrix, and must be called in uniform control flow when storing a wave +scope matrix. -```c++ -vector Matrix::GetRow(uint Index); -``` - -Returns a row vector of the matrix as a vector of the underlying HLSL native -element type. If `Index` is out of range for the matrix size the result is a `0` -filled vector. - - -#### Matrix::SetRow(vector, uint) +#### Matrix::GetThreadVector(uint) ```c++ -void Matrix::SetRow(vector V, uint Index); +template +typename hlsl::enable_if>::type +GetThreadVector(uint Index = 0); + +template +typename hlsl::enable_if>::type +GetThreadVector(uint Index = 0); ``` -Sets the specified matrix row to the value in the vector V. If the matrix scope -is `Wave`, this behaves as if called as `SetRow(WaveReadLaneFirst(V), Index)`. -If the `Index` is out of range of the matrix, this is a no-op. +Returns the underlying vector for the associated thread in the matrix. The +optional index is used when the matrix `K` dimension is larger than the wave +size to compute the starting offset (i.e. `(Index * WaveSize) + ThreadID`). -#### Matrix::Get(uint2) +An A matrix produces a vector containing a column of a matrix, while a B matrix +produces a vector containing a row of the matrix. This method may not be used +on an Accumulator matrix because the matrix layout varies by hardware +implementation. 
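The indexing rule `(Index * WaveSize) + ThreadID` can be made concrete with a small hedged sketch; the wave size, matrix dimension, and the intrinsics used to obtain the lane index are illustrative and not mandated by this proposal.

```c++
// Illustrative only: which row/column a lane's vector corresponds to when the
// relevant matrix dimension is larger than the wave size.
uint WaveSize = WaveGetLaneCount(); // e.g. 32 lanes per wave
uint Lane = WaveGetLaneIndex();     // this thread's lane, 0..WaveSize-1
// For a matrix dimension of 64 on a 32-lane wave, GetThreadVector(0) returns
// the vector for row/column Lane, and GetThreadVector(1) returns the one for
// (1 * WaveSize) + Lane, i.e. rows/columns 32..63.
uint FirstVector  = (0 * WaveSize) + Lane;
uint SecondVector = (1 * WaveSize) + Lane;
```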
-```c++ -std::enable_if_t<__detail::ComponentTypeTraits::IsNativeScalar, - ElementType> -Matrix::Get(uint2 Index); -``` +Threads which correspond to threads outside the matrix size will return a vector +with all elements zero initialized. -Accesses a specific component of the matrix using two-dimensional indexing. This -method is only available if the component type has has native scalar support in -HLSL. If the `Index` parameter is out-of range for the matrix the result is `0` -casted to `ElementType`. +Must be called from wave-uniform control flow. -#### Matrix::Set(ElementType, uint2) - -```c++ -std::enable_if_t<__detail::ComponentTypeTraits::IsNativeScalar, - void> -Matrix::Set(ElementType V, uint2 Index); -``` - -Sets a specified element of the matrix to the provided value. If the matrix -scope is `Wave`, this behaves as if called as `Set(WaveReadLaneFirst(V), Index)`. -If the `Index` is out of range, this is a no-op. - -#### Matrix::MultiplyAccumuate(Matrix, Matrix) +#### Matrix::MultiplyAccumulate(Matrix, Matrix) ```c++ template , ; Vector + ) +``` + +Populates a matrix from per-thread vectors. For an A matrix the NUM corresponds +to the M dimension while for a B matrix it corresponds to the N dimension. The +NUM must match the matrix corresponding dimension, unless the element is a +packed data type in which case it must be the number of 32-bit unsigned integers +used to store M elements. This operation may not be used on Accumulator +matrices. + +For an A matrix the N dimension must be less than or equal to the WaveSize. For +a B matrix the M dimension must be less than or equal to the WaveSize. Values +from additional threads are discarded. + +The result of this operation is undefined if called from non-uniform control +flow. + ```llvm declare void @dx.op.matrixStoreToDescriptor( immarg i32, ; opcode @@ -862,6 +968,39 @@ Store a matrix to groupshared memory. Data conversions between opaque matrices and groupshared memory are defined in the [Conversions on groupshared memory](#conversions-on-groupshared-memory) section below. +```llvm +declare < NUM x [Ty]> @dx.op.matrixExtractToThreads.v[NUM][TY]( + immarg i32, ; opcode + %dx.types.MatrixRef *, ; matrix + i32, ; Index + ) +``` + +Extracts per-thread vectors from a matrix. For an A matrix the NUM corresponds +to the M dimension while for a B matrix it corresponds to the N dimension. The +NUM must match the matrix corresponding dimension, unless the element is a +packed data type in which case it must be the number of 32-bit unsigned integers +used to store M elements. This operation may not be used on Accumulator +matrices. + +The Index argument specifies the starting row or column as a multiple of the +wave size. The resulting vector corresponds to the row or column numbered +`(Index * WaveSize) + ThreadID`. + +Must be called from wave-uniform control flow. + +```llvm +declare i32 @dx.op.matrixQueryAccumulatorLayout.v[NUM][TY]( + immarg i32, ; opcode + ) +``` + +This opcode must be evaluated at driver compile time and replaced with the +appropriate architecture specific value denoting the layout of accumulator +matrices. A return value of `0` will denote that accumulator matrices are `A` +layout while a return value of `1` will denote that accumulator matrices are `B` +layout. + ```llvm declare void @dx.op.matrixOp( immarg i32 ; opcode @@ -885,6 +1024,8 @@ Validation rules will enforce that: * Matrix C's dimensions shall be M x N * The element types are compatible +Must be called from wave-uniform control flow. 
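To connect the validation rules above back to the HLSL surface, here is a minimal, hedged sketch of a tiled multiply-accumulate reusing the `MatrixATy`, `MatrixBTy`, and `MatrixAccumTy` aliases from the earlier `WaveMatrixExample`. The buffer names, offsets, strides, and tile count are placeholders, and `MultiplyAccumulate` is assumed to be a member of the accumulator matrix, as its signature suggests.

```c++
// Sketch only: accumulate C += A_k * B_k over a number of K tiles.
// B and Out are assumed buffer resources; NumTiles, ATileBytes, and
// BTileBytes are placeholder values.
MatrixAccumTy Accum = MatrixAccumTy::Splat(0.0);
for (uint Tile = 0; Tile < NumTiles; ++Tile) {
  // These calls involve wave-scope matrices, so they must execute in
  // wave-uniform control flow.
  MatrixATy MatA = MatrixATy::Load(B, Tile * ATileBytes, 8 * 4, false);
  MatrixBTy MatB = MatrixBTy::Load(B, Tile * BTileBytes, 32 * 4, false);
  Accum.MultiplyAccumulate(MatA, MatB);
}
Accum.Store(Out, 0, 8 * 4, false);
```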
+ ``` llvm declare <[NUMo] x [TYo]> @dx.op.matvecmul.v[NUMo][TYo].v[NUMi][TYi]( immarg i32 ; opcode @@ -915,35 +1056,12 @@ a bias vector added to the result. > Note for this operation the matrix can be of any scope. -```llvm -declare <[NUMo] x [TYo]> @dx.op.matrixLoadRow.v[NUMo][Tyo]( - immarg i32 ; opcode - %dx.types.MatrixRef *, ; matrix A - i32 ; row index - ) -``` - -Loads a row-vector from a matrix. Out of bounds reads return `0`. - -```llvm -declare void @dx.op.matrixStoreRow.v[NUMi][Tyi]( - immarg i32 ; opcode - %dx.types.MatrixRef *, ; matrix A - i32, ; index - <[NUMi] x [Tyi]> ; row vector - ) -``` - -Stores a row-vector to a matrix. Out of bounds writes no-op. - ### Conversions ## Appendix 1: Outstanding Questions * What is the exhaustive list of data types we need to support? * What data type conversions do we need to support? -* Do we need load and store per-element accessors or is row enough? -* Should we consider get/set column accessors? * Support for other number formats that aren't natively supported by HLSL? * Do we need to specify a source/destination format for the data in the load and store operations that operate on descriptors or should we assume @@ -951,7 +1069,7 @@ Stores a row-vector to a matrix. Out of bounds writes no-op. ## Appendix 2: HLSL Header -[Compiler Explorer](https://godbolt.org/z/jbq7eheT1) +[Compiler Explorer](https://godbolt.org/z/79bv43raj) > Note: this mostly works with Clang, but has some issues to work out still. ```cpp @@ -1063,6 +1181,18 @@ template class Matrix { using ElementType = typename __detail::ComponentTypeTraits::Type; + // If this isn't a native scalar, we have an 8-bit type, so we have 4 elements + // packed in each scalar value. + static const uint ElementsPerScalar = + __detail::ComponentTypeTraits::IsNativeScalar ? 1 : 4; + // Computes the number of scalars actually stored in the matrix M dimension + // accounting for packing. + static const uint MScalars = + (M + (ElementsPerScalar - 1)) / ElementsPerScalar; + // Computes the number of scalars actually stored in the matrix N dimension + // accounting for packing. + static const uint NScalars = + (N + (ElementsPerScalar - 1)) / ElementsPerScalar; template Matrix cast(); @@ -1096,6 +1226,18 @@ class Matrix { static typename hlsl::enable_if::value, Matrix>::type Load(/*groupshared*/ T Arr[], uint StartIdx, uint Stride, bool ColMajor); + template + typename hlsl::enable_if::type + FromThreadVectors(vector); + + template + typename hlsl::enable_if::type + FromThreadVectors(vector); + void Store(RWByteAddressBuffer Res, uint StartOffset, uint Stride, bool ColMajor, uint Align = sizeof(ElementType)); @@ -1103,18 +1245,18 @@ class Matrix { typename hlsl::enable_if::value, void>::type Store(/*groupshared*/ T Arr[], uint StartIdx, uint Stride, bool ColMajor); - // Row accesses - vector GetRow(uint Index); - void SetRow(vector V, uint Index); + // Extract the thread-specific vector. + template + typename hlsl::enable_if>::type + GetThreadVector(uint Index = 0); - // Element access - typename hlsl::enable_if< - __detail::ComponentTypeTraits::IsNativeScalar, - ElementType>::type - Get(uint2 Index); - typename hlsl::enable_if< - __detail::ComponentTypeTraits::IsNativeScalar, void>::type - Set(ElementType V, uint2 Index); + template + typename hlsl::enable_if>::type + GetThreadVector(uint Index = 0); template @@ -1139,6 +1281,8 @@ class Matrix { OuterProductAccumulate(const vector, const vector); }; +MatrixUse AccumulatorLayout(); + template Matrix