diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index a7db3f3d55f10..a502da947427a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -171,6 +171,37 @@ void joint_matrix_store(Group g, ext::oneapi::experimental::annotated_ptr dest, size_t stride); +// Overloads for offset store +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + ext::oneapi::experimental::annotated_ptr + base_dest, size_t row_index, size_t col_index, size_t stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + ext::oneapi::experimental::annotated_ptr + base_dest, size_t row_index, size_t col_index, size_t stride); + } // namespace sycl::ext::intel::experimental::matrix ``` @@ -244,19 +275,19 @@ supporting the out of bounds checked APIs that are defined in this section. In this section, we refer to the memory buffer where a `joint_matrix` is loaded from or stored to as the global matrix. This global matrix is also interpreted as a two-dimensional memory region as follows, where -`GlobalRows` is number of rows in the global matrix, `GlobalCols` is number of -columns in the global matrix, `Stride` is number of columns that include +`global_rows` is number of rows in the global matrix, `global_cols` is number of +columns in the global matrix, `stride` is number of columns that include the out of bounds data (depicted as x here). ``` - GlobalCols + global_cols <-----------> dddddddddddddxxx ^ - dddddddddddddxxx | GlobalRows + dddddddddddddxxx | global_rows dddddddddddddxxx v xxxxxxxxxxxxxxxx <--------------> - Stride + stride ``` In the diagram above, the global matrix has 13 columns and 3 @@ -293,15 +324,15 @@ checking, namely `joint_matrix_fill`, `joint_matrix_load`, and the global memory matrix, which is different from the APIs that do not do bounds checking. Those non-bounds-checking APIs take a pointer to the base of the joint matrix. -* The coordinates `RowIndex` and `ColIndex` into the global matrix to +* The coordinates `row_index` and `col_index` into the global matrix to calculate the pointer offset to load/store are given as separate arguments. * These variants take extra arguments to determine the global bounds -`GlobalRows` and `GlobalCols` of the global matrix. +`global_rows` and `global_cols` of the global matrix. To illustrate the out-of-bounds checking, consider the global matrix -shown above which has 13 columns and 3 rows (`GlobalRows=3` and -`GlobalCols=13`), where the joint matrix size is 8 columns by 2 rows defined as +shown above which has 13 columns and 3 rows (`global_rows=3` and +`global_cols=13`), where the joint matrix size is 8 columns by 2 rows defined as ``` joint_matrix sub_b; ``` @@ -311,14 +342,14 @@ both dimensions. This is shown below, where capital letters correspond to the elements that are accessed by this joint matrix load: ``` - GlobalCols + global_cols <-----------> dddddddddddddxxx ^ - dddddddddddddxxx | GlobalRows + dddddddddddddxxx | global_rows ddddddddDDDDDXXX v xxxxxxxxXXXXXXXX <--------------> - Stride + stride ``` If the joint matrix is loaded via `joint_matrix_load_checked` using @@ -335,8 +366,8 @@ namespace sycl::ext::intel::experimental::matrix { template void joint_matrix_fill_checked(Group g, joint_matrix &m, Tv v, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + Cols, Layout> &m, Tv v, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when std::is_same_v> template void joint_matrix_load_checked(Group g, joint_matrix &res, - multi_ptr base_src, size_t Stride, - layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + multi_ptr base_src, size_t stride, + layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -356,8 +387,8 @@ template void joint_matrix_load_checked(Group g, joint_matrix &res, - multi_ptr base_src, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_src, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); // Only available when std::is_same_v> template &res, ext::oneapi::experimental::annotated_ptr base_src, - size_t Stride, layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -375,55 +406,55 @@ template &res, ext::oneapi::experimental::annotated_ptr base_src, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, layout Layout, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, layout Layout, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); } // namespace sycl::ext::intel::experimental::matrix ``` @@ -445,12 +476,12 @@ the following queries to get these requirements: |Tells the required alignment (in bytes) of the base pointer for `joint_matrix_load_checked` and `joint_matrix_store_checked`. |`ext::intel::experimental::info::device::matrix_checked_rowindex_multiple_of`| -`size_t`|Returns a value, of which `RowIndex` must be multiple of; +`size_t`|Returns a value, of which `row_index` must be multiple of; where `T` is the element type of the matrix. When using the matrices with the machine learning types, `T` should be the element type (e.g. `precision::tf32`) not the storage type. |`ext::intel::experimental::info::device::matrix_checked_globalcols_multiple_of`| -`size_t` | Returns a value, of which `GlobalCols` must be multiple of; +`size_t` | Returns a value, of which `global_cols` must be multiple of; where `T` is the element type of the matrix. When using the matrices with the machine learning types, `T` should be the element type (e.g. `precision::tf32`) not the storage type. @@ -462,14 +493,19 @@ The checked APIs are currently available in devices with the architecture `architecture::intel_gpu_pvc`. The following restrictions apply to these checked APIs: +- The `stride` parameter has the following restrictions: + +* The value `stride * sizeof(T1)` must be a multiple of 8, and +* The value of `stride * sizeof(T1)` must not exceed `2^24^`. + - The base pointer must be 4 bytes aligned. -- For 8 bits data type, `RowIndex` must be a multiple of 4. For 16 bits -data type, `RowIndex` must be a multiple of 2. So `RowIndex` must be a +- For 8 bits data type, `row_index` must be a multiple of 4. For 16 bits +data type, `row_index` must be a multiple of 2. So `row_index` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). -- For 8 bits data type, `GlobalCols` must be a multiple of 4. For 16 bits -data type, `GlobalCols` must be a multiple of 2. So `GlobalCols` must be a +- For 8 bits data type, `global_cols` must be a multiple of 4. For 16 bits +data type, `global_cols` must be a multiple of 2. So `global_cols` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). === New Device Information Descriptor diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index fd3ae8527815a..8588fa9cb2ba3 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -326,6 +326,86 @@ of `sycl::multi_ptr`. The property list associated with the `annotated_ptr` argument represents the compile-time constant properties for cache control included in the SYCL extenion link:../../proposed/sycl_ext_intel_cache_controls.asciidoc[sycl_ext_intel_cache_controls] + +==== Offset Load +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + +// Only available when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + multi_ptr base_src, size_t row_index, + size_t col_index, size_t stride, layout Layout); + +// Only available when Layout != layout::dynamic +// and when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + multi_ptr base_src, size_t row_index, + size_t col_index size_t stride); + +// Only available when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + annotated_ptr base_src, size_t row_index, size_t + col_index, size_t stride, layout Layout); + +// Only available when Layout != layout::dynamic +// and when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + annotated_ptr base_src, size_t row_index, size_t + col_index, size_t stride); + +} // namespace sycl::ext::oneapi::experimental::matrix +``` + +These overloads of `joint_matrix_load` takes the pointer `base_src` to +designate the base pointer of the global memory matrix. The +coordinates `row_index` and `col_index` into the global matrix to +calculate the pointer offset to load/store are given as separate +arguments. + +==== Offset Store +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + +// T1 must be the same as T2 +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride, layout Layout); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + annotated_ptr base_dest, size_t row_index, size_t + col_index, size_t stride, layout Layout); + +} // namespace sycl::ext::oneapi::experimental::matrix +``` +These overloads of `joint_matrix_store` takes the pointer `base_dest` to +designate the base pointer of the global memory matrix. The +coordinates `row_index` and `col_index` into the global matrix to +calculate the pointer offset to load/store are given as separate +arguments. + ==== Multiply and Add ```c++ @@ -562,7 +642,7 @@ float *buf = malloc_shared(M*K, q); auto pBuf = address_space_cast(buf); -joint_matrix_load(sg, tA, pBuf + Offset, Stride); +joint_matrix_load(sg, tA, pBuf + Offset, stride); ``` ==== store @@ -576,7 +656,7 @@ float *buf = malloc_shared(M*K, q); auto pBuf = address_space_cast(buf); -joint_matrix_store(sg, tA, pBuf + Offset, Stride, layout::row_major); +joint_matrix_store(sg, tA, pBuf + Offset, stride, layout::row_major); ``` ==== fill @@ -979,7 +1059,7 @@ for (int i = 0; sizeof(combinations); i++) { } ``` -=== Appendix: Supported Combinations Per Hardware +=== Appendix: Supported Combinations and Restrictions Per Hardware The table below provides a list of the combinations that `joint_matrix` implementations support on each of Intel AMX and Intel XMX hardware. Note that these can be returned using @@ -1065,6 +1145,27 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` `architecture::intel_gpu_pvc` |====================== +===== Restrictions on `architecture::intel_gpu_pvc` + +- The `stride` parameter to `joint_matrix_load` and + `joint_matrix_store` has the following restrictions: + +* The value `stride * sizeof(T1)` must be a multiple of 8, and +* The value of `stride * sizeof(T1)` must not exceed `2^24^`. + +- The base pointer argument to `joint_matrix_load` and +`joint_matrix_store` must be 4 bytes aligned. + +- In the case of the offset overloads of `joint_matrix_load` and +`joint_matrix_store`, for 8 bits data type, `row_index` must be a +multiple of 4. For 16 bits data type, `row_index` must be a multiple +of 2. So `row_index` must be a multiple of 4 divided by size of the +element type (`4/sizeof(T)`). + +- If these restrictions are not satisfied, users can switch to slower +implementations of `joint_matrix_load` and `joint_matrix_store` by +setting the driver flag `IGC_JointMatrixLoadStoreOpt=1`. + ==== Nvidia Tensor Cores Supported Combinations The complete set of matrix data types and shapes that are supported by the `ext_oneapi_cuda` backend are represented in the following