From eab723a3b40326565dcf31a1b9ed143bd969ff56 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 24 Sep 2024 12:56:09 -0700 Subject: [PATCH 1/4] [SYCL][Docs][Joint matrix] Add overloads and restrictions for the offset load store --- .../sycl_ext_intel_matrix.asciidoc | 3 + .../sycl_ext_oneapi_matrix.asciidoc | 101 +++++++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index a7db3f3d55f10..6d1f80eab164d 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -462,6 +462,9 @@ The checked APIs are currently available in devices with the architecture `architecture::intel_gpu_pvc`. The following restrictions apply to these checked APIs: +- The `Stride` argument must be a multiple of 8 bytes. Also, `Stride` +should not exceed `2^24` bytes. + - The base pointer must be 4 bytes aligned. - For 8 bits data type, `RowIndex` must be a multiple of 4. For 16 bits diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index fd3ae8527815a..8caa5f18bc69f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -326,6 +326,86 @@ of `sycl::multi_ptr`. The property list associated with the `annotated_ptr` argument represents the compile-time constant properties for cache control included in the SYCL extenion link:../../proposed/sycl_ext_intel_cache_controls.asciidoc[sycl_ext_intel_cache_controls] + +==== Offset Load +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + +// Only available when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + multi_ptr base_src, size_t RowIndex, + size_t ColIndex, size_t Stride, layout Layout); + +// Only available when Layout != layout::dynamic +// and when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + multi_ptr base_src, size_t RowIndex, + size_t ColIndex size_t Stride); + +// Only available when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + annotated_ptr base_src, size_t RowIndex, size_t + ColIndex, size_t Stride, layout Layout); + +// Only available when Layout != layout::dynamic +// and when std::is_same_v> +template +void joint_matrix_load(Group g, + joint_matrix &res, + annotated_ptr base_src, size_t RowIndex, size_t + ColIndex, size_t stride); + +} // namespace sycl::ext::oneapi::experimental::matrix +``` + +These overloads of `joint_matrix_load` takes the pointer `base_src` to +designate the base pointer of the global memory matrix. The +coordinates `RowIndex` and `ColIndex` into the global matrix to +calculate the pointer offset to load/store are given as separate +arguments. + +==== Offset Store +```c++ +namespace sycl::ext::oneapi::experimental::matrix { + +// T1 must be the same as T2 +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t RowIndex, + size_t ColIndex, size_t Stride, layout Layout); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + annotated_ptr base_dest, size_t RowIndex, size_t + ColIndex, size_t Stride, layout Layout); + +} // namespace sycl::ext::oneapi::experimental::matrix +``` +These overloads of `joint_matrix_store` takes the pointer `base_dest` to +designate the base pointer of the global memory matrix. The +coordinates `RowIndex` and `ColIndex` into the global matrix to +calculate the pointer offset to load/store are given as separate +arguments. + ==== Multiply and Add ```c++ @@ -979,7 +1059,7 @@ for (int i = 0; sizeof(combinations); i++) { } ``` -=== Appendix: Supported Combinations Per Hardware +=== Appendix: Supported Combinations and Restrictions Per Hardware The table below provides a list of the combinations that `joint_matrix` implementations support on each of Intel AMX and Intel XMX hardware. Note that these can be returned using @@ -1065,6 +1145,25 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` `architecture::intel_gpu_pvc` |====================== +===== Restrictions on `architecture::intel_gpu_pvc` + +- The `Stride` argument to `joint_matrix_load` and +`joint_matrix_store` must be a multiple of 8 bytes. Also, `Stride` +should not exceed `2^24` bytes. + +- The base pointer argument to `joint_matrix_load` and +`joint_matrix_store` must be 4 bytes aligned. + +- In the case of the offset overloads of `joint_matrix_load` and +`joint_matrix_store`, for 8 bits data type, `RowIndex` must be a +multiple of 4. For 16 bits data type, `RowIndex` must be a multiple of +2. So `RowIndex` must be a multiple of 4 divided by size of the +element type (`4/sizeof(T)`). + +- If these restrictions are not satisfied, users can switch to slower +implementations of `joint_matrix_load` and `joint_matrix_store` by +setting the driver flag `IGC_JointMatrixLoadStoreOpt=1`. + ==== Nvidia Tensor Cores Supported Combinations The complete set of matrix data types and shapes that are supported by the `ext_oneapi_cuda` backend are represented in the following From 9c76f6f70f5bc5dfd3d155347b512fdf92443895 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 24 Sep 2024 13:02:44 -0700 Subject: [PATCH 2/4] formatting --- .../sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc | 2 +- .../sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index 6d1f80eab164d..e625f3dc43416 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -463,7 +463,7 @@ The checked APIs are currently available in devices with the architecture these checked APIs: - The `Stride` argument must be a multiple of 8 bytes. Also, `Stride` -should not exceed `2^24` bytes. +should not exceed `2^24^` bytes. - The base pointer must be 4 bytes aligned. diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 8caa5f18bc69f..8326e36b33016 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -1149,15 +1149,15 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` - The `Stride` argument to `joint_matrix_load` and `joint_matrix_store` must be a multiple of 8 bytes. Also, `Stride` -should not exceed `2^24` bytes. +should not exceed `2^24^` bytes. - The base pointer argument to `joint_matrix_load` and `joint_matrix_store` must be 4 bytes aligned. - In the case of the offset overloads of `joint_matrix_load` and `joint_matrix_store`, for 8 bits data type, `RowIndex` must be a -multiple of 4. For 16 bits data type, `RowIndex` must be a multiple of -2. So `RowIndex` must be a multiple of 4 divided by size of the +multiple of 4. For 16 bits data type, `RowIndex` must be a multiple +of 2. So `RowIndex` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). - If these restrictions are not satisfied, users can switch to slower From 24f500390ce0ebae9e598adfcbe3fd5c07b96d4f Mon Sep 17 00:00:00 2001 From: Dounia Date: Wed, 25 Sep 2024 10:39:11 -0700 Subject: [PATCH 3/4] Add offset overloads for store A and B --- .../sycl_ext_intel_matrix.asciidoc | 41 ++++++++++++++++--- .../sycl_ext_oneapi_matrix.asciidoc | 24 +++++------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index e625f3dc43416..a543a62602590 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -124,7 +124,7 @@ enum class layout { Consequently, the layout argument `layout` in `joint_matrix_load` can take `ext_intel_packed` as argument to specify that the data has -already been transformed into VNNI format. In this case, the `stride` +already been transformed into VNNI format. In this case, the `Stride` argument of `joint_matrix_load` describes the number of elements between consecutive rows for packed layouts. @@ -148,28 +148,59 @@ template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t stride); + multi_ptr dest, size_t Stride); template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t stride); + multi_ptr dest, size_t Stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr dest, - size_t stride); + size_t Stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr dest, - size_t stride); + size_t Stride); + +// Overloads for offset store +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t RowIndex, + size_t ColIndex, size_t Stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + multi_ptr base_dest, size_t RowIndex, + size_t ColIndex, size_t Stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + ext::oneapi::experimental::annotated_ptr + base_dest, size_t RowIndex, size_t ColIndex, size_t Stride); + +template +void joint_matrix_store(Group g, + const joint_matrix &res, + ext::oneapi::experimental::annotated_ptr + base_dest, size_t RowIndex, size_t ColIndex, size_t Stride); } // namespace sycl::ext::intel::experimental::matrix ``` diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 8326e36b33016..922ee1bff06cc 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -216,7 +216,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr src, size_t stride, layout Layout); + multi_ptr src, size_t Stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -226,7 +226,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr src, size_t stride); + multi_ptr src, size_t Stride); // Only available when std::is_same_v> template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr src, size_t stride, layout Layout); + annotated_ptr src, size_t Stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -243,7 +243,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr src, size_t stride); + annotated_ptr src, size_t Stride); } // namespace sycl::ext::oneapi::experimental::matrix ``` @@ -261,7 +261,7 @@ The second overload without a memory layout must not be used with a The base pointer `src` of type `T` here determines the starting address of the matrix to be loaded from. `Layout` determines whether the data is being read in a row (`row_major`), column major (`col_major`) -fashion. `stride` describes the number of elements between consecutive +fashion. `Stride` describes the number of elements between consecutive rows for the row major layout, or between columns for the column major layout. @@ -301,13 +301,13 @@ template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t stride, layout Layout); + multi_ptr dest, size_t Stride, layout Layout); template void joint_matrix_store(Group g, const joint_matrix &res, - annotated_ptr dest, size_t stride, layout Layout); + annotated_ptr dest, size_t Stride, layout Layout); } // namespace sycl::ext::oneapi::experimental::matrix ``` @@ -317,7 +317,7 @@ registers back to memory. The base pointer `dest` here determines the starting address of the matrix to be stored. `Layout` determines whether the data is being written in a row (`row_major`), column major (`col_major`) -fashion. `stride` describes the number of elements between consecutive +fashion. `Stride` describes the number of elements between consecutive rows for the row major layout, or between columns for the column major layout. The second overload of `joint_matrix_store` takes @@ -368,7 +368,7 @@ template &res, annotated_ptr base_src, size_t RowIndex, size_t - ColIndex, size_t stride); + ColIndex, size_t Stride); } // namespace sycl::ext::oneapi::experimental::matrix ``` @@ -552,7 +552,7 @@ namespace sycl::ext::oneapi::experimental::matrix { template -void joint_matrix_prefetch(Group g, T* ptr, size_t stride, layout Layout, +void joint_matrix_prefetch(Group g, T* ptr, size_t Stride, layout Layout, Properties properties = {}); } // namespace sycl::ext::oneapi::experimental::matrix @@ -1217,11 +1217,11 @@ supported parameter combination is specified in the following table. | `matrix_type::fp64` | `matrix_type::fp64` | `matrix_type::fp64` |8 |8 |4 |====================== -IMPORTANT: The `stride` argument to `joint_matrix_load` and +IMPORTANT: The `Stride` argument to `joint_matrix_load` and `joint_matrix_store` must be a multiple of 8 when `T` is `half`, and a multiple of 4 when `T` is `float`; where `T` is the type of the `joint_matrix` elements. When `T` is not `half` or `float` there are -no restrictions to `stride`. +no restrictions to `Stride`. IMPORTANT: For some devices it is important to use the sm version (Compute Capability) corresponding to the device that will run the From 8b51f6a01f5659daf43fc3c6bba219ebd5dc769a Mon Sep 17 00:00:00 2001 From: Dounia Date: Wed, 9 Oct 2024 18:50:01 -0700 Subject: [PATCH 4/4] Address Greg's review --- .../sycl_ext_intel_matrix.asciidoc | 110 +++++++++--------- .../sycl_ext_oneapi_matrix.asciidoc | 68 +++++------ 2 files changed, 91 insertions(+), 87 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc index a543a62602590..a502da947427a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc @@ -124,7 +124,7 @@ enum class layout { Consequently, the layout argument `layout` in `joint_matrix_load` can take `ext_intel_packed` as argument to specify that the data has -already been transformed into VNNI format. In this case, the `Stride` +already been transformed into VNNI format. In this case, the `stride` argument of `joint_matrix_load` describes the number of elements between consecutive rows for packed layouts. @@ -148,28 +148,28 @@ template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t Stride); + multi_ptr dest, size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t Stride); + multi_ptr dest, size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr dest, - size_t Stride); + size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr dest, - size_t Stride); + size_t stride); // Overloads for offset store template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t RowIndex, - size_t ColIndex, size_t Stride); + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t RowIndex, - size_t ColIndex, size_t Stride); + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr - base_dest, size_t RowIndex, size_t ColIndex, size_t Stride); + base_dest, size_t row_index, size_t col_index, size_t stride); template void joint_matrix_store(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr - base_dest, size_t RowIndex, size_t ColIndex, size_t Stride); + base_dest, size_t row_index, size_t col_index, size_t stride); } // namespace sycl::ext::intel::experimental::matrix ``` @@ -275,19 +275,19 @@ supporting the out of bounds checked APIs that are defined in this section. In this section, we refer to the memory buffer where a `joint_matrix` is loaded from or stored to as the global matrix. This global matrix is also interpreted as a two-dimensional memory region as follows, where -`GlobalRows` is number of rows in the global matrix, `GlobalCols` is number of -columns in the global matrix, `Stride` is number of columns that include +`global_rows` is number of rows in the global matrix, `global_cols` is number of +columns in the global matrix, `stride` is number of columns that include the out of bounds data (depicted as x here). ``` - GlobalCols + global_cols <-----------> dddddddddddddxxx ^ - dddddddddddddxxx | GlobalRows + dddddddddddddxxx | global_rows dddddddddddddxxx v xxxxxxxxxxxxxxxx <--------------> - Stride + stride ``` In the diagram above, the global matrix has 13 columns and 3 @@ -324,15 +324,15 @@ checking, namely `joint_matrix_fill`, `joint_matrix_load`, and the global memory matrix, which is different from the APIs that do not do bounds checking. Those non-bounds-checking APIs take a pointer to the base of the joint matrix. -* The coordinates `RowIndex` and `ColIndex` into the global matrix to +* The coordinates `row_index` and `col_index` into the global matrix to calculate the pointer offset to load/store are given as separate arguments. * These variants take extra arguments to determine the global bounds -`GlobalRows` and `GlobalCols` of the global matrix. +`global_rows` and `global_cols` of the global matrix. To illustrate the out-of-bounds checking, consider the global matrix -shown above which has 13 columns and 3 rows (`GlobalRows=3` and -`GlobalCols=13`), where the joint matrix size is 8 columns by 2 rows defined as +shown above which has 13 columns and 3 rows (`global_rows=3` and +`global_cols=13`), where the joint matrix size is 8 columns by 2 rows defined as ``` joint_matrix sub_b; ``` @@ -342,14 +342,14 @@ both dimensions. This is shown below, where capital letters correspond to the elements that are accessed by this joint matrix load: ``` - GlobalCols + global_cols <-----------> dddddddddddddxxx ^ - dddddddddddddxxx | GlobalRows + dddddddddddddxxx | global_rows ddddddddDDDDDXXX v xxxxxxxxXXXXXXXX <--------------> - Stride + stride ``` If the joint matrix is loaded via `joint_matrix_load_checked` using @@ -366,8 +366,8 @@ namespace sycl::ext::intel::experimental::matrix { template void joint_matrix_fill_checked(Group g, joint_matrix &m, Tv v, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + Cols, Layout> &m, Tv v, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when std::is_same_v> template void joint_matrix_load_checked(Group g, joint_matrix &res, - multi_ptr base_src, size_t Stride, - layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + multi_ptr base_src, size_t stride, + layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -387,8 +387,8 @@ template void joint_matrix_load_checked(Group g, joint_matrix &res, - multi_ptr base_src, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_src, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); // Only available when std::is_same_v> template &res, ext::oneapi::experimental::annotated_ptr base_src, - size_t Stride, layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -406,55 +406,55 @@ template &res, ext::oneapi::experimental::annotated_ptr base_src, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, layout Layout, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, layout Layout, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t Stride, - size_t GlobalRows, size_t GlobalCols, size_t RowIndex, size_t ColIndex); + multi_ptr base_dest, size_t stride, + size_t global_rows, size_t global_cols, size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, layout Layout, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, layout Layout, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); template void joint_matrix_store_checked(Group g, const joint_matrix &res, ext::oneapi::experimental::annotated_ptr base_dest, - size_t Stride, size_t GlobalRows, size_t GlobalCols, - size_t RowIndex, size_t ColIndex); + size_t stride, size_t global_rows, size_t global_cols, + size_t row_index, size_t col_index); } // namespace sycl::ext::intel::experimental::matrix ``` @@ -476,12 +476,12 @@ the following queries to get these requirements: |Tells the required alignment (in bytes) of the base pointer for `joint_matrix_load_checked` and `joint_matrix_store_checked`. |`ext::intel::experimental::info::device::matrix_checked_rowindex_multiple_of`| -`size_t`|Returns a value, of which `RowIndex` must be multiple of; +`size_t`|Returns a value, of which `row_index` must be multiple of; where `T` is the element type of the matrix. When using the matrices with the machine learning types, `T` should be the element type (e.g. `precision::tf32`) not the storage type. |`ext::intel::experimental::info::device::matrix_checked_globalcols_multiple_of`| -`size_t` | Returns a value, of which `GlobalCols` must be multiple of; +`size_t` | Returns a value, of which `global_cols` must be multiple of; where `T` is the element type of the matrix. When using the matrices with the machine learning types, `T` should be the element type (e.g. `precision::tf32`) not the storage type. @@ -493,17 +493,19 @@ The checked APIs are currently available in devices with the architecture `architecture::intel_gpu_pvc`. The following restrictions apply to these checked APIs: -- The `Stride` argument must be a multiple of 8 bytes. Also, `Stride` -should not exceed `2^24^` bytes. +- The `stride` parameter has the following restrictions: + +* The value `stride * sizeof(T1)` must be a multiple of 8, and +* The value of `stride * sizeof(T1)` must not exceed `2^24^`. - The base pointer must be 4 bytes aligned. -- For 8 bits data type, `RowIndex` must be a multiple of 4. For 16 bits -data type, `RowIndex` must be a multiple of 2. So `RowIndex` must be a +- For 8 bits data type, `row_index` must be a multiple of 4. For 16 bits +data type, `row_index` must be a multiple of 2. So `row_index` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). -- For 8 bits data type, `GlobalCols` must be a multiple of 4. For 16 bits -data type, `GlobalCols` must be a multiple of 2. So `GlobalCols` must be a +- For 8 bits data type, `global_cols` must be a multiple of 4. For 16 bits +data type, `global_cols` must be a multiple of 2. So `global_cols` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). === New Device Information Descriptor diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 922ee1bff06cc..8588fa9cb2ba3 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -216,7 +216,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr src, size_t Stride, layout Layout); + multi_ptr src, size_t stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -226,7 +226,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr src, size_t Stride); + multi_ptr src, size_t stride); // Only available when std::is_same_v> template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr src, size_t Stride, layout Layout); + annotated_ptr src, size_t stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -243,7 +243,7 @@ template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr src, size_t Stride); + annotated_ptr src, size_t stride); } // namespace sycl::ext::oneapi::experimental::matrix ``` @@ -261,7 +261,7 @@ The second overload without a memory layout must not be used with a The base pointer `src` of type `T` here determines the starting address of the matrix to be loaded from. `Layout` determines whether the data is being read in a row (`row_major`), column major (`col_major`) -fashion. `Stride` describes the number of elements between consecutive +fashion. `stride` describes the number of elements between consecutive rows for the row major layout, or between columns for the column major layout. @@ -301,13 +301,13 @@ template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr dest, size_t Stride, layout Layout); + multi_ptr dest, size_t stride, layout Layout); template void joint_matrix_store(Group g, const joint_matrix &res, - annotated_ptr dest, size_t Stride, layout Layout); + annotated_ptr dest, size_t stride, layout Layout); } // namespace sycl::ext::oneapi::experimental::matrix ``` @@ -317,7 +317,7 @@ registers back to memory. The base pointer `dest` here determines the starting address of the matrix to be stored. `Layout` determines whether the data is being written in a row (`row_major`), column major (`col_major`) -fashion. `Stride` describes the number of elements between consecutive +fashion. `stride` describes the number of elements between consecutive rows for the row major layout, or between columns for the column major layout. The second overload of `joint_matrix_store` takes @@ -337,8 +337,8 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr base_src, size_t RowIndex, - size_t ColIndex, size_t Stride, layout Layout); + multi_ptr base_src, size_t row_index, + size_t col_index, size_t stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -348,8 +348,8 @@ template void joint_matrix_load(Group g, joint_matrix &res, - multi_ptr base_src, size_t RowIndex, - size_t ColIndex size_t Stride); + multi_ptr base_src, size_t row_index, + size_t col_index size_t stride); // Only available when std::is_same_v> template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr base_src, size_t RowIndex, size_t - ColIndex, size_t Stride, layout Layout); + annotated_ptr base_src, size_t row_index, size_t + col_index, size_t stride, layout Layout); // Only available when Layout != layout::dynamic // and when std::is_same_v> @@ -367,15 +367,15 @@ template void joint_matrix_load(Group g, joint_matrix &res, - annotated_ptr base_src, size_t RowIndex, size_t - ColIndex, size_t Stride); + annotated_ptr base_src, size_t row_index, size_t + col_index, size_t stride); } // namespace sycl::ext::oneapi::experimental::matrix ``` These overloads of `joint_matrix_load` takes the pointer `base_src` to designate the base pointer of the global memory matrix. The -coordinates `RowIndex` and `ColIndex` into the global matrix to +coordinates `row_index` and `col_index` into the global matrix to calculate the pointer offset to load/store are given as separate arguments. @@ -388,21 +388,21 @@ template void joint_matrix_store(Group g, const joint_matrix &res, - multi_ptr base_dest, size_t RowIndex, - size_t ColIndex, size_t Stride, layout Layout); + multi_ptr base_dest, size_t row_index, + size_t col_index, size_t stride, layout Layout); template void joint_matrix_store(Group g, const joint_matrix &res, - annotated_ptr base_dest, size_t RowIndex, size_t - ColIndex, size_t Stride, layout Layout); + annotated_ptr base_dest, size_t row_index, size_t + col_index, size_t stride, layout Layout); } // namespace sycl::ext::oneapi::experimental::matrix ``` These overloads of `joint_matrix_store` takes the pointer `base_dest` to designate the base pointer of the global memory matrix. The -coordinates `RowIndex` and `ColIndex` into the global matrix to +coordinates `row_index` and `col_index` into the global matrix to calculate the pointer offset to load/store are given as separate arguments. @@ -552,7 +552,7 @@ namespace sycl::ext::oneapi::experimental::matrix { template -void joint_matrix_prefetch(Group g, T* ptr, size_t Stride, layout Layout, +void joint_matrix_prefetch(Group g, T* ptr, size_t stride, layout Layout, Properties properties = {}); } // namespace sycl::ext::oneapi::experimental::matrix @@ -642,7 +642,7 @@ float *buf = malloc_shared(M*K, q); auto pBuf = address_space_cast(buf); -joint_matrix_load(sg, tA, pBuf + Offset, Stride); +joint_matrix_load(sg, tA, pBuf + Offset, stride); ``` ==== store @@ -656,7 +656,7 @@ float *buf = malloc_shared(M*K, q); auto pBuf = address_space_cast(buf); -joint_matrix_store(sg, tA, pBuf + Offset, Stride, layout::row_major); +joint_matrix_store(sg, tA, pBuf + Offset, stride, layout::row_major); ``` ==== fill @@ -1147,17 +1147,19 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` ===== Restrictions on `architecture::intel_gpu_pvc` -- The `Stride` argument to `joint_matrix_load` and -`joint_matrix_store` must be a multiple of 8 bytes. Also, `Stride` -should not exceed `2^24^` bytes. +- The `stride` parameter to `joint_matrix_load` and + `joint_matrix_store` has the following restrictions: + +* The value `stride * sizeof(T1)` must be a multiple of 8, and +* The value of `stride * sizeof(T1)` must not exceed `2^24^`. - The base pointer argument to `joint_matrix_load` and `joint_matrix_store` must be 4 bytes aligned. - In the case of the offset overloads of `joint_matrix_load` and -`joint_matrix_store`, for 8 bits data type, `RowIndex` must be a -multiple of 4. For 16 bits data type, `RowIndex` must be a multiple -of 2. So `RowIndex` must be a multiple of 4 divided by size of the +`joint_matrix_store`, for 8 bits data type, `row_index` must be a +multiple of 4. For 16 bits data type, `row_index` must be a multiple +of 2. So `row_index` must be a multiple of 4 divided by size of the element type (`4/sizeof(T)`). - If these restrictions are not satisfied, users can switch to slower @@ -1217,11 +1219,11 @@ supported parameter combination is specified in the following table. | `matrix_type::fp64` | `matrix_type::fp64` | `matrix_type::fp64` |8 |8 |4 |====================== -IMPORTANT: The `Stride` argument to `joint_matrix_load` and +IMPORTANT: The `stride` argument to `joint_matrix_load` and `joint_matrix_store` must be a multiple of 8 when `T` is `half`, and a multiple of 4 when `T` is `float`; where `T` is the type of the `joint_matrix` elements. When `T` is not `half` or `float` there are -no restrictions to `Stride`. +no restrictions to `stride`. IMPORTANT: For some devices it is important to use the sm version (Compute Capability) corresponding to the device that will run the