load_tile_transpose.hpp Source File

load_tile_transpose.hpp Source File

Composable Kernel: load_tile_transpose.hpp Source File
load_tile_transpose.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
17
18namespace ck_tile {
19
// Compile-time accessor for a fixed size constant: always returns 8.
// NOTE(review): judging by the name, this is presumably the size used by the
// DS_READ_TR (transposed read) path -- the unit (elements vs. bytes) is not
// visible in this file; confirm against callers.
20constexpr int DS_READ_TR_SIZE()
21{
22 return 8; // Literal constant, evaluated at compile time
23}
24
25namespace util {
26template <typename Suffix, typename Sequence>
28{
29 static constexpr bool size_check = (Suffix::size() <= Sequence::size());
30
31 static constexpr index_t start_pos = Sequence::size() - Suffix::size();
32 using extract_indices = typename arithmetic_sequence_gen<start_pos, Sequence::size(), 1>::type;
33
34 static constexpr bool value =
35 size_check && (Suffix{} == decltype(Sequence::extract(extract_indices{})){});
36};
37
38template <index_t... Xs>
40{
41 static constexpr bool value = true;
42};
43
44template <typename Suffix, typename Sequence>
46
47} // namespace util
48
49// Default policy: Retains original 2D transpose behavior
50template <typename DataType>
52{
53 template <index_t LaneGroupSize>
54 struct Quad16
55 {
56 static_assert(LaneGroupSize == 64 || LaneGroupSize == 32 || LaneGroupSize == 16,
57 "LaneGroupSize must be 64, 32, or 16");
60 tuple<sequence<4>, sequence<LaneGroupSize / 16, 4, 4>>,
65
73 };
74
75 template <index_t LaneGroupSize>
76 struct Quad8
77 {
78 static_assert(LaneGroupSize == 64 || LaneGroupSize == 32 || LaneGroupSize == 16,
79 "LaneGroupSize must be 64, 32, or 16");
82 tuple<sequence<8>, sequence<LaneGroupSize / 16, 2, 8>>,
87
95 };
96
97 // Select based on data size
98 template <index_t LaneGroupSize>
99 using QuadInputEncoding = std::conditional_t<sizeof(DataType) == 2,
102
103 template <index_t LaneGroupSize>
104 using QuadOutputEncoding = std::conditional_t<sizeof(DataType) == 2,
107
108 // Always swap last two dimensions
109 static constexpr auto transpose_dims = sequence<1, 0>{};
110
111 // Programmable: Element grouping function
112 static constexpr auto group_func = [](auto idx) {
113 return idx; // Identity mapping
114 };
115
116 template <typename InDstrEncode, bool ReverseDirection, index_t LaneGroupSize>
118 {
119 using QuadEncoding = std::conditional_t<ReverseDirection,
122 static constexpr auto I0 = number<0>{};
123 static constexpr auto I1 = number<1>{};
124 static constexpr auto input_hs = InDstrEncode::hs_lengthss_;
125 static constexpr auto quad_hs = QuadEncoding::hs_lengthss_;
126 // 1. Must be 2D tensor
127 static constexpr bool dims_valid = (InDstrEncode::NDimX == 2);
128 // 2. Quad pattern must be suffix of input pattern
129 static constexpr bool suffix_valid_dim0 =
130 util::is_sequence_suffix_v<decltype(quad_hs[I0]), decltype(input_hs[I0])>;
131 static constexpr bool suffix_valid_dim1 =
132 util::is_sequence_suffix_v<decltype(quad_hs[I1]), decltype(input_hs[I1])>;
133
134 // 3. PS→RHS mapping constraints
135 static constexpr auto input_ps_major = InDstrEncode::ps_to_rhss_major_;
136 static constexpr auto input_ps_minor = InDstrEncode::ps_to_rhss_minor_;
137
138 static constexpr auto quad_ps_major0 = QuadEncoding::ps_to_rhss_major_[I0];
139 static constexpr auto quad_ps_minor0 = QuadEncoding::ps_to_rhss_minor_[I0];
140
141 static constexpr auto input_ps_major_last =
142 input_ps_major[number<input_ps_major.size() - 1>{}];
143 static constexpr auto input_ps_minor_last =
144 input_ps_minor[number<input_ps_minor.size() - 1>{}];
145
147 input_hs[I1].size() - quad_hs[I1].size()>;
149 [](auto i) {
150 return number<quad_ps_minor0[i] + psys_offset{}[quad_ps_major0[i] - 1]>{};
151 },
152 number<quad_ps_minor0.size()>{});
153
154 static constexpr bool ps_mapping_valid =
157 decltype(input_ps_minor_last)>;
158
159 // 4. YS→RHS mapping constraints
160 static constexpr auto input_ys_major = InDstrEncode::ys_to_rhs_major_;
161 static constexpr auto input_ys_minor = InDstrEncode::ys_to_rhs_minor_;
162 static constexpr auto quad_ys_major = QuadEncoding::ys_to_rhs_major_;
163 static constexpr auto quad_ys_minor = QuadEncoding::ys_to_rhs_minor_;
164
165 static_assert(quad_ys_major.size() == 1 && quad_ys_minor.size() == 1,
166 "YS->RHS mapping must be single dimension");
167 static_assert(quad_ys_major.back() == 2 && quad_ys_minor.back() == quad_hs[I1].size() - 1,
168 "YS->RHS mapping must be the last dimension");
169 static constexpr bool ys_mapping_valid =
170 (input_ys_major.back() == 2) && (input_ys_minor.back() == input_hs[I1].size() - 1);
171
172 static constexpr bool value = dims_valid && suffix_valid_dim0 && suffix_valid_dim1 &&
174 };
175
176 template <typename InDstrEncode, bool ReverseDirection = false>
189};
190template <typename TileDistribution_, typename DataType_, typename Policy>
192{
194
195 using Validator = typename Policy::template ValidationTraits<InDstrEncode>;
196
197 static constexpr bool distr_encoding_valid = Validator::value;
198};
199
200// this is used to generate the transposed output tile distribution encoding
201// based on the input tile distribution encoding
202template <typename TileDistributionEncoding_,
203 typename DataType_,
204 typename Policy = DefaultTranspose<DataType_>,
205 bool ReverseDirection = false>
207{
209 static constexpr auto input_hs_lengthss = InDstrEncode::hs_lengthss_;
210 static constexpr index_t LaneGroupSize =
211 Policy::template ValidationTraits<InDstrEncode, ReverseDirection>::LaneGroupSize;
212 static_assert(Policy::template ValidationTraits<InDstrEncode, ReverseDirection>::value,
213 "The input tile distribution encoding is not valid for transpose!");
214
215 using QuadInputEncoding = std::conditional_t< //
216 ReverseDirection,
217 typename Policy::template QuadOutputEncoding<LaneGroupSize>,
218 typename Policy::template QuadInputEncoding<LaneGroupSize>>;
219 using QuadOutputEncoding = std::conditional_t< //
220 ReverseDirection,
221 typename Policy::template QuadInputEncoding<LaneGroupSize>,
222 typename Policy::template QuadOutputEncoding<LaneGroupSize>>;
223
224 static constexpr auto quad_input_hs_lengthss = QuadInputEncoding::hs_lengthss_;
225 static constexpr auto quad_output_hs_lengthss = QuadOutputEncoding::hs_lengthss_;
226
227 static constexpr auto input_ps_to_rhss_major = InDstrEncode::ps_to_rhss_major_;
228 static constexpr auto input_ps_to_rhss_minor = InDstrEncode::ps_to_rhss_minor_;
229 static constexpr auto input_ys_to_rhs_major = InDstrEncode::ys_to_rhs_major_;
230 static constexpr auto input_ys_to_rhs_minor = InDstrEncode::ys_to_rhs_minor_;
231
232 static constexpr auto I0 = number<0>{};
233 static constexpr auto quad_input_ps_to_rhss_major0 = QuadInputEncoding::ps_to_rhss_major_[I0];
234 static constexpr auto quad_input_ps_to_rhss_minor0 = QuadInputEncoding::ps_to_rhss_minor_[I0];
235 static constexpr auto quad_output_ps_to_rhss_major0 = QuadOutputEncoding::ps_to_rhss_major_[I0];
236 static constexpr auto quad_output_ps_to_rhss_minor0 = QuadOutputEncoding::ps_to_rhss_minor_[I0];
237 static constexpr auto quad_output_ys_to_rhs_major = QuadOutputEncoding::ys_to_rhs_major_;
238 static constexpr auto quad_output_ys_to_rhs_minor = QuadOutputEncoding::ys_to_rhs_minor_;
239
240 static constexpr index_t dim0 = Policy::transpose_dims[0];
241 static constexpr index_t dim1 = Policy::transpose_dims[1];
242
243 static constexpr auto swap_one_and_two = [](const index_t idx) {
244 return (idx == 1) ? 2 : (idx == 2) ? 1 : idx;
245 };
246
247 // for transpose load
248 // strip the trailing quad_input_hs_lengthss from input_hs_lengthss in each dimension, reverse
249 // the order of the dimensions, then append quad_output_hs_lengthss to the end of each dimension
250 static constexpr auto outer_hs_lengthss = generate_tuple(
251 [](auto i) {
252 constexpr auto input_i = input_hs_lengthss[i];
253 constexpr auto outer_len = input_i.size() - quad_input_hs_lengthss[i].size();
254 return typename sequence_split<decltype(input_i), outer_len>::left_type{};
255 },
258 static constexpr auto dst_out_hs_lengthss = generate_tuple(
259 [](auto i) {
260 auto outer_i = reversed_outer_hs_lengthss[i];
261 // append the reversed quad output hs lengths to the outer hs lengths
262 return outer_i.push_back(quad_output_hs_lengthss[i]);
263 },
265
266 // for the PS→RHS mapping (both major and minor), only the last sequence (the one used for
267 // thread distribution) needs to be modified
268 static constexpr auto dst_ps_to_rhss_major = generate_tuple(
269 // because dst_out_hs_lengthss is reversed, the major indices also need to be swapped (1 <-> 2)
270 [](auto i) {
271 if constexpr(i == input_ps_to_rhss_major.size() - 1)
272 {
273 constexpr auto current_size = input_ps_to_rhss_major[i].size();
274 constexpr auto reduce_size = quad_input_ps_to_rhss_major0.size();
275 constexpr auto quad_out = quad_output_ps_to_rhss_major0;
276 constexpr auto reduced_ps_to_rhss_major = input_ps_to_rhss_major[i].extract(
278 return reduced_ps_to_rhss_major.transform(swap_one_and_two).push_back(quad_out);
279 }
280 else
281 {
282 // For all other sequences (i.e. warp), keep the entries and only swap major indices 1 and 2
283 return input_ps_to_rhss_major[i].transform(swap_one_and_two);
284 }
285 },
287
288 static constexpr auto quad_idx_offset =
289 transform_tuples([](auto x) { return number<x.size()>{}; }, reversed_outer_hs_lengthss);
290
291 // minus 1 because RsLength is not counted
293 [](auto x) { return quad_idx_offset[number<x - 1>{}]; }, quad_output_ps_to_rhss_major0));
295 [](auto x) { return quad_idx_offset[number<x - 1>{}]; }, quad_output_ys_to_rhs_major));
296
297 static constexpr auto dst_ps_to_rhss_minor = generate_tuple(
298 [](auto i) {
299 constexpr auto input_i = input_ps_to_rhss_minor[i];
300 if constexpr(i == input_ps_to_rhss_minor.size() - 1)
301 {
302 constexpr auto outer_len = input_i.size() - quad_input_ps_to_rhss_minor0.size();
303 constexpr auto outer_ps =
304 typename sequence_split<decltype(input_i), outer_len>::left_type{};
305
306 return outer_ps.push_back(quad_output_ps_minor_offset +
308 }
309 else
310 {
311 // For all other sequences, keep them unchanged
312 return input_i;
313 }
314 },
316
317 static constexpr auto outer_input_ys_to_rhs_major = input_ys_to_rhs_major.pop_back();
318
319 // because dst_out_hs_lengthss is reversed, the major index also needs to be reversed
320 static constexpr auto dst_ys_to_rhs_major =
322
323 static constexpr auto dst_ys_to_rhs_minor = input_ys_to_rhs_minor.pop_back().push_back(
325
327 tile_distribution_encoding<typename InDstrEncode::RsLengths,
333};
334
335template <typename TileDistributionEncoding_,
336 typename DataType_,
337 typename Policy = DefaultTranspose<DataType_>>
340template <typename TileDistributionEncoding_,
341 typename DataType_,
342 typename Policy = DefaultTranspose<DataType_>>
345
346template <typename InnerEncode,
347 index_t kLeadIterPerWarp,
348 index_t kSecondIterPerWarp,
349 index_t kLeadNumWarps,
350 index_t kSecondNumWarps>
352{
353 constexpr auto block_outer_dst_encoding =
361 constexpr auto blk_distr_encode =
362 detail::make_embed_tile_distribution_encoding(block_outer_dst_encoding, InnerEncode{});
363
364 return blk_distr_encode;
365}
366
392template <
393 typename BottomTensorView_,
394 typename WindowLengths_,
395 typename TileDistribution_,
396 index_t NumCoord,
397 typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>,
398 typename = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_,
399 typename BottomTensorView_::DataType,
400 Policy>::distr_encoding_valid,
401 Policy>>
404 WindowLengths_,
405 TileDistribution_,
406 NumCoord>& tile_window)
407{
408 using OutTileDstrEncode = typename OutputTileDistributionTraits<
409 typename TileDistribution_::DstrEncode,
410 typename BottomTensorView_::DataType>::TransposedDstrEncode;
412 make_static_tile_distribution(OutTileDstrEncode{}));
413 auto trans_tensor = tile_window.template load_transpose<Policy>();
414 constexpr auto input_distr = TileDistribution_{};
415 constexpr auto output_distr = make_static_tile_distribution(OutTileDstrEncode{});
416
417 constexpr auto y_in_desc = input_distr.get_ys_to_d_descriptor();
418 constexpr auto y_out_desc = output_distr.get_ys_to_d_descriptor();
419
420 constexpr index_t NDimYIn = input_distr.get_num_of_dimension_y();
421 constexpr index_t NDimYOut = output_distr.get_num_of_dimension_y();
422
423 constexpr auto y_in_lengths = to_sequence(y_in_desc.get_lengths());
424 constexpr auto y_out_lengths = to_sequence(y_out_desc.get_lengths());
425
426 constexpr auto y_in_element_space_size = y_in_desc.get_element_space_size();
427 constexpr auto y_out_element_space_size = y_out_desc.get_element_space_size();
428 static_assert(y_in_element_space_size == y_out_element_space_size,
429 "the element space size is not the same!");
430 static_assert(y_in_lengths[NDimYIn - 1] == y_out_lengths[NDimYOut - 1],
431 "the vector length is not the same!");
432 constexpr index_t vecLoadSize = y_in_lengths[NDimYIn - 1];
433 constexpr index_t num_of_access =
434 reduce_on_sequence(y_in_lengths, multiplies{}, number<1>{}) / vecLoadSize;
435
437 static_for<0, num_of_access, 1>{}([&](auto iAccess) {
438 out_tensor.get_thread_buffer().template set_as<DataVec>(
440 trans_tensor.get_thread_buffer().template get_as<DataVec>(number<iAccess>{}));
441 });
442
443 return out_tensor;
444}
445
446} // namespace ck_tile
#define CK_TILE_DEVICE
Definition config.hpp:41
#define CK_TILE_HOST_DEVICE
Definition config.hpp:42
CK_TILE_HOST_DEVICE constexpr auto make_embed_tile_distribution_encoding(OuterDstr, InnerDstr)
Definition tile_distribution_encoding.hpp:457
Definition load_tile_transpose.hpp:25
constexpr bool is_sequence_suffix_v
Definition load_tile_transpose.hpp:45
Definition tile/core/algorithm/cluster_descriptor.hpp:13
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21
CK_TILE_HOST_DEVICE constexpr auto generate_tuple_for(F &&f, sequence< ids... >)
Definition tile/core/container/tuple.hpp:423
CK_TILE_HOST_DEVICE constexpr auto tuple_reverse(const tuple< Ts... > &t)
Definition tile/core/container/tuple.hpp:583
TransposeTileDistributionTraits< TileDistributionEncoding_, DataType_, Policy, false > OutputTileDistributionTraits
Definition load_tile_transpose.hpp:338
TransposeTileDistributionTraits< TileDistributionEncoding_, DataType_, Policy, true > InputTileDistributionTraits
Definition load_tile_transpose.hpp:343
CK_TILE_HOST_DEVICE constexpr auto generate_sequence_v2(F &&f, number< N >)
Definition tile/core/container/sequence.hpp:1045
CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X &x)
Definition tile/core/container/tuple.hpp:505
CK_TILE_HOST_DEVICE constexpr auto make_static_distributed_tensor(const StaticTileDistribution &)
Definition static_distributed_tensor.hpp:142
CK_TILE_DEVICE auto load_tile_transpose(const tile_window_with_static_distribution< BottomTensorView_, WindowLengths_, TileDistribution_, NumCoord > &tile_window)
transpose loads tile from a tensor and returns the resulting tensor with a new (transposed) tile dist...
Definition load_tile_transpose.hpp:403
constant< v > number
Definition tile/core/numeric/integral_constant.hpp:37
CK_TILE_HOST_DEVICE constexpr auto generate_tuple(F &&f, number< N >)
Definition tile/core/container/tuple.hpp:429
CK_TILE_HOST_DEVICE constexpr auto to_sequence(tuple< number< Is >... >)
Definition tile/core/container/sequence.hpp:1055
int32_t index_t
Definition integer.hpp:9
CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistributionEncoding_)
Definition tile_distribution.hpp:480
constexpr int DS_READ_TR_SIZE()
Definition load_tile_transpose.hpp:20
CK_TILE_HOST_DEVICE constexpr index_t reduce_on_sequence(Seq, Reduce f, number< Init >)
Definition tile/core/container/sequence.hpp:982
CK_TILE_HOST_DEVICE constexpr auto InputTileDistributionEncoding()
Definition load_tile_transpose.hpp:351
Definition load_tile_transpose.hpp:55
tile_distribution_encoding< sequence<>, tuple< sequence< 4 >, sequence< LaneGroupSize/16, 4, 4 > >, tuple< sequence< 2, 1, 2 > >, tuple< sequence< 0, 0, 1 > >, sequence< 2 >, sequence< 2 > > InputEncoding
Definition load_tile_transpose.hpp:58
tile_distribution_encoding< sequence<>, tuple< sequence< LaneGroupSize >, sequence< 4 > >, tuple< sequence< 1 > >, tuple< sequence< 0 > >, sequence< 2 >, sequence< 0 > > OutputEncoding
Definition load_tile_transpose.hpp:66
Definition load_tile_transpose.hpp:77
tile_distribution_encoding< sequence<>, tuple< sequence< 8 >, sequence< LaneGroupSize/16, 2, 8 > >, tuple< sequence< 2, 1, 2 > >, tuple< sequence< 0, 0, 1 > >, sequence< 2 >, sequence< 2 > > InputEncoding
Definition load_tile_transpose.hpp:80
tile_distribution_encoding< sequence<>, tuple< sequence< LaneGroupSize >, sequence< 8 > >, tuple< sequence< 1 > >, tuple< sequence< 0 > >, sequence< 2 >, sequence< 0 > > OutputEncoding
Definition load_tile_transpose.hpp:88
Definition load_tile_transpose.hpp:178
static constexpr index_t LaneGroupSize
Definition load_tile_transpose.hpp:183
static constexpr bool value
Definition load_tile_transpose.hpp:179
Definition load_tile_transpose.hpp:118
static constexpr bool ys_mapping_valid
Definition load_tile_transpose.hpp:169
static constexpr auto I1
Definition load_tile_transpose.hpp:123
static constexpr bool suffix_valid_dim1
Definition load_tile_transpose.hpp:131
static constexpr bool value
Definition load_tile_transpose.hpp:172
static constexpr auto quad_ys_major
Definition load_tile_transpose.hpp:162
static constexpr auto quad_ys_minor
Definition load_tile_transpose.hpp:163
static constexpr auto quad_ps_minor0
Definition load_tile_transpose.hpp:139
static constexpr auto quad_hs
Definition load_tile_transpose.hpp:125
static constexpr auto input_ys_minor
Definition load_tile_transpose.hpp:161
static constexpr auto input_ps_major_last
Definition load_tile_transpose.hpp:141
static constexpr auto input_ps_minor_last
Definition load_tile_transpose.hpp:143
static constexpr auto I0
Definition load_tile_transpose.hpp:122
static constexpr auto shifted_quad_ps_minor0
Definition load_tile_transpose.hpp:148
static constexpr bool ps_mapping_valid
Definition load_tile_transpose.hpp:154
static constexpr auto input_ps_minor
Definition load_tile_transpose.hpp:136
static constexpr auto input_hs
Definition load_tile_transpose.hpp:124
static constexpr auto input_ps_major
Definition load_tile_transpose.hpp:135
ck_tile::sequence< input_hs[I0].size() - quad_hs[I0].size(), input_hs[I1].size() - quad_hs[I1].size()> psys_offset
Definition load_tile_transpose.hpp:146
static constexpr bool dims_valid
Definition load_tile_transpose.hpp:127
static constexpr bool suffix_valid_dim0
Definition load_tile_transpose.hpp:129
std::conditional_t< ReverseDirection, QuadOutputEncoding< LaneGroupSize >, QuadInputEncoding< LaneGroupSize > > QuadEncoding
Definition load_tile_transpose.hpp:119
static constexpr auto quad_ps_major0
Definition load_tile_transpose.hpp:138
static constexpr auto input_ys_major
Definition load_tile_transpose.hpp:160
Definition load_tile_transpose.hpp:52
static constexpr auto group_func
Definition load_tile_transpose.hpp:112
static constexpr auto transpose_dims
Definition load_tile_transpose.hpp:109
std::conditional_t< sizeof(DataType)==2, typename Quad16< LaneGroupSize >::InputEncoding, typename Quad8< LaneGroupSize >::InputEncoding > QuadInputEncoding
Definition load_tile_transpose.hpp:99
std::conditional_t< sizeof(DataType)==2, typename Quad16< LaneGroupSize >::OutputEncoding, typename Quad8< LaneGroupSize >::OutputEncoding > QuadOutputEncoding
Definition load_tile_transpose.hpp:104
Definition load_tile_transpose.hpp:192
static constexpr bool distr_encoding_valid
Definition load_tile_transpose.hpp:197
typename Policy::template ValidationTraits< InDstrEncode > Validator
Definition load_tile_transpose.hpp:195
typename remove_cvref_t< TileDistribution_ >::DstrEncode InDstrEncode
Definition load_tile_transpose.hpp:193
Definition load_tile_transpose.hpp:207
static constexpr auto quad_output_ps_to_rhss_minor0
Definition load_tile_transpose.hpp:236
static constexpr auto input_ys_to_rhs_major
Definition load_tile_transpose.hpp:229
static constexpr auto quad_input_ps_to_rhss_major0
Definition load_tile_transpose.hpp:233
static constexpr auto outer_input_ys_to_rhs_major
Definition load_tile_transpose.hpp:317
static constexpr auto quad_output_ps_to_rhss_major0
Definition load_tile_transpose.hpp:235
static constexpr auto dst_ps_to_rhss_major
Definition load_tile_transpose.hpp:268
static constexpr index_t LaneGroupSize
Definition load_tile_transpose.hpp:210
static constexpr auto quad_idx_offset
Definition load_tile_transpose.hpp:288
remove_cvref_t< TileDistributionEncoding_ > InDstrEncode
Definition load_tile_transpose.hpp:208
std::conditional_t< ReverseDirection, typename Policy::template QuadInputEncoding< LaneGroupSize >, typename Policy::template QuadOutputEncoding< LaneGroupSize > > QuadOutputEncoding
Definition load_tile_transpose.hpp:219
static constexpr auto dst_ys_to_rhs_minor
Definition load_tile_transpose.hpp:323
static constexpr auto dst_out_hs_lengthss
Definition load_tile_transpose.hpp:258
static constexpr auto quad_output_ys_to_rhs_minor
Definition load_tile_transpose.hpp:238
static constexpr auto quad_input_ps_to_rhss_minor0
Definition load_tile_transpose.hpp:234
static constexpr auto quad_input_hs_lengthss
Definition load_tile_transpose.hpp:224
static constexpr auto swap_one_and_two
Definition load_tile_transpose.hpp:243
static constexpr auto outer_hs_lengthss
Definition load_tile_transpose.hpp:250
static constexpr auto reversed_outer_hs_lengthss
Definition load_tile_transpose.hpp:257
static constexpr auto dst_ps_to_rhss_minor
Definition load_tile_transpose.hpp:297
static constexpr auto input_ys_to_rhs_minor
Definition load_tile_transpose.hpp:230
static constexpr auto quad_output_ps_minor_offset
Definition load_tile_transpose.hpp:292
static constexpr auto input_ps_to_rhss_major
Definition load_tile_transpose.hpp:227
static constexpr auto dst_ys_to_rhs_major
Definition load_tile_transpose.hpp:320
tile_distribution_encoding< typename InDstrEncode::RsLengths, remove_cvref_t< decltype(dst_out_hs_lengthss)>, remove_cvref_t< decltype(dst_ps_to_rhss_major)>, remove_cvref_t< decltype(dst_ps_to_rhss_minor)>, remove_cvref_t< decltype(dst_ys_to_rhs_major)>, remove_cvref_t< decltype(dst_ys_to_rhs_minor)> > TransposedDstrEncode
Definition load_tile_transpose.hpp:326
static constexpr auto quad_output_ys_to_rhs_major
Definition load_tile_transpose.hpp:237
static constexpr auto quad_output_hs_lengthss
Definition load_tile_transpose.hpp:225
std::conditional_t< ReverseDirection, typename Policy::template QuadOutputEncoding< LaneGroupSize >, typename Policy::template QuadInputEncoding< LaneGroupSize > > QuadInputEncoding
Definition load_tile_transpose.hpp:215
static constexpr auto input_ps_to_rhss_minor
Definition load_tile_transpose.hpp:228
static constexpr auto quad_output_ys_minor_offset
Definition load_tile_transpose.hpp:294
static constexpr auto input_hs_lengthss
Definition load_tile_transpose.hpp:209
Definition tile/core/container/sequence.hpp:287
typename std::conditional< kHasContent, type0, type1 >::type type
Definition tile/core/container/sequence.hpp:302
A fixed-size array container similar to std::array with additional utilities.
Definition tile/core/container/array.hpp:43
Definition tile/core/numeric/math.hpp:98
Definition tile/core/container/sequence.hpp:352
Definition tile/core/container/sequence.hpp:49
Definition tile/core/utility/functional.hpp:43
Definition tile_distribution_encoding.hpp:26
This class provides tile (windowed) view and access to the device memory.
Definition tile_window.hpp:46
Definition tile/core/container/tuple.hpp:192
static constexpr bool value
Definition load_tile_transpose.hpp:41
Definition load_tile_transpose.hpp:28
typename arithmetic_sequence_gen< start_pos, Sequence::size(), 1 >::type extract_indices
Definition load_tile_transpose.hpp:32
static constexpr bool value
Definition load_tile_transpose.hpp:34
static constexpr bool size_check
Definition load_tile_transpose.hpp:29
static constexpr index_t start_pos
Definition load_tile_transpose.hpp:31