20 typename ComputeDataType,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
43 typename ComputeDataType,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
71 ABlockTransferSrcScalarPerVector,
72 BBlockTransferSrcScalarPerVector,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
173 constexpr auto num_ds_read_inst_a =
177 constexpr auto num_ds_read_inst_b =
183 constexpr auto num_dswrite_per_issue_a =
185 constexpr auto num_dsread_per_issue_a = num_ds_read_inst_a / num_issue_a;
188 constexpr auto num_dswrite_per_issue_b =
190 constexpr auto num_dsread_per_issue_b = num_ds_read_inst_b / num_issue_b;
192 constexpr auto num_mfma_per_issue =
199 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
200 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
205 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
206 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
209 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
210 __builtin_amdgcn_sched_group_barrier(0x008,
211 num_mfma_per_issue - num_dsread_per_issue_a -
212 num_dswrite_per_issue_a,
220 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
221 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
226 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
227 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
230 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
231 __builtin_amdgcn_sched_group_barrier(0x008,
232 num_mfma_per_issue - num_dsread_per_issue_a -
233 num_dswrite_per_issue_b,
236 __builtin_amdgcn_sched_barrier(0);
239 template <
bool HasMainLoop,
243 typename ABlockTransfer,
244 typename AGridBuffer,
245 typename ABlockBuffer,
246 typename ABlockTransferStep,
249 typename BBlockTransfer,
250 typename BGridBuffer,
251 typename BBlockBuffer,
252 typename BBlockTransferStep,
253 typename CThreadBuffer>
254 __device__
void Run(
const AGridDesc& a_grid_desc,
255 const ABlockDesc& a_block_desc,
256 ABlockTransfer& a_blockwise_copy,
257 const AGridBuffer& a_grid_buf,
258 ABlockBuffer& a_block_buf,
259 const ABlockTransferStep& a_block_copy_step,
260 const BGridDesc& b_grid_desc,
261 const BBlockDesc& b_block_desc,
262 BBlockTransfer& b_blockwise_copy,
263 const BGridBuffer& b_grid_buf,
264 BBlockBuffer& b_block_buf,
265 const BBlockTransferStep& b_block_copy_step,
266 CThreadBuffer& c_thread_buf,
278 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
279 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
281 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
282 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
285 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(
I0));
286 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(
I0));
310 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
311 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
313 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
314 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
317 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(
I1));
318 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(
I1));
321 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
322 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
324 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
325 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
328 c_thread_buf.Clear();
331 if constexpr(HasMainLoop)
337 auto LoopFunc = [&](
auto lds_read_buf,
338 auto lds_read_reg_buf,
347 a_block_buf.At(lds_read_buf),
350 a_thread_bufs(lds_read_reg_buf));
355 b_block_buf.At(lds_read_buf),
358 b_thread_bufs(lds_read_reg_buf));
362 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
363 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
365 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
366 b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
368 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
369 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
378 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
379 a_thread_bufs[mfma_reg_buf]
382 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
383 b_thread_bufs[mfma_reg_buf]
388 using mfma_input_type =
396 a_thread_vec.template AsType<mfma_input_type>(),
397 b_thread_vec.template AsType<mfma_input_type>(),
413 auto ReadWriteCompFunc = [&](
auto lds_read_buf,
414 auto lds_read_reg_buf,
423 a_block_buf.At(lds_read_buf),
426 a_thread_bufs(lds_read_reg_buf));
431 b_block_buf.At(lds_read_buf),
434 b_thread_bufs(lds_read_reg_buf));
438 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
439 b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));
448 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
451 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
456 using mfma_input_type =
462 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
463 b_thread_vec.template AsType<mfma_input_type>(),
472 auto ReadCompFunc = [&](
auto lds_read_buf,
auto lds_read_reg_buf,
auto mfma_reg_buf) {
479 a_block_buf.At(lds_read_buf),
482 a_thread_bufs(lds_read_reg_buf));
487 b_block_buf.At(lds_read_buf),
490 b_thread_bufs(lds_read_reg_buf));
501 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
504 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
509 using mfma_input_type =
515 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
516 b_thread_vec.template AsType<mfma_input_type>(),
525 auto CompFunc = [&](
auto mfma_reg_buf) {
533 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
536 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
541 using mfma_input_type =
547 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
548 b_thread_vec.template AsType<mfma_input_type>(),
569 using Base::a_thread_copy_;
570 using Base::a_thread_desc_;
571 using Base::b_thread_copy_;
572 using Base::b_thread_desc_;
573 using Base::c_thread_desc_;
587 typename ComputeDataType,
588 typename AccDataType,
591 typename AMmaTileDesc,
592 typename BMmaTileDesc,
593 index_t ABlockTransferSrcScalarPerVector,
594 index_t BBlockTransferSrcScalarPerVector,
610 typename ComputeDataType,
611 typename AccDataType,
614 typename AMmaTileDesc,
615 typename BMmaTileDesc,
616 index_t ABlockTransferSrcScalarPerVector,
617 index_t BBlockTransferSrcScalarPerVector,
638 ABlockTransferSrcScalarPerVector,
639 BBlockTransferSrcScalarPerVector,
657 ABlockTransferSrcScalarPerVector,
658 BBlockTransferSrcScalarPerVector,
678 ABlockTransferSrcScalarPerVector,
679 BBlockTransferSrcScalarPerVector,
740 constexpr auto num_ds_read_inst_a =
744 constexpr auto num_ds_read_inst_b =
750 constexpr auto num_dswrite_per_issue_a = 0;
751 constexpr auto num_dsread_per_issue_a = num_ds_read_inst_a / num_issue_a;
754 constexpr auto num_dswrite_per_issue_b = 0;
755 constexpr auto num_dsread_per_issue_b = num_ds_read_inst_b / num_issue_b;
757 constexpr auto num_mfma_per_issue =
764 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
765 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
770 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
771 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
774 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
775 __builtin_amdgcn_sched_group_barrier(0x008,
776 num_mfma_per_issue - num_dsread_per_issue_a -
777 num_dswrite_per_issue_a,
785 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);
786 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
791 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
792 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
795 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
796 __builtin_amdgcn_sched_group_barrier(0x008,
797 num_mfma_per_issue - num_dsread_per_issue_a -
798 num_dswrite_per_issue_b,
801 __builtin_amdgcn_sched_barrier(0);
804 template <
bool HasMainLoop,
808 typename ABlockTransfer,
809 typename AGridBuffer,
810 typename ABlockBuffer,
811 typename ABlockTransferStep,
814 typename BBlockTransfer,
815 typename BGridBuffer,
816 typename BBlockBuffer,
817 typename BBlockTransferStep,
818 typename CThreadBuffer>
819 __device__
void Run(
const AGridDesc& a_grid_desc,
820 const ABlockDesc& a_block_desc,
821 ABlockTransfer& a_blockwise_copy,
822 const AGridBuffer& a_grid_buf,
823 ABlockBuffer& a_block_buf,
824 const ABlockTransferStep& a_block_copy_step,
825 const BGridDesc& b_grid_desc,
826 const BBlockDesc& b_block_desc,
827 BBlockTransfer& b_blockwise_copy,
828 const BGridBuffer& b_grid_buf,
829 BBlockBuffer& b_block_buf,
830 const BBlockTransferStep& b_block_copy_step,
831 CThreadBuffer& c_thread_buf,
843 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf.At(
I0));
844 b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf.At(
I0));
846 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
847 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
872 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf.At(
I1));
873 b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf.At(
I1));
875 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
876 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
879 c_thread_buf.Clear();
882 if constexpr(HasMainLoop)
888 auto LoopFunc = [&](
auto lds_read_buf,
889 auto lds_read_reg_buf,
898 a_block_buf.At(lds_read_buf),
901 a_thread_bufs(lds_read_reg_buf));
906 b_block_buf.At(lds_read_buf),
909 b_thread_bufs(lds_read_reg_buf));
913 a_blockwise_copy.Run(
914 a_grid_desc, a_grid_buf, a_block_desc, a_block_buf.At(lds_write_buf));
915 b_blockwise_copy.Run(
916 b_grid_desc, b_grid_buf, b_block_desc, b_block_buf.At(lds_write_buf));
918 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
919 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
928 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
929 a_thread_bufs[mfma_reg_buf]
932 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
933 b_thread_bufs[mfma_reg_buf]
938 using mfma_input_type =
946 a_thread_vec.template AsType<mfma_input_type>(),
947 b_thread_vec.template AsType<mfma_input_type>(),
963 auto ReadWriteCompFunc = [&](
auto lds_read_buf,
964 auto lds_read_reg_buf,
973 a_block_buf.At(lds_read_buf),
976 a_thread_bufs(lds_read_reg_buf));
981 b_block_buf.At(lds_read_buf),
984 b_thread_bufs(lds_read_reg_buf));
988 a_blockwise_copy.Run(
989 a_grid_desc, a_grid_buf, a_block_desc, a_block_buf.At(lds_write_buf));
990 b_blockwise_copy.Run(
991 b_grid_desc, b_grid_buf, b_block_desc, b_block_buf.At(lds_write_buf));
1000 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1003 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1008 using mfma_input_type =
1014 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
1015 b_thread_vec.template AsType<mfma_input_type>(),
1024 auto ReadCompFunc = [&](
auto lds_read_buf,
auto lds_read_reg_buf,
auto mfma_reg_buf) {
1031 a_block_buf.At(lds_read_buf),
1034 a_thread_bufs(lds_read_reg_buf));
1039 b_block_buf.At(lds_read_buf),
1042 b_thread_bufs(lds_read_reg_buf));
1053 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1056 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1061 using mfma_input_type =
1067 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
1068 b_thread_vec.template AsType<mfma_input_type>(),
1077 auto CompFunc = [&](
auto mfma_reg_buf) {
1085 a_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1088 b_thread_vec.template AsType<ComputeDataTypeBuf>()(ik) =
1093 using mfma_input_type =
1099 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
1100 b_thread_vec.template AsType<mfma_input_type>(),
1110 ReadCompFunc(
I0,
I0,
I1);
1115 ReadCompFunc(
I1,
I1,
I0);
1121 using Base::a_thread_copy_;
1122 using Base::a_thread_desc_;
1123 using Base::b_thread_copy_;
1124 using Base::b_thread_desc_;
1125 using Base::c_thread_desc_;
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
__device__ void block_sync_lds_direct_load()
Definition synchronization.hpp:43
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
conditional_t< std::is_same< ComputeDataType, ck::tf32_t >::value, float, ComputeDataType > ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_base.hpp:57
static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > HotLoopInstList
Definition blockwise_gemm_pipeline_xdlops_base.hpp:82
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Write_Inst_Num static constexpr index_t B_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:46
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Read_Inst_Num static constexpr index_t B_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Write_Inst_Num static constexpr index_t A_LDS_Write_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:44
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_LDS_Read_Width static constexpr index_t B_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:83
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:157
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:152
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:147
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:102
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:148
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:402
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeDataTypeBuf typename Base::ComputeDataTypeBuf ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:145
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopUnroll static constexpr index_t HotloopUnroll
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:150
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:366
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:149
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:372
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr void HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:169
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:401
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:254
Definition blockwise_gemm_pipeline_xdlops.hpp:103
static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops.hpp:373
ck::BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockwiseGemmXdlops_pipeline_v4 __host__ __device__ BlockwiseGemmXdlops_pipeline_v4(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_pipeline_xdlops.hpp:235
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:724
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:719
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:715
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr void HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:736
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:716
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:360
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:669
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotloopUnroll static constexpr index_t HotloopUnroll
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:717
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:402
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:714
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeDataTypeBuf typename Base::ComputeDataTypeBuf ComputeDataTypeBuf
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:712
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:366
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:372
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:819
ck::BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:401
Definition blockwise_gemm_pipeline_xdlops_v4.hpp:604
Definition functional2.hpp:33
Definition dtype_vector.hpp:10