ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 > Struct Reference

ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 > Struct Reference

Composable Kernel: ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 > Struct Reference
ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 > Struct Reference

#include <xdlops_gemm.hpp>

Public Member Functions

template<index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run (const FloatA &a, const FloatB &b, FloatC &reg_c) const

Static Public Attributes

static constexpr index_t wave_size = 64
static constexpr index_t m_per_blk = 16
static constexpr index_t n_per_blk = 16
static constexpr index_t num_threads_per_blk = n_per_blk
static constexpr index_t num_regs_per_blk = m_per_blk * n_per_blk / wave_size
static constexpr index_t num_input_blks = m_per_blk / num_regs_per_blk
static constexpr index_t group_size = 4
static constexpr index_t num_groups_per_blk = 1
static constexpr index_t num_output_blks = 1
static constexpr index_t k_per_blk = 2
static constexpr bool is_k_reduction = true

Detailed Description

The following invariants hold: num_threads_per_blk == n_per_blk; num_regs_per_blk * num_input_blks == m_per_blk; num_regs_per_blk * wave_size == m_per_blk * n_per_blk.

group_size * num_groups_per_blk == num_regs_per_blk

num_regs_per_blk is the output (CD) register size, which is determined by the instruction. k_per_blk (K1PerXdlops) is the input (AB) register size, which is also determined by the instruction. group_size corresponds to the mapping of CD rows; see GetBeginOfThreadBlk().

is_k_reduction = (k_per_blk == KPerXdlops) ? false : true, i.e. is_k_reduction is true exactly when k_per_blk differs from KPerXdlops.

if (is_k_reduction) { num_output_blks == 1; } else { num_input_blks == num_output_blks; }

Member Function Documentation

◆ run()

template<index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::run ( const FloatA & a,
const FloatB & b,
FloatC & reg_c ) const
inline

Member Data Documentation

◆ group_size

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::group_size = 4
static constexpr

◆ is_k_reduction

bool ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::is_k_reduction = true
static constexpr

◆ k_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::k_per_blk = 2
static constexpr

◆ m_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::m_per_blk = 16
static constexpr

◆ n_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::n_per_blk = 16
static constexpr

◆ num_groups_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::num_groups_per_blk = 1
static constexpr

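◆ num_input_blks

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::num_input_blks = m_per_blk / num_regs_per_blk
static constexpr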

◆ num_output_blks

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::num_output_blks = 1
static constexpr

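◆ num_regs_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::num_regs_per_blk = m_per_blk * n_per_blk / wave_size
static constexpr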

◆ num_threads_per_blk

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::num_threads_per_blk = n_per_blk
static constexpr

◆ wave_size

index_t ck::mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >::wave_size = 64
static constexpr

The documentation for this struct was generated from the following file: