ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch > Struct Template Reference

ThreadGroupTensorSliceTransfer_v4r2&lt; ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch &gt; Struct Template Reference#

Composable Kernel: ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch > Struct Template Reference
ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch > Struct Template Reference

Blockwise data transfer. More...

#include <thread_group_tensor_slice_transfer_v4r2.hpp>

Public Types

using Index = MultiIndex<nDim>

Public Member Functions

__device__ constexpr ThreadGroupTensorSliceTransfer_v4r2 (const SrcDescs &src_descs, const StaticallyIndexedArray< Index, nSrc > &src_block_slice_origins, const DstDescs &dst_descs, const StaticallyIndexedArray< Index, nDst > &dst_block_slice_origins, const ElementwiseOperation &element_op)
template<typename SrcBuffers, index_t ThreadScratchId = 0>
__device__ void RunRead (const SrcDescs &src_descs, const SrcBuffers &src_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
template<typename DstBuffers, index_t ThreadScratchId = 0>
__device__ void RunWrite (const DstDescs &dst_descs, DstBuffers &dst_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
template<typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId>
__device__ void Run (const SrcDescs &src_descs, const SrcBuffer &src_bufs, const DstDescs &dst_descs, DstBuffer &dst_bufs, Number< ThreadScratchId > thread_scratch_id)
__device__ void MoveSrcSliceWindow (const SrcDescs &src_descs, const Index &step)
__device__ void MoveDstSliceWindow (const DstDescs &dst_descs, const Index &step)

Static Public Attributes

static constexpr index_t nDim
static constexpr index_t nSrc = SrcDescs::Size()
static constexpr index_t nDst = DstDescs::Size()
static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}

Detailed Description

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
struct ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >

Blockwise data transfer.

This version does following things to avoid scratch memory issue

  1. Use StaticallyIndexedArray instead of C array for thread buffer
  2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
  3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate

Member Typedef Documentation

◆ Index

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
using ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::Index = MultiIndex<nDim>

Constructor & Destructor Documentation

◆ ThreadGroupTensorSliceTransfer_v4r2()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
__device__ constexpr ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::ThreadGroupTensorSliceTransfer_v4r2 ( const SrcDescs & src_descs,
const StaticallyIndexedArray< Index, nSrc > & src_block_slice_origins,
const DstDescs & dst_descs,
const StaticallyIndexedArray< Index, nDst > & dst_block_slice_origins,
const ElementwiseOperation & element_op )
inlineconstexpr

Member Function Documentation

◆ MoveDstSliceWindow()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
__device__ void ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::MoveDstSliceWindow ( const DstDescs & dst_descs,
const Index & step )
inline

◆ MoveSrcSliceWindow()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
__device__ void ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::MoveSrcSliceWindow ( const SrcDescs & src_descs,
const Index & step )
inline

◆ Run()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
template<typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId>
__device__ void ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::Run ( const SrcDescs & src_descs,
const SrcBuffer & src_bufs,
const DstDescs & dst_descs,
DstBuffer & dst_bufs,
Number< ThreadScratchId > thread_scratch_id )
inline

◆ RunRead()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
template<typename SrcBuffers, index_t ThreadScratchId = 0>
__device__ void ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::RunRead ( const SrcDescs & src_descs,
const SrcBuffers & src_bufs,
Number< ThreadScratchId > thread_scratch_id = Number<ThreadScratchId>{} )
inline

◆ RunWrite()

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
template<typename DstBuffers, index_t ThreadScratchId = 0>
__device__ void ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::RunWrite ( const DstDescs & dst_descs,
DstBuffers & dst_bufs,
Number< ThreadScratchId > thread_scratch_id = Number<ThreadScratchId>{} )
inline

Member Data Documentation

◆ nDim

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
index_t ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::nDim
staticconstexpr
Initial value:
=
typename remove_reference< T >::type remove_reference_t
Definition type.hpp:292

◆ nDst

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
index_t ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::nDst = DstDescs::Size()
staticconstexpr

◆ nSrc

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
index_t ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::nSrc = SrcDescs::Size()
staticconstexpr

◆ thread_slice_lengths

template<typename ThreadGroup, typename ElementwiseOperation, typename DstInMemOps, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcDatas, typename DstDatas, typename SrcDescs, typename DstDescs, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, typename SrcsScalarPerVector, typename DstsScalarPerVector, typename SrcsScalarStrideInVector, typename DstsScalarStrideInVector, typename ThreadTransferSrcsResetCoordinateAfterRun, typename ThreadTransferDstsResetCoordinateAfterRun, index_t NumThreadScratch = 1>
auto ck::ThreadGroupTensorSliceTransfer_v4r2< ThreadGroup, ElementwiseOperation, DstInMemOps, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcDatas, DstDatas, SrcDescs, DstDescs, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcsScalarPerVector, DstsScalarPerVector, SrcsScalarStrideInVector, DstsScalarStrideInVector, ThreadTransferSrcsResetCoordinateAfterRun, ThreadTransferDstsResetCoordinateAfterRun, NumThreadScratch >::thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}
staticconstexpr

The documentation for this struct was generated from the following file: