CUDA transposition kernel implementation header. More...

#include <cstdint>
#include <complex>

Include dependency graph for cuda_transpose_impl_cukrn.h:

This graph shows which files directly or indirectly include this file:

Classes
struct	cukrn_transpose_array
	Array carrying the permutation and dimensions into the kernels. More...

Namespaces
namespace	syten
	Syten namespace.

namespace	syten::CudaDenseTensorImpl
	Implementation namespace for CUDA dense tensors.

Functions
void	syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const double *inp, double *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj)
	Launcher for the CUDA tensor transposition kernel, double version. More...

void	syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const float *inp, float *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj)
	Launcher for the CUDA tensor transposition kernel, float version. More...

void	syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const std::complex< double > *inp, std::complex< double > *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj)
	Launcher for the CUDA tensor transposition kernel, std::complex<double> version. More...

void	syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const std::complex< float > *inp, std::complex< float > *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj)
	Launcher for the CUDA tensor transposition kernel, std::complex<float> version. More...

Variables
constexpr int	cukrn_transpose_max_rank = 10
	Maximum tensor rank supported by the CUDA transposition kernel. More...

constexpr int	cukrn_transpose_threads_per_block = 256
	Number of threads per block. More...

Detailed Description

CUDA transposition kernel implementation header.

Classes