void syten::CudaDenseTensorImpl::cuda_transpose_kernel(
    CudaConstSpan<Scalar> inp,
    CudaMutSpan<Scalar> out,
    ConstSpan<IndexNumber> perm,
    ConstSpan<Index> dim,
    Conj do_conj
)
Wrapper around the CUDA transpose kernels which sets everything up such that the functions in cuda_transpose_impl_cukrn.h
really only have to launch the kernels.
References syten::Cuda::CudaMutSpan< T >::begin(), cuda_transpose_kernel_impl(), cukrn_transpose_max_rank, syten::Cuda::CudaPtr< T >::dev(), syten::RepRegister::dim(), syten::Cuda::CudaPtr< T >::get(), syten::pi, syten::rank(), syten::Cuda::select_device(), syten::Cuda::CudaMutSpan< T >::size(), syten::SpanImpl::ConstSpan< T >::size(), syten::Cuda::CudaStream::str(), syten::Cuda::CudaStream::sync_destroy(), SYTEN_ASSERT, and cukrn_transpose_array::values.