void syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl(
    std::uint32_t rank,
    std::size_t sz,
    const std::complex<float> * inp,
    std::complex<float> * out,
    cukrn_transpose_array const & old_dim,
    cukrn_transpose_array const & new_dim,
    cukrn_transpose_array const & ar_perm,
    void * str,
    bool do_conj
)
Launcher for the CUDA tensor transposition kernel, std::complex<float> version.
rank | number of tensor indices |
sz | number of elements in the tensor |
inp | input tensor array, pointer to device memory |
out | output tensor array, pointer to device memory |
old_dim | dimensions of the input tensor as a simple POD |
new_dim | dimensions of the output tensor as a simple POD |
ar_perm | permutation, ar_perm[i] = j puts the old leg j at position i+1 |
str | CUDA stream in which the computation will take place |
do_conj | whether to conjugate every entry |