```cpp
void syten::CudaDenseTensorImpl::cuda_transpose_kernel_impl(
    std::uint32_t rank,
    std::size_t sz,
    const std::complex<float>* inp,
    std::complex<float>* out,
    cukrn_transpose_array const& old_dim,
    cukrn_transpose_array const& new_dim,
    cukrn_transpose_array const& ar_perm,
    void* str,
    bool do_conj
)
```
Launcher for the CUDA tensor transposition kernel, std::complex<float> version.
| rank | number of tensor indices |
| sz | number of elements in the tensor |
| inp | input tensor array, pointer to device memory |
| out | output tensor array, pointer to device memory |
| old_dim | dimensions of the input tensor as a simple POD |
| new_dim | dimensions of the output tensor as a simple POD |
| ar_perm | permutation: ar_perm[i] = j puts the old leg j at position i+1 |
| str | CUDA stream in which the computation will take place |
| do_conj | whether to conjugate every entry |