Implementation namespace for CUDA dense tensors. More...
Classes | |
struct | CudaDenseTensor |
A dense tensor using CUDA for storage and computing if SYTEN_USE_CUDA is defined. More... | |
Functions | |
template<Rank rank, typename Scalar > | |
void | addScaled (CudaDenseTensor< rank, Scalar > &a, CudaDenseTensor< rank, Scalar > const &b, typename IdentityType< Scalar >::type const factor, EliminateZeros const ezeros=EliminateZeros::No) |
Adds the entries of the CUDA dense tensor b scaled by factor to those of a. More... | |
template<Rank rank, typename Scalar > | |
void | addScaled (CudaDenseTensor< rank, Scalar > &a, DenseTensor< rank, Scalar > const &b, typename IdentityType< Scalar >::type const factor, EliminateZeros const ezeros) |
Adds the entries of the host-based dense tensor b scaled by factor to the CUDA dense tensor a. More... | |
template<Rank rank, typename Scalar > | |
void | addScaled (CudaDenseTensor< rank, Scalar > &a, IdentityDenseTensor< rank, Scalar > const &b, typename IdentityType< Scalar >::type const factor, EliminateZeros const ezeros) |
Adds the entries of the identity tensor b scaled by factor to the CUDA dense tensor a. More... | |
template<Rank rank, typename Scalar > | |
void | addScaled (CudaDenseTensor< rank, Scalar > &a, OffsetDenseTensor< rank, Scalar > const &b, typename IdentityType< Scalar >::type const factor, EliminateZeros const ezeros) |
Adds the entries of the offset dense tensor b scaled by factor to the CUDA dense tensor a. More... | |
template<Rank rank, typename Scalar > | |
auto | conj (CudaDenseTensor< rank, Scalar > const &in) |
Returns a complex-conjugated copy of the input CUDA dense tensor on the same device. More... | |
void | cuda_iszero_kernel_impl (std::size_t sz, const double *inp, int *is_nonzero, void *str) |
Launcher for the CUDA isZero kernel, double version. More... | |
void | cuda_iszero_kernel_impl (std::size_t sz, const float *inp, int *is_nonzero, void *str) |
Launcher for the CUDA isZero kernel, float version. More... | |
void | cuda_iszero_kernel_impl (std::size_t sz, const std::complex< double > *inp, int *is_nonzero, void *str) |
Launcher for the CUDA isZero kernel, std::complex<double> version. More... | |
void | cuda_iszero_kernel_impl (std::size_t sz, const std::complex< float > *inp, int *is_nonzero, void *str) |
Launcher for the CUDA isZero kernel, std::complex<float> version. More... | |
void | cuda_mm_cm (CudaPtr< const double > at, CudaPtr< const double > bt, CudaPtr< double > rt, Size const dx, Size const dy, Size const dz) |
CUDA real double matrix-matrix multiplication. More... | |
void | cuda_mm_cm (CudaPtr< const float > at, CudaPtr< const float > bt, CudaPtr< float > rt, Size const dx, Size const dy, Size const dz) |
CUDA real float matrix-matrix multiplication. More... | |
void | cuda_mm_cm (CudaPtr< const std::complex< double > > at, CudaPtr< const std::complex< double > > bt, CudaPtr< std::complex< double > > rt, Size const dx, Size const dy, Size const dz) |
CUDA complex double matrix-matrix multiplication. More... | |
void | cuda_mm_cm (CudaPtr< const std::complex< float > > at, CudaPtr< const std::complex< float > > bt, CudaPtr< std::complex< float > > rt, Size const dx, Size const dy, Size const dz) |
CUDA complex float matrix-matrix multiplication. More... | |
template<Rank rank, typename Scalar , TransposeMethod method = TransposeMethod::Default, Rank... Ranks> | |
void | cuda_transpose_impl (CudaConstSpan< Scalar > inp, CudaMutSpan< Scalar > out, ConstSpan< Index > in_perm, ConstSpan< Index > in_dim, Conj do_conj=Conj::n(), bool do_checks=true, std::index_sequence< Ranks... >={}) |
Implementation of the CUDA tensor transposition routines, to be called only from cuda_transpose(). More... | |
template<Rank rank, typename Scalar > | |
void | cuda_transpose_kernel (CudaConstSpan< Scalar > inp, CudaMutSpan< Scalar > out, ConstSpan< IndexNumber > perm, ConstSpan< Index > dim, Conj do_conj) |
Wrapper around the CUDA transpose kernels which sets everything up such that the functions in cuda_transpose_impl_cukrn.h really only have to launch the kernels. More... | |
void | cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const double *inp, double *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj) |
Launcher for the CUDA tensor transposition kernel, double version. More... | |
void | cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const float *inp, float *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj) |
Launcher for the CUDA tensor transposition kernel, float version. More... | |
void | cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const std::complex< double > *inp, std::complex< double > *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj) |
Launcher for the CUDA tensor transposition kernel, std::complex<double> version. More... | |
void | cuda_transpose_kernel_impl (std::uint32_t rank, std::size_t sz, const std::complex< float > *inp, std::complex< float > *out, cukrn_transpose_array const &old_dim, cukrn_transpose_array const &new_dim, cukrn_transpose_array const &ar_perm, void *str, bool do_conj) |
Launcher for the CUDA tensor transposition kernel, std::complex<float> version. More... | |
void | cuda_transpose_r2 (CudaConstSpan< double > inp, CudaMutSpan< double > out, Index inp_rows, Index inp_cols, Conj const conj=Conj::n()) |
Transposition of a rank-2 CUDA matrix, stored in row-major format with inp_rows rows and inp_cols columns, from range inp into range out. More... | |
void | cuda_transpose_r2 (CudaConstSpan< float > inp, CudaMutSpan< float > out, Index inp_rows, Index inp_cols, Conj const conj=Conj::n()) |
Transposition of a rank-2 CUDA matrix, stored in row-major format with inp_rows rows and inp_cols columns, from range inp into range out. More... | |
void | cuda_transpose_r2 (CudaConstSpan< std::complex< double > > inp, CudaMutSpan< std::complex< double > > out, Index inp_rows, Index inp_cols, Conj const conj=Conj::n()) |
Transposition of a rank-2 CUDA matrix, stored in row-major format with inp_rows rows and inp_cols columns, from range inp into range out. More... | |
void | cuda_transpose_r2 (CudaConstSpan< std::complex< float > > inp, CudaMutSpan< std::complex< float > > out, Index inp_rows, Index inp_cols, Conj const conj=Conj::n()) |
Transposition of a rank-2 CUDA matrix, stored in row-major format with inp_rows rows and inp_cols columns, from range inp into range out. More... | |
template<Rank rank, typename Scalar > | |
void | cuda_transpose_recursive (CudaConstSpan< Scalar > inp, CudaMutSpan< Scalar > out, ConstSpan< IndexNumber > in_perm, ConstSpan< Index > in_dim, Conj do_conj=Conj::n()) |
Entry point for the CUDA recursive tensor transposition implementation, to be called only from cuda_transpose_impl(). More... | |
template<Rank rank, typename Scalar > | |
void | cuda_transpose_recursive_impl (CudaConstSpan< Scalar > inp, CudaMutSpan< Scalar > out, CudaMutSpan< Scalar > workspace, ConstSpan< IndexNumber > perm, ConstSpan< Index > dim) |
Recursive implementation of the CUDA tensor transposition. More... | |
template<Rank summed, Rank frank, Rank srank, Rank rrank, typename Scalar > | |
void | gemm_transpose (CudaDenseTensor< frank, Scalar > const &a, CudaDenseTensor< srank, Scalar > const &b, CudaDenseTensor< rrank, Scalar > &r, std::array< int, frank > const &c_a, std::array< int, srank > const &c_b) |
Last part of a transpose-transpose-gemm-transpose CUDA tensor contraction. More... | |
template<Rank rank, typename Scalar > | |
bool | isZero (CudaDenseTensor< rank, Scalar > const &c) |
Returns true if the supplied CudaDenseTensor only has zero entries. More... | |
template<Rank rank, typename Scalar > | |
ScalarBase< Scalar >::type | normSqd (CudaDenseTensor< rank, Scalar > const &a) |
Returns the squared norm of the input tensor. More... | |
template<Rank rank, typename Scalar , typename ScalarF > | |
CudaDenseTensor< rank, Scalar > & | operator*= (CudaDenseTensor< rank, Scalar > &a, ScalarF const &b) |
Scales every element of the CUDA dense tensor a by the scalar factor b. More... | |
template<Rank summed, Rank frank, Rank srank, typename Scalar , std::enable_if_t<(int(frank)+int(srank) - 2 *int(summed) > 0), int > = 0> | |
CudaDenseTensor< frank+srank - 2 *summed, Scalar > | prodD (CudaDenseTensor< frank, Scalar > const &a, CudaDenseTensor< srank, Scalar > const &b, std::array< int, frank > const &c_a, std::array< int, srank > const &c_b, bool conjugate=false, EliminateZeros const ezeros=EliminateZeros::No, DenseProduct::TemporaryTransposeStorage< Scalar, frank, srank > *=nullptr) |
Product/Contraction of two dense CUDA tensors. More... | |
template<Rank summed, Rank frank, Rank srank, typename Scalar , std::enable_if_t<(int(frank)+int(srank) - 2 *int(summed) > 0), int > = 0> | |
GenericDenseTensor< frank+srank - 2 *summed, Scalar > | prodD (CudaDenseTensor< frank, Scalar > const &a, DenseTensor< srank, Scalar > const &b, std::array< int, frank > const &c_a, std::array< int, srank > const &c_b, bool conjugate=false, EliminateZeros const ezeros=EliminateZeros::No, DenseProduct::TemporaryTransposeStorage< Scalar, frank, srank > *=nullptr) |
Product/Contraction of a CUDA dense tensor and a standard dense tensor. More... | |
template<Rank r, typename Scalar > | |
Scalar | prodD (CudaDenseTensor< r, Scalar > const &a, CudaDenseTensor< r, Scalar > const &b, std::array< int, r > const &c_a, std::array< int, r > const &c_b, bool conjugate=false) |
Reordering scalar product of two CUDA dense tensors. More... | |
template<Rank r, typename Scalar > | |
Scalar | prodD (CudaDenseTensor< r, Scalar > const &a, DenseTensor< r, Scalar > const &b, std::array< int, r > const &c_a, std::array< int, r > const &c_b, bool conjugate=false) |
Reordering scalar product of a CUDA dense tensor and a standard dense tensor. More... | |
template<Rank summed, Rank frank, Rank srank, typename Scalar , std::enable_if_t<(int(frank)+int(srank) - 2 *int(summed) > 0), int > = 0> | |
GenericDenseTensor< frank+srank - 2 *summed, Scalar > | prodD (DenseTensor< frank, Scalar > const &a, CudaDenseTensor< srank, Scalar > const &b, std::array< int, frank > const &c_a, std::array< int, srank > const &c_b, bool conjugate=false, EliminateZeros const ezeros=EliminateZeros::No, DenseProduct::TemporaryTransposeStorage< Scalar, frank, srank > *=nullptr) |
Product/Contraction of a standard dense tensor and a CUDA dense tensor. More... | |
template<Rank r, typename Scalar > | |
Scalar | prodD (DenseTensor< r, Scalar > const &a, CudaDenseTensor< r, Scalar > const &b, std::array< int, r > const &c_a, std::array< int, r > const &c_b, bool conjugate=false) |
Reordering scalar product of a standard dense tensor and a CUDA dense tensor. More... | |
Implementation namespace for CUDA dense tensors.