cpp/develop/GinkgoRadialBasisFctSolver_8hpp_source.html

#pragma once


#ifndef PRECICE_NO_GINKGO


#include <array>

#include <cmath>

#include <functional>

#include <numeric>

#include "mapping/GinkgoDefinitions.hpp"

#include "mapping/config/MappingConfiguration.hpp"

#include "mapping/device/Device.hpp"

#include "mapping/device/GinkgoRBFKernels.hpp"

#include "mapping/impl/BasisFunctions.hpp"

#include "mesh/Mesh.hpp"

#include "precice/impl/Types.hpp"

#include "profiling/Event.hpp"

#ifdef PRECICE_WITH_HIP

#include "mapping/device/HipQRSolver.hip.hpp"

#endif

#ifdef PRECICE_WITH_CUDA

#include "mapping/device/CudaQRSolver.cuh"

#endif

#ifdef PRECICE_WITH_OPENMP

#include <omp.h>

#endif


using precice::mapping::RadialBasisParameters;


namespace precice {

namespace mapping {


/// Linear solver variants the Ginkgo-based RBF solver can be configured with.
enum class GinkgoSolverType {
  CG,    ///< iterative `cg` solver (configuration name "cg-solver")
  GMRES, ///< iterative `gmres` solver (configuration name "gmres-solver")
  QR     ///< QR decomposition of the system matrix (configuration name "qr-solver")
};


/// Preconditioner variants that can be attached to the iterative solvers.
enum class GinkgoPreconditionerType {
  Jacobi,   ///< `jacobi` preconditioner (configuration name "jacobi-preconditioner")
  Cholesky, ///< `cholesky` preconditioner (configuration name "cholesky-preconditioner")
  None      ///< solve without a preconditioner (configuration name "no-preconditioner")
};


// Runtime lookups as suggested by Ginkgo


/// Translates the solver name stored in the Ginkgo mapping parameters into a GinkgoSolverType.
/// Declared `inline` so that every translation unit including this header refers to the same
/// object instead of constructing its own internal-linkage copy.
inline const std::map<std::string, GinkgoSolverType> solverTypeLookup{
    {"cg-solver", GinkgoSolverType::CG},
    {"gmres-solver", GinkgoSolverType::GMRES},
    {"qr-solver", GinkgoSolverType::QR}};


/// Translates the preconditioner name stored in the Ginkgo mapping parameters into a
/// GinkgoPreconditionerType.
/// Declared `inline` so that every translation unit including this header refers to the same
/// object instead of constructing its own internal-linkage copy.
inline const std::map<std::string, GinkgoPreconditionerType> preconditionerTypeLookup{
    {"jacobi-preconditioner", GinkgoPreconditionerType::Jacobi},
    {"cholesky-preconditioner", GinkgoPreconditionerType::Cholesky},
    {"no-preconditioner", GinkgoPreconditionerType::None}};


/**
 * Radial-basis-function solver that assembles and solves the RBF system with Ginkgo on a
 * configurable executor (host or accelerator).
 *
 * @tparam RADIAL_BASIS_FUNCTION_T type of the radial basis function used to fill the matrices
 */
template <typename RADIAL_BASIS_FUNCTION_T>
class GinkgoRadialBasisFctSolver {
public:
  using BASIS_FUNCTION_T = RADIAL_BASIS_FUNCTION_T;

  /// Assembles the system matrices and computes the decomposition of the interpolation matrix.
  template <typename IndexContainer>
  GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs,
                             const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector<bool> deadAxis, Polynomial polynomial,
                             MappingConfiguration::GinkgoParameter ginkgoParameter);

  /// Maps the given input data consistently; every column of inputData is solved separately.
  Eigen::MatrixXd solveConsistent(const Eigen::MatrixXd &inputData, Polynomial polynomial);

  /// Maps the given input data conservatively; every column of inputData is solved separately.
  Eigen::MatrixXd solveConservative(const Eigen::MatrixXd &inputData, Polynomial polynomial);

  /// Calls clear() on the stored Ginkgo matrices and vectors.
  void clear();

  /// Number of columns of the evaluation matrix (input size plus integrated polynomial parameters).
  Eigen::Index getInputSize() const;

  /// Number of rows of the evaluation matrix.
  Eigen::Index getOutputSize() const;

  /// Returns the host-side reference executor.
  std::shared_ptr<gko::Executor> getReferenceExecutor() const;

private:
  mutable precice::logging::Logger _log{"mapping::GinkgoRadialBasisFctSolver"};

  /// Executor on which the matrices, vectors and solvers are created
  std::shared_ptr<gko::Executor> _deviceExecutor;

  /// Reference executor used for host-side staging of data
  std::shared_ptr<gko::Executor> _hostExecutor = gko::ReferenceExecutor::create();

  /// Stores the RBF interpolation matrix
  std::shared_ptr<GinkgoMatrix> _rbfSystemMatrix;

  /// Evaluation matrix assembled from the input and the output vertices
  std::shared_ptr<GinkgoMatrix> _matrixA;

  /// Polynomial matrix of the input mesh (for separate polynomial)
  std::shared_ptr<GinkgoMatrix> _matrixQ;

  /// Transpose of _matrixQ
  std::shared_ptr<gko::LinOp> _matrixQ_T;

  /// Product Q^T * Q, system matrix of the polynomial solve in the consistent case
  std::shared_ptr<gko::LinOp> _matrixQ_TQ;

  /// Product Q * Q^T, created on first use by solveConservative
  std::shared_ptr<gko::LinOp> _matrixQQ_T;

  /// Right-hand side of the polynomial system
  std::shared_ptr<GinkgoVector> _polynomialRhs;

  /// Q * polynomial contribution, subtracted from the right-hand side before the RBF solve
  std::shared_ptr<GinkgoVector> _subPolynomialContribution;

  /// V * polynomial contribution, added to the output after evaluation
  std::shared_ptr<GinkgoVector> _addPolynomialContribution;

  /// Polynomial matrix of the output mesh (for separate polynomial)
  std::shared_ptr<GinkgoMatrix> _matrixV;

  /// Stores the calculated coefficients of the RBF interpolation
  std::shared_ptr<GinkgoVector> _rbfCoefficients;

  /// Solution of the polynomial system
  std::shared_ptr<GinkgoVector> _polynomialContribution;

  /// Transposed Q factor of the QR decomposition
  std::shared_ptr<GinkgoMatrix> _decompMatrixQ_T;

  /// Q^T * b of QR decomposition
  std::shared_ptr<GinkgoMatrix> _dQ_T_Rhs;

  /// R factor of the QR decomposition
  std::shared_ptr<GinkgoMatrix> _decompMatrixR;

  // NOTE(review): not referenced by the member functions visible in this header
  std::shared_ptr<triangular> _triangularSolver;

  // Solver used for iteratively solving linear systems of equations
  std::shared_ptr<cg>    _cgSolver    = nullptr;
  std::shared_ptr<gmres> _gmresSolver = nullptr;

  /// Solver for the separate polynomial system
  std::shared_ptr<cg> _polynomialSolver = nullptr;

  GinkgoSolverType _solverType = GinkgoSolverType::CG;

  // Default-initialized so the member never holds an indeterminate value; the constructor
  // overwrites it with the configured preconditioner.
  GinkgoPreconditionerType _preconditionerType = GinkgoPreconditionerType::None;

  // 1x1 identity matrix used for AXPY operations
  std::shared_ptr<GinkgoScalar> _scalarOne;
  std::shared_ptr<GinkgoScalar> _scalarNegativeOne;

  /// Runs the configured iterative solver (CG or GMRES) for rhs and stores the result in _rbfCoefficients.
  void _solveRBFSystem(const std::shared_ptr<GinkgoVector> &rhs) const;

  /// Stops the iterative solver after the configured maximum number of iterations
  std::shared_ptr<gko::stop::Iteration::Factory> _iterationCriterion;

  /// Stops the iterative solver once the residual is reduced by the configured factor relative to the initial residual
  std::shared_ptr<gko::stop::ResidualNorm<>::Factory> _residualCriterion;

  /// Stops the iterative solver on a tiny absolute residual (stationary or zero data)
  std::shared_ptr<gko::stop::ResidualNorm<>::Factory> _absoluteResidualCriterion;

  /// Copy of the Ginkgo configuration this solver was created with
  MappingConfiguration::GinkgoParameter _ginkgoParameter;
};


/**
 * Assembles the system matrices and computes the decomposition of the interpolation matrix.
 *
 * The constructor initializes the device, creates the executor, copies the vertex coordinates
 * of both meshes to the device, fills the RBF system matrix and the evaluation matrix and
 * finally generates the configured solver (CG, GMRES or QR).
 *
 * @param basisFunction   radial basis function handed to the matrix assembly kernels
 * @param inputMesh       mesh providing the input vertices
 * @param inputIDs        container whose size defines the number of input entries
 * @param outputMesh      mesh providing the output vertices
 * @param outputIDs       container whose size defines the number of output entries
 * @param deadAxis        flags marking coordinate axes that are ignored (at most three entries)
 * @param polynomial      polynomial treatment (integrated, separate, ...)
 * @param ginkgoParameter solver, preconditioner and executor configuration
 */
template <typename RADIAL_BASIS_FUNCTION_T>
template <typename IndexContainer>
GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs,
                                                                                const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector<bool> deadAxis, Polynomial polynomial,
                                                                                MappingConfiguration::GinkgoParameter ginkgoParameter)
    : _ginkgoParameter(ginkgoParameter)
{
  PRECICE_TRACE();
  // We have to initialize Kokkos and Ginkgo here, as the initialization call allocates memory
  // in the current setup, this will only initialize the device (and allocate memory) on the primary rank
  device::Device::initialize(_ginkgoParameter.nThreads, _ginkgoParameter.deviceId);
  PRECICE_INFO("Using Ginkgo solver {} on executor {} with max. iterations {} and residual reduction {}",
               ginkgoParameter.solver,
               ginkgoParameter.executor,
               ginkgoParameter.maxIterations,
               ginkgoParameter.residualNorm);
  _deviceExecutor = create_device_executor(ginkgoParameter.executor, ginkgoParameter.enableUnifiedMemory);
#ifdef PRECICE_WITH_OPENMP
  if (_ginkgoParameter.nThreads > 0 && _ginkgoParameter.executor == "omp-executor")
    omp_set_num_threads(_ginkgoParameter.nThreads);
#endif
  _solverType         = solverTypeLookup.at(ginkgoParameter.solver);
  _preconditionerType = preconditionerTypeLookup.at(ginkgoParameter.preconditioner);

  PRECICE_CHECK(!(RADIAL_BASIS_FUNCTION_T::isStrictlyPositiveDefinite() && polynomial == Polynomial::ON), "The integrated polynomial (polynomial=\"on\") is not supported for the selected radial-basis function. Please select another radial-basis function or change the polynomial configuration.");

  // Convert dead axis vector into an active axis array so that we can handle the reduction more easily
  std::array<bool, 3> activeAxis({{false, false, false}});
  // The transform below writes one entry per dead-axis flag; more than three would overflow the array
  PRECICE_ASSERT(deadAxis.size() <= activeAxis.size(), deadAxis.size());
  std::transform(deadAxis.begin(), deadAxis.end(), activeAxis.begin(), [](const auto ax) { return !ax; });

  const std::size_t deadDimensions = std::count(activeAxis.begin(), activeAxis.end(), false);
  const std::size_t dimensions     = 3;
  const std::size_t polyparams     = polynomial == Polynomial::ON ? 1 + dimensions - deadDimensions : 0;

  // Add linear polynom degrees if polynomial requires this
  const auto inputSize  = inputIDs.size();
  const auto outputSize = outputIDs.size();
  const auto n          = inputSize + polyparams;

  PRECICE_ASSERT((inputMesh.getDimensions() == 3) || activeAxis[2] == false);
  PRECICE_ASSERT((inputSize >= 1 + polyparams) || polynomial != Polynomial::ON, inputSize);

  const std::size_t inputMeshSize  = inputMesh.nVertices();
  const std::size_t outputMeshSize = outputMesh.nVertices();
  // Ask the mesh for its dimension instead of dereferencing vertex(0), which is invalid for an empty mesh
  const std::size_t meshDim = static_cast<std::size_t>(inputMesh.getDimensions());

  _scalarOne         = gko::share(gko::initialize<GinkgoScalar>({1.0}, _deviceExecutor));
  _scalarNegativeOne = gko::share(gko::initialize<GinkgoScalar>({-1.0}, _deviceExecutor));

  // Now we fill the RBF system matrix on the GPU (or any other selected device)
  precice::profiling::Event _allocCopyEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
  _rbfCoefficients = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{n, 1}));
  _allocCopyEvent.stop();
  // Initial guess is required since uninitialized memory could lead to a never converging system
  _rbfCoefficients->fill(0.0);

  // We need to copy the input data into a CPU stored vector first and copy it to the GPU afterwards
  // To allow for coalesced memory accesses on the GPU, we need to store them in transposed order IFF the backend is the GPU
  // However, the CPU does not need that; in fact, it would make it slower
  // The layout decision is evaluated once here instead of once per copied coordinate
  const bool transposedVertexLayout = "cuda-executor" == ginkgoParameter.executor || "hip-executor" == ginkgoParameter.executor;

  const std::size_t inputVerticesM  = transposedVertexLayout ? meshDim : inputMeshSize;
  const std::size_t inputVerticesN  = transposedVertexLayout ? inputMeshSize : meshDim;
  const std::size_t outputVerticesM = transposedVertexLayout ? meshDim : outputMeshSize;
  const std::size_t outputVerticesN = transposedVertexLayout ? outputMeshSize : meshDim;

  auto inputVertices  = gko::share(GinkgoMatrix::create(_hostExecutor, gko::dim<2>{inputVerticesM, inputVerticesN}));
  auto outputVertices = gko::share(GinkgoMatrix::create(_hostExecutor, gko::dim<2>{outputVerticesM, outputVerticesN}));
  for (std::size_t i = 0; i < inputMeshSize; ++i) {
    for (std::size_t j = 0; j < meshDim; ++j) {
      if (transposedVertexLayout) {
        inputVertices->at(j, i) = inputMesh.vertex(i).coord(j);
      } else {
        inputVertices->at(i, j) = inputMesh.vertex(i).coord(j);
      }
    }
  }
  for (std::size_t i = 0; i < outputMeshSize; ++i) {
    for (std::size_t j = 0; j < meshDim; ++j) {
      if (transposedVertexLayout) {
        outputVertices->at(j, i) = outputMesh.vertex(i).coord(j);
      } else {
        outputVertices->at(i, j) = outputMesh.vertex(i).coord(j);
      }
    }
  }

  _allocCopyEvent.start();

  auto dInputVertices  = gko::clone(_deviceExecutor, inputVertices);
  auto dOutputVertices = gko::clone(_deviceExecutor, outputVertices);
  inputVertices->clear();
  outputVertices->clear();

  _deviceExecutor->synchronize();

  _rbfSystemMatrix = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{n, n}));
  _matrixA         = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{outputSize, n}));

  _allocCopyEvent.stop();

  if (polynomial == Polynomial::SEPARATE) {
    // One constant term plus one linear term per active axis
    const unsigned int separatePolyParams = static_cast<unsigned int>(4 - deadDimensions);
    _allocCopyEvent.start();
    _matrixQ = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{n, separatePolyParams}));
    _matrixV = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{outputSize, separatePolyParams}));
    _allocCopyEvent.stop();

    _matrixQ->fill(0.0);
    _matrixV->fill(0.0);

    precice::profiling::Event _assemblyEvent{"map.rbf.ginkgo.assembleMatrices"};
    kernel::fill_polynomial_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixQ, dInputVertices, separatePolyParams);
    kernel::fill_polynomial_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixV, dOutputVertices, separatePolyParams);
    _assemblyEvent.stop();

    _deviceExecutor->synchronize();

    _matrixQ_T = gko::share(_matrixQ->transpose());

    _allocCopyEvent.start();
    _matrixQ_TQ                = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{_matrixQ_T->get_size()[0], _matrixQ->get_size()[1]}));
    _polynomialRhs             = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ_T->get_size()[0], 1}));
    _subPolynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], 1}));
    _addPolynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixV->get_size()[0], 1}));
    _allocCopyEvent.stop();

    _matrixQ_T->apply(_matrixQ, _matrixQ_TQ);

    auto polynomialSolverFactory = cg::build()
                                       .with_criteria(gko::stop::Iteration::build()
                                                          .with_max_iters(static_cast<std::size_t>(40))
                                                          .on(_deviceExecutor),
                                                      gko::stop::ResidualNorm<>::build()
                                                          .with_reduction_factor(1e-6)
                                                          .with_baseline(gko::stop::mode::initial_resnorm)
                                                          .on(_deviceExecutor))
                                       .on(_deviceExecutor);

    _polynomialSolver = polynomialSolverFactory->generate(_matrixQ_TQ);
  }

  // Launch RBF fill kernel on device
  precice::profiling::Event _assemblyEvent{"map.rbf.ginkgo.assembleMatrices"};
  precice::profiling::Event systemMatrixAssemblyEvent{"map.rbf.ginkgo.assembleSystemMatrix"};
  kernel::create_rbf_system_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _rbfSystemMatrix, activeAxis, dInputVertices, dInputVertices, basisFunction,
                                   basisFunction.getFunctionParameters(), Polynomial::ON == polynomial,
                                   polyparams); // polynomial evaluates to true only if ON is set
  _deviceExecutor->synchronize();
  systemMatrixAssemblyEvent.stop();

  precice::profiling::Event outputMatrixAssemblyEvent{"map.rbf.ginkgo.assembleOutputMatrix"};
  kernel::create_rbf_system_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixA, activeAxis, dInputVertices, dOutputVertices, basisFunction,
                                   basisFunction.getFunctionParameters(), Polynomial::ON == polynomial, polyparams);

  // Wait for the kernels to finish
  _deviceExecutor->synchronize();
  outputMatrixAssemblyEvent.stop();
  _assemblyEvent.stop();

  dInputVertices->clear();
  dOutputVertices->clear();

  _iterationCriterion = gko::share(gko::stop::Iteration::build()
                                       .with_max_iters(ginkgoParameter.maxIterations)
                                       .on(_deviceExecutor));

  _residualCriterion = gko::share(gko::stop::ResidualNorm<>::build()
                                      .with_reduction_factor(ginkgoParameter.residualNorm)
                                      .with_baseline(gko::stop::mode::initial_resnorm)
                                      .on(_deviceExecutor));

  // For cases where we reach a stationary solution such that the coupling data doesn't change (or map zero data)
  _absoluteResidualCriterion = gko::share(gko::stop::ResidualNorm<>::build()
                                              .with_reduction_factor(1e-30)
                                              .with_baseline(gko::stop::mode::absolute)
                                              .on(_deviceExecutor));

  if (_solverType == GinkgoSolverType::CG) {

    if (GinkgoPreconditionerType::None != _preconditionerType && ginkgoParameter.usePreconditioner) {
      // The lambda is invoked immediately, so capturing the by-value parameter by reference is safe
      auto solverFactoryWithPreconditioner = [preconditionerType = _preconditionerType, executor = _deviceExecutor, &ginkgoParameter]() {
        if (preconditionerType == GinkgoPreconditionerType::Jacobi) {
          return cg::build().with_preconditioner(jacobi::build().with_max_block_size(ginkgoParameter.jacobiBlockSize).on(executor));
        } else {
          return cg::build().with_preconditioner(cholesky::build().on(executor));
        }
      }();

      auto solverFactory = solverFactoryWithPreconditioner
                               .with_criteria(_iterationCriterion, _residualCriterion, _absoluteResidualCriterion)
                               .on(_deviceExecutor);

      _cgSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
    } else {
      auto solverFactory = cg::build()
                               .with_criteria(_iterationCriterion, _residualCriterion, _absoluteResidualCriterion)
                               .on(_deviceExecutor);

      _cgSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
    }

  } else if (_solverType == GinkgoSolverType::GMRES) {

    if (GinkgoPreconditionerType::None != _preconditionerType && ginkgoParameter.usePreconditioner) {
      // The lambda is invoked immediately, so capturing the by-value parameter by reference is safe
      auto solverFactoryWithPreconditioner = [preconditionerType = _preconditionerType, executor = _deviceExecutor, &ginkgoParameter]() {
        if (preconditionerType == GinkgoPreconditionerType::Jacobi) {
          return gmres::build().with_preconditioner(jacobi::build().with_max_block_size(ginkgoParameter.jacobiBlockSize).on(executor));
        } else {
          return gmres::build().with_preconditioner(cholesky::build().on(executor));
        }
      }();

      auto solverFactory = solverFactoryWithPreconditioner
                               .with_criteria(_iterationCriterion, _residualCriterion, _absoluteResidualCriterion)
                               .on(_deviceExecutor);

      _gmresSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
    } else {
      auto solverFactory = gmres::build()
                               .with_criteria(_iterationCriterion, _residualCriterion, _absoluteResidualCriterion)
                               .on(_deviceExecutor);

      _gmresSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
    }
  } else if (_solverType == GinkgoSolverType::QR) {
    const std::size_t M = _rbfSystemMatrix->get_size()[0];
    const std::size_t N = _rbfSystemMatrix->get_size()[1];
    _decompMatrixQ_T    = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>(N, M)));
    _decompMatrixR      = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>(N, N)));

    // The QR decomposition is only available for the CUDA and HIP executors
    if ("cuda-executor" == ginkgoParameter.executor) {
#ifdef PRECICE_WITH_CUDA
      // _rbfSystemMatrix will be overridden into Q
      computeQRDecompositionCuda(_deviceExecutor, _rbfSystemMatrix.get(), _decompMatrixR.get());
#endif
    } else if ("hip-executor" == ginkgoParameter.executor) {
#ifdef PRECICE_WITH_HIP
      // _rbfSystemMatrix will be overridden into Q
      computeQRDecompositionHip(_deviceExecutor, _rbfSystemMatrix.get(), _decompMatrixR.get());
#endif
    } else {
      PRECICE_UNREACHABLE("Not implemented");
    }
    _rbfSystemMatrix->transpose(_decompMatrixQ_T);
    _dQ_T_Rhs = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_decompMatrixQ_T->get_size()[0], 1}));
  } else {
    PRECICE_UNREACHABLE("Unknown solver type");
  }
}


/**
 * Solves the RBF system for the given right-hand side with the configured iterative solver
 * (CG or GMRES) and stores the solution in _rbfCoefficients.
 *
 * A convergence logger is attached to the stopping criteria for the duration of the solve in
 * order to report the number of iterations. In debug builds the final residual norm is
 * reported as well.
 *
 * @param rhs right-hand side vector residing on the device executor; it is not modified
 */
template <typename RADIAL_BASIS_FUNCTION_T>
void GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::_solveRBFSystem(const std::shared_ptr<GinkgoVector> &rhs) const
{
  PRECICE_TRACE();
  auto logger = gko::share(gko::log::Convergence<>::create(gko::log::Logger::all_events_mask));

  _iterationCriterion->add_logger(logger);
  _residualCriterion->add_logger(logger);
  _absoluteResidualCriterion->add_logger(logger);

  precice::profiling::Event solverEvent("map.rbf.ginkgo.solveSystemMatrix");
  if (_solverType == GinkgoSolverType::CG) {
    _cgSolver->apply(rhs, _rbfCoefficients);
  } else if (_solverType == GinkgoSolverType::GMRES) {
    _gmresSolver->apply(rhs, _rbfCoefficients);
  }
  solverEvent.stop();
  PRECICE_INFO("The iterative solver stopped after {} iterations.", logger->get_num_iterations());

// Only compute time-consuming statistics in debug mode
#ifndef NDEBUG
  // Compute A * x - b on a copy: overwriting rhs itself would make debug builds alter the
  // caller's vector while release builds leave it untouched.
  auto dResidualVector = gko::clone(_deviceExecutor, rhs);
  auto dResidual       = gko::initialize<GinkgoScalar>({0.0}, _deviceExecutor);
  _rbfSystemMatrix->apply(_scalarOne, _rbfCoefficients, _scalarNegativeOne, dResidualVector);
  dResidualVector->compute_norm2(dResidual);
  auto residual = gko::clone(_hostExecutor, dResidual);
  PRECICE_INFO("Ginkgo Solver Final Residual: {}", residual->at(0, 0));
#endif

  _iterationCriterion->clear_loggers();
  _residualCriterion->clear_loggers();
  _absoluteResidualCriterion->clear_loggers();
}


/**
 * Maps the given input data consistently.
 *
 * Each column of rhsValues is handled on its own: it is copied to the device, the separate
 * polynomial part is removed (if requested), the RBF system is solved, and the coefficients
 * are evaluated with the evaluation matrix. The polynomial part is added back afterwards.
 *
 * @param rhsValues  input data, one data component per column
 * @param polynomial polynomial treatment; only Polynomial::SEPARATE triggers extra work here
 * @return matrix with getOutputSize() rows and one column per input column
 */
template <typename RADIAL_BASIS_FUNCTION_T>
Eigen::MatrixXd GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::solveConsistent(const Eigen::MatrixXd &rhsValues, Polynomial polynomial)
{
  PRECICE_TRACE();

  const bool separatePolynomial = (polynomial == Polynomial::SEPARATE);
  const bool useQR              = (GinkgoSolverType::QR == _solverType);

  Eigen::MatrixXd mapped(getOutputSize(), rhsValues.cols());

  for (int column = 0; column < rhsValues.cols(); column++) {
    // Stage the current column on the host before cloning it to the device
    auto hostRhs = gko::share(GinkgoVector::create(_hostExecutor, gko::dim<2>{static_cast<unsigned long>(rhsValues.rows()), 1}));
    for (Eigen::Index row = 0; row < rhsValues.rows(); ++row) {
      hostRhs->at(row, 0) = rhsValues(row, column);
    }

    precice::profiling::Event transferEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
    auto                      deviceRhs = gko::share(gko::clone(_deviceExecutor, hostRhs));
    hostRhs->clear();
    transferEvent.stop();

    if (separatePolynomial) {
      transferEvent.start();
      _polynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ_TQ->get_size()[1], 1}));
      transferEvent.stop();
      _polynomialContribution->fill(0.0);

      // Solve (Q^T Q) p = Q^T rhs for the polynomial part ...
      _matrixQ_T->apply(deviceRhs, _polynomialRhs);
      _polynomialSolver->apply(_polynomialRhs, _polynomialContribution);

      // ... and remove Q p from the right-hand side
      _matrixQ->apply(_polynomialContribution, _subPolynomialContribution);
      deviceRhs->sub_scaled(_scalarOne, _subPolynomialContribution);
    }

    if (!useQR) {
      _solveRBFSystem(deviceRhs);
    } else if ("cuda-executor" == _ginkgoParameter.executor) {
      // Upper Trs U x = b
#ifdef PRECICE_WITH_CUDA
      solvewithQRDecompositionCuda(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), deviceRhs.get());
#endif
    } else if ("hip-executor" == _ginkgoParameter.executor) {
      // Upper Trs U x = b
#ifdef PRECICE_WITH_HIP
      solvewithQRDecompositionHip(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), deviceRhs.get());
#endif
    } else {
      PRECICE_UNREACHABLE("Not implemented");
    }

    deviceRhs->clear();

    transferEvent.start();
    auto deviceResult = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixA->get_size()[0], _rbfCoefficients->get_size()[1]}));
    transferEvent.stop();

    // Evaluate the interpolant at the output vertices
    _matrixA->apply(_rbfCoefficients, deviceResult);

    if (separatePolynomial) {
      _matrixV->apply(_polynomialContribution, _addPolynomialContribution);
      deviceResult->add_scaled(_scalarOne, _addPolynomialContribution);
    }

    transferEvent.start();
    auto hostResult = gko::clone(_hostExecutor, deviceResult);
    transferEvent.stop();

    for (Eigen::Index row = 0; row < mapped.rows(); ++row) {
      mapped(row, column) = hostResult->at(row, 0);
    }
  }

  return mapped;
}


/**
 * Maps the given input data conservatively.
 *
 * Each column of rhsValues is handled on its own: it is copied to the device, multiplied with
 * the transposed evaluation matrix, and the RBF system is solved for the result. For a
 * separate polynomial, a polynomial correction is subtracted from the coefficients.
 *
 * @param rhsValues  input data, one data component per column
 * @param polynomial polynomial treatment; only Polynomial::SEPARATE triggers extra work here
 * @return matrix with getInputSize() rows and one column per input column
 */
template <typename RADIAL_BASIS_FUNCTION_T>
Eigen::MatrixXd GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::solveConservative(const Eigen::MatrixXd &rhsValues, Polynomial polynomial)
{
  PRECICE_TRACE();
  Eigen::MatrixXd outmatrix(getInputSize(), rhsValues.cols());

  const bool separatePolynomial = polynomial == Polynomial::SEPARATE;

  // V^T is identical for every column, so it is built once instead of once per column
  std::shared_ptr<gko::LinOp> matrixV_T;
  if (separatePolynomial) {
    matrixV_T = gko::share(_matrixV->transpose());
  }

  for (Eigen::Index col = 0; col < rhsValues.cols(); col++) {
    // Copy rhs vector onto GPU by creating a Ginkgo Vector
    auto rhs = gko::share(GinkgoVector::create(_hostExecutor, gko::dim<2>{static_cast<std::size_t>(rhsValues.rows()), 1}));

    for (Eigen::Index i = 0; i < rhsValues.rows(); ++i) {
      rhs->at(i, 0) = rhsValues(i, col);
    }

    precice::profiling::Event _allocCopyEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
    auto                      dRhs = gko::share(gko::clone(_deviceExecutor, rhs));
    rhs->clear();
    _allocCopyEvent.stop();

    auto dAu = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixA->get_size()[1], dRhs->get_size()[1]}));

    // The transposed evaluation matrix is a temporary that is destroyed right after the product.
    // NOTE(review): caching A^T would avoid rebuilding it per column at the price of holding a
    // second matrix of the size of A during the solve.
    _matrixA->transpose()->apply(dRhs, dAu);

    if (GinkgoSolverType::QR == _solverType) {
      if ("cuda-executor" == _ginkgoParameter.executor) {
#ifdef PRECICE_WITH_CUDA
        solvewithQRDecompositionCuda(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dAu.get());
#endif
      } else if ("hip-executor" == _ginkgoParameter.executor) {
#ifdef PRECICE_WITH_HIP
        solvewithQRDecompositionHip(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dAu.get());
#endif
      } else {
        PRECICE_UNREACHABLE("Not implemented");
      }
    } else {
      _solveRBFSystem(dAu);
    }

    auto dOutput = gko::clone(_deviceExecutor, _rbfCoefficients);

    if (separatePolynomial) {
      auto dEpsilon = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixV->get_size()[1], dRhs->get_size()[1]}));
      matrixV_T->apply(dRhs, dEpsilon);

      auto dTmp = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[1], _rbfCoefficients->get_size()[1]}));
      // Reuse the transpose stored by the constructor instead of transposing Q again
      _matrixQ_T->apply(dOutput, dTmp);

      // epsilon -= tmp
      dEpsilon->sub_scaled(_scalarOne, dTmp);

      // Since this class is constructed for consistent mapping per default, we have to delete unused memory and initialize conservative variables
      if (nullptr == _matrixQQ_T) {
        _matrixQ_TQ->clear();
        _deviceExecutor->synchronize();
        _matrixQQ_T = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], _matrixQ_T->get_size()[1]}));

        _matrixQ->apply(_matrixQ_T, _matrixQQ_T);

        auto polynomialSolverFactory = cg::build()
                                           .with_criteria(gko::stop::Iteration::build()
                                                              .with_max_iters(static_cast<std::size_t>(40))
                                                              .on(_deviceExecutor),
                                                          gko::stop::ResidualNorm<>::build()
                                                              .with_reduction_factor(1e-6)
                                                              .with_baseline(gko::stop::mode::initial_resnorm)
                                                              .on(_deviceExecutor))
                                           .on(_deviceExecutor);

        _polynomialSolver = polynomialSolverFactory->generate(_matrixQQ_T);

        _polynomialRhs->clear();
        _deviceExecutor->synchronize();
      }

      _polynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQQ_T->get_size()[1], 1}));
      _polynomialContribution->fill(0.0);

      dEpsilon->scale(_scalarNegativeOne);

      _polynomialRhs = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], dEpsilon->get_size()[1]}));

      _matrixQ->apply(dEpsilon, _polynomialRhs);

      _polynomialSolver->apply(_polynomialRhs, _polynomialContribution);

      // out -= poly
      dOutput->sub_scaled(_scalarOne, _polynomialContribution);
    }

    _allocCopyEvent.start();
    auto output = gko::clone(_hostExecutor, dOutput);
    _allocCopyEvent.stop();

    for (Eigen::Index i = 0; i < outmatrix.rows(); ++i) {
      outmatrix(i, col) = output->at(i, 0);
    }
  }

  return outmatrix;
}


/// Hands out a shared handle to the host-side reference executor of this solver.
template <typename RADIAL_BASIS_FUNCTION_T>
std::shared_ptr<gko::Executor> GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::getReferenceExecutor() const
{
  std::shared_ptr<gko::Executor> hostExecutor = _hostExecutor;
  return hostExecutor;
}


/// Reports the column count of the evaluation matrix, i.e. the size of the RBF system.
template <typename RADIAL_BASIS_FUNCTION_T>
Eigen::Index GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::getInputSize() const
{
  const auto evaluationDims = _matrixA->get_size();
  return evaluationDims[1];
}


/// Reports the row count of the evaluation matrix, i.e. the number of output entries.
template <typename RADIAL_BASIS_FUNCTION_T>
Eigen::Index GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::getOutputSize() const
{
  const auto evaluationDims = _matrixA->get_size();
  return evaluationDims[0];
}


/**
 * Calls clear() on every Ginkgo matrix and vector owned by this solver that has been created.
 *
 * In addition to the matrices of the consistent mapping, this also covers the objects created
 * lazily for the conservative mapping (_matrixQQ_T) and for the QR solver (_decompMatrixQ_T,
 * _decompMatrixR, _dQ_T_Rhs), which were previously left untouched.
 */
template <typename RADIAL_BASIS_FUNCTION_T>
void GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::clear()
{
  // Members that were never created (e.g. polynomial matrices without separate polynomial) are skipped
  const auto clearIfSet = [](const auto &object) {
    if (nullptr != object) {
      object->clear();
    }
  };

  clearIfSet(_rbfSystemMatrix);
  clearIfSet(_matrixA);
  clearIfSet(_matrixV);
  clearIfSet(_matrixQ);
  clearIfSet(_matrixQ_T);
  clearIfSet(_matrixQ_TQ);
  clearIfSet(_matrixQQ_T);
  clearIfSet(_rbfCoefficients);
  clearIfSet(_polynomialRhs);
  clearIfSet(_subPolynomialContribution);
  clearIfSet(_addPolynomialContribution);
  clearIfSet(_polynomialContribution);
  clearIfSet(_decompMatrixQ_T);
  clearIfSet(_decompMatrixR);
  clearIfSet(_dQ_T_Rhs);
}


} // namespace mapping

} // namespace precice


#endif // PRECICE_NO_GINKGO

BasisFunctions.hpp

Device.hpp

Event.hpp

GinkgoDefinitions.hpp

GinkgoRBFKernels.hpp

HipQRSolver.hip.hpp

PRECICE_TRACE
#define PRECICE_TRACE(...)
Definition LogMacros.hpp:92

PRECICE_INFO
#define PRECICE_INFO(...)
Definition LogMacros.hpp:14

PRECICE_CHECK
#define PRECICE_CHECK(check,...)
Definition LogMacros.hpp:32

MappingConfiguration.hpp

Mesh.hpp

PRECICE_ASSERT
#define PRECICE_ASSERT(...)
Definition assertion.hpp:85

PRECICE_UNREACHABLE
#define PRECICE_UNREACHABLE(...)
Definition assertion.hpp:93

precice::device::Device::initialize
static void initialize(int *argc, char ***argv)
Definition Device.cpp:12

precice::logging::Logger
This class provides a lightweight logger.
Definition Logger.hpp:17

precice::mapping::GinkgoRadialBasisFctSolver::GinkgoRadialBasisFctSolver
GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs, const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector< bool > deadAxis, Polynomial polynomial, MappingConfiguration::GinkgoParameter ginkgoParameter)
Assembles the system matrices and computes the decomposition of the interpolation matrix.
Definition GinkgoRadialBasisFctSolver.hpp:170

precice::mapping::GinkgoRadialBasisFctSolver::_matrixQ
std::shared_ptr< GinkgoMatrix > _matrixQ
Polynomial matrix of the input mesh (for separate polynomial)
Definition GinkgoRadialBasisFctSolver.hpp:100

precice::mapping::GinkgoRadialBasisFctSolver::_solveRBFSystem
void _solveRBFSystem(const std::shared_ptr< GinkgoVector > &rhs) const
Definition GinkgoRadialBasisFctSolver.hpp:426

precice::mapping::GinkgoRadialBasisFctSolver::_iterationCriterion
std::shared_ptr< gko::stop::Iteration::Factory > _iterationCriterion
Definition GinkgoRadialBasisFctSolver.hpp:159

precice::mapping::GinkgoRadialBasisFctSolver::_ginkgoParameter
MappingConfiguration::GinkgoParameter _ginkgoParameter
Definition GinkgoRadialBasisFctSolver.hpp:165

precice::mapping::GinkgoRadialBasisFctSolver::getReferenceExecutor
std::shared_ptr< gko::Executor > getReferenceExecutor() const
Definition GinkgoRadialBasisFctSolver.hpp:636

precice::mapping::GinkgoRadialBasisFctSolver::_matrixV
std::shared_ptr< GinkgoMatrix > _matrixV
Polynomial matrix of the output mesh (for separate polynomial)
Definition GinkgoRadialBasisFctSolver.hpp:121

precice::mapping::GinkgoRadialBasisFctSolver::_rbfCoefficients
std::shared_ptr< GinkgoVector > _rbfCoefficients
Stores the calculated coefficients of the RBF interpolation.
Definition GinkgoRadialBasisFctSolver.hpp:124

precice::mapping::GinkgoRadialBasisFctSolver::_polynomialContribution
std::shared_ptr< GinkgoVector > _polynomialContribution
Definition GinkgoRadialBasisFctSolver.hpp:126

precice::mapping::GinkgoRadialBasisFctSolver::_dQ_T_Rhs
std::shared_ptr< GinkgoMatrix > _dQ_T_Rhs
Q^T * b of QR decomposition.
Definition GinkgoRadialBasisFctSolver.hpp:132

precice::mapping::GinkgoRadialBasisFctSolver::getOutputSize
Eigen::Index getOutputSize() const
Definition GinkgoRadialBasisFctSolver.hpp:648

precice::mapping::GinkgoRadialBasisFctSolver::solveConservative
Eigen::MatrixXd solveConservative(const Eigen::MatrixXd &inputData, Polynomial polynomial)
Maps the given input data.
Definition GinkgoRadialBasisFctSolver.hpp:534

precice::mapping::GinkgoRadialBasisFctSolver::_polynomialRhs
std::shared_ptr< GinkgoVector > _polynomialRhs
Right-hand side of the polynomial system.
Definition GinkgoRadialBasisFctSolver.hpp:112

precice::mapping::GinkgoRadialBasisFctSolver::_scalarNegativeOne
std::shared_ptr< GinkgoScalar > _scalarNegativeOne
Definition GinkgoRadialBasisFctSolver.hpp:155

precice::mapping::GinkgoRadialBasisFctSolver::_matrixQ_TQ
std::shared_ptr< gko::LinOp > _matrixQ_TQ
Product Q^T*Q (to solve Q^TQx=Q^Tb)
Definition GinkgoRadialBasisFctSolver.hpp:106

precice::mapping::GinkgoRadialBasisFctSolver::_hostExecutor
std::shared_ptr< gko::Executor > _hostExecutor
Definition GinkgoRadialBasisFctSolver.hpp:91

precice::mapping::GinkgoRadialBasisFctSolver::_polynomialSolver
std::shared_ptr< cg > _polynomialSolver
Definition GinkgoRadialBasisFctSolver.hpp:147

precice::mapping::GinkgoRadialBasisFctSolver::_rbfSystemMatrix
std::shared_ptr< GinkgoMatrix > _rbfSystemMatrix
Definition GinkgoRadialBasisFctSolver.hpp:94

precice::mapping::GinkgoRadialBasisFctSolver::_log
precice::logging::Logger _log
Definition GinkgoRadialBasisFctSolver.hpp:88

precice::mapping::GinkgoRadialBasisFctSolver::_gmresSolver
std::shared_ptr< gmres > _gmresSolver
Definition GinkgoRadialBasisFctSolver.hpp:145

precice::mapping::GinkgoRadialBasisFctSolver::_residualCriterion
std::shared_ptr< gko::stop::ResidualNorm<>::Factory > _residualCriterion
Definition GinkgoRadialBasisFctSolver.hpp:161

precice::mapping::GinkgoRadialBasisFctSolver::_decompMatrixR
std::shared_ptr< GinkgoMatrix > _decompMatrixR
Matrix R of QR decomposition.
Definition GinkgoRadialBasisFctSolver.hpp:135

precice::mapping::GinkgoRadialBasisFctSolver::clear
void clear()
Definition GinkgoRadialBasisFctSolver.hpp:654

precice::mapping::GinkgoRadialBasisFctSolver::getInputSize
Eigen::Index getInputSize() const
Definition GinkgoRadialBasisFctSolver.hpp:642

precice::mapping::GinkgoRadialBasisFctSolver::_decompMatrixQ_T
std::shared_ptr< GinkgoMatrix > _decompMatrixQ_T
Matrix Q^T of QR decomposition.
Definition GinkgoRadialBasisFctSolver.hpp:129

precice::mapping::GinkgoRadialBasisFctSolver::_subPolynomialContribution
std::shared_ptr< GinkgoVector > _subPolynomialContribution
Subtraction of the polynomial contribution.
Definition GinkgoRadialBasisFctSolver.hpp:115

precice::mapping::GinkgoRadialBasisFctSolver::_cgSolver
std::shared_ptr< cg > _cgSolver
Conjugate-gradient (CG) solver.
Definition GinkgoRadialBasisFctSolver.hpp:144

precice::mapping::GinkgoRadialBasisFctSolver::_matrixQ_T
std::shared_ptr< gko::LinOp > _matrixQ_T
Transposed Polynomial matrix of the input mesh (for separate polynomial) (to solve Q^T*Q*x=Q^T*b)
Definition GinkgoRadialBasisFctSolver.hpp:103

precice::mapping::GinkgoRadialBasisFctSolver::_deviceExecutor
std::shared_ptr< gko::Executor > _deviceExecutor
Definition GinkgoRadialBasisFctSolver.hpp:90

precice::mapping::GinkgoRadialBasisFctSolver::solveConsistent
Eigen::MatrixXd solveConsistent(const Eigen::MatrixXd &inputData, Polynomial polynomial)
Maps the given input data.
Definition GinkgoRadialBasisFctSolver.hpp:459

precice::mapping::GinkgoRadialBasisFctSolver::BASIS_FUNCTION_T
RADIAL_BASIS_FUNCTION_T BASIS_FUNCTION_T
Definition GinkgoRadialBasisFctSolver.hpp:65

precice::mapping::GinkgoRadialBasisFctSolver::_scalarOne
std::shared_ptr< GinkgoScalar > _scalarOne
Definition GinkgoRadialBasisFctSolver.hpp:154

precice::mapping::GinkgoRadialBasisFctSolver::_preconditionerType
GinkgoPreconditionerType _preconditionerType
Definition GinkgoRadialBasisFctSolver.hpp:151

precice::mapping::GinkgoRadialBasisFctSolver::_absoluteResidualCriterion
std::shared_ptr< gko::stop::ResidualNorm<>::Factory > _absoluteResidualCriterion
Definition GinkgoRadialBasisFctSolver.hpp:163

precice::mapping::GinkgoRadialBasisFctSolver::_addPolynomialContribution
std::shared_ptr< GinkgoVector > _addPolynomialContribution
Addition of the polynomial contribution.
Definition GinkgoRadialBasisFctSolver.hpp:118

precice::mapping::GinkgoRadialBasisFctSolver::_matrixA
std::shared_ptr< GinkgoMatrix > _matrixA
Evaluation matrix (output x input)
Definition GinkgoRadialBasisFctSolver.hpp:97

precice::mapping::GinkgoRadialBasisFctSolver::_triangularSolver
std::shared_ptr< triangular > _triangularSolver
Triangular solver used for the backward substitution.
Definition GinkgoRadialBasisFctSolver.hpp:138

precice::mapping::GinkgoRadialBasisFctSolver::_matrixQQ_T
std::shared_ptr< gko::LinOp > _matrixQQ_T
Product Q*Q^T.
Definition GinkgoRadialBasisFctSolver.hpp:109

precice::mapping::GinkgoRadialBasisFctSolver::_solverType
GinkgoSolverType _solverType
Definition GinkgoRadialBasisFctSolver.hpp:149

precice::mesh::Mesh
Container and creator for meshes.
Definition Mesh.hpp:38

precice::mesh::Mesh::getDimensions
int getDimensions() const
Definition Mesh.cpp:99

precice::mesh::Mesh::nVertices
std::size_t nVertices() const
Returns the number of vertices.
Definition Mesh.cpp:64

precice::mesh::Mesh::vertex
Vertex & vertex(VertexID id)
Mutable access to a vertex by VertexID.
Definition Mesh.cpp:42

precice::mesh::Vertex::coord
double coord(int index) const
Returns a coordinate of a vertex.
Definition Vertex.hpp:126

precice::mesh::Vertex::getDimensions
int getDimensions() const
Returns spatial dimensionality of vertex.
Definition Vertex.cpp:7

precice::profiling::Event
Definition Event.hpp:40

precice::profiling::Event::start
void start()
Starts or restarts a stopped event.
Definition Event.cpp:28

precice::profiling::Event::stop
void stop()
Stops a running event.
Definition Event.cpp:51

Types.hpp

precice::mapping::kernel::fill_polynomial_matrix
void fill_polynomial_matrix(std::shared_ptr< const gko::Executor > exec, bool unifiedMemory, gko::ptr_param< GinkgoMatrix > mtx, gko::ptr_param< const GinkgoMatrix > x, const unsigned int dims)
Definition GinkgoRBFKernels.cpp:225

precice::mapping::kernel::create_rbf_system_matrix
void create_rbf_system_matrix(std::shared_ptr< const gko::Executor > exec, bool unifiedMemory, gko::ptr_param< GinkgoMatrix > mtx, const std::array< bool, 3 > activeAxis, gko::ptr_param< GinkgoMatrix > supportPoints, gko::ptr_param< GinkgoMatrix > targetPoints, EvalFunctionType f, ::precice::mapping::RadialBasisParameters rbf_params, bool addPolynomial, unsigned int extraDims)
Definition GinkgoRBFKernels.cpp:120

precice::mapping
Contains data mapping from points to meshes.
Definition AxialGeoMultiscaleMapping.cpp:5

precice::mapping::preconditionerTypeLookup
const std::map< std::string, GinkgoPreconditionerType > preconditionerTypeLookup
Definition GinkgoRadialBasisFctSolver.hpp:51

precice::mapping::GinkgoSolverType
GinkgoSolverType
Definition GinkgoRadialBasisFctSolver.hpp:32

precice::mapping::GinkgoSolverType::CG
@ CG
Definition GinkgoRadialBasisFctSolver.hpp:33

precice::mapping::GinkgoSolverType::QR
@ QR
Definition GinkgoRadialBasisFctSolver.hpp:35

precice::mapping::GinkgoSolverType::GMRES
@ GMRES
Definition GinkgoRadialBasisFctSolver.hpp:34

precice::mapping::solverTypeLookup
const std::map< std::string, GinkgoSolverType > solverTypeLookup
Definition GinkgoRadialBasisFctSolver.hpp:46

precice::mapping::GinkgoPreconditionerType
GinkgoPreconditionerType
Definition GinkgoRadialBasisFctSolver.hpp:38

precice::mapping::GinkgoPreconditionerType::Jacobi
@ Jacobi
Definition GinkgoRadialBasisFctSolver.hpp:39

precice::mapping::GinkgoPreconditionerType::None
@ None
Definition GinkgoRadialBasisFctSolver.hpp:41

precice::mapping::GinkgoPreconditionerType::Cholesky
@ Cholesky
Definition GinkgoRadialBasisFctSolver.hpp:40

precice::mapping::create_device_executor
std::shared_ptr< gko::Executor > create_device_executor(const std::string &execName, bool enableUnifiedMemory)
Definition GinkgoRBFKernels.cpp:17

precice::mapping::Polynomial
Polynomial
How to handle the polynomial?
Definition MappingConfigurationTypes.hpp:11

precice::mapping::Polynomial::SEPARATE
@ SEPARATE
Definition MappingConfigurationTypes.hpp:14

precice::mapping::Polynomial::ON
@ ON
Definition MappingConfigurationTypes.hpp:12

precice
Main namespace of the precice library.
Definition Acceleration.cpp:5

precice::mapping::MappingConfiguration::GinkgoParameter
Definition MappingConfiguration.hpp:40

precice::mapping::MappingConfiguration::GinkgoParameter::enableUnifiedMemory
bool enableUnifiedMemory
Definition MappingConfiguration.hpp:50

precice::mapping::MappingConfiguration::GinkgoParameter::maxIterations
std::size_t maxIterations
Definition MappingConfiguration.hpp:45

precice::mapping::MappingConfiguration::GinkgoParameter::preconditioner
std::string preconditioner
Definition MappingConfiguration.hpp:43

precice::mapping::MappingConfiguration::GinkgoParameter::usePreconditioner
bool usePreconditioner
Definition MappingConfiguration.hpp:46

precice::mapping::MappingConfiguration::GinkgoParameter::residualNorm
double residualNorm
Definition MappingConfiguration.hpp:44

precice::mapping::MappingConfiguration::GinkgoParameter::solver
std::string solver
Definition MappingConfiguration.hpp:42

precice::mapping::MappingConfiguration::GinkgoParameter::jacobiBlockSize
unsigned int jacobiBlockSize
Definition MappingConfiguration.hpp:47

precice::mapping::MappingConfiguration::GinkgoParameter::executor
std::string executor
Definition MappingConfiguration.hpp:41

precice::mapping::RadialBasisParameters
Wrapper struct that is used to transfer RBF-specific parameters to the GPU.
Definition BasisFunctions.hpp:51