preCICE
Loading...
Searching...
No Matches
GinkgoRadialBasisFctSolver.hpp
Go to the documentation of this file.
1#pragma once
2
3#ifndef PRECICE_NO_GINKGO
4
5#include <array>
6#include <cmath>
7#include <functional>
8#include <numeric>
14#include "mesh/Mesh.hpp"
16#include "profiling/Event.hpp"
17#ifdef PRECICE_WITH_HIP
19#endif
20#ifdef PRECICE_WITH_CUDA
21#include "mapping/device/CudaQRSolver.cuh"
22#endif
23#ifdef PRECICE_WITH_OPENMP
24#include <omp.h>
25#endif
26
28
29namespace precice {
30namespace mapping {
31
37
43
44// Runtime lookups as suggested by Ginkgo
45
46const std::map<std::string, GinkgoSolverType> solverTypeLookup{
47 {"cg-solver", GinkgoSolverType::CG},
48 {"gmres-solver", GinkgoSolverType::GMRES},
49 {"qr-solver", GinkgoSolverType::QR}};
50
51const std::map<std::string, GinkgoPreconditionerType> preconditionerTypeLookup{
52 {"jacobi-preconditioner", GinkgoPreconditionerType::Jacobi},
53 {"cholesky-preconditioner", GinkgoPreconditionerType::Cholesky},
54 {"no-preconditioner", GinkgoPreconditionerType::None}};
55
62template <typename RADIAL_BASIS_FUNCTION_T>
64public:
65 using BASIS_FUNCTION_T = RADIAL_BASIS_FUNCTION_T;
66
68 template <typename IndexContainer>
69 GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs,
70 const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector<bool> deadAxis, Polynomial polynomial,
72
74 Eigen::MatrixXd solveConsistent(const Eigen::MatrixXd &inputData, Polynomial polynomial);
75
77 Eigen::MatrixXd solveConservative(const Eigen::MatrixXd &inputData, Polynomial polynomial);
78
79 void clear();
80
81 Eigen::Index getInputSize() const;
82
83 Eigen::Index getOutputSize() const;
84
85 std::shared_ptr<gko::Executor> getReferenceExecutor() const;
86
87private:
88 mutable precice::logging::Logger _log{"mapping::GinkgoRadialBasisFctSolver"};
89
90 std::shared_ptr<gko::Executor> _deviceExecutor;
91 std::shared_ptr<gko::Executor> _hostExecutor = gko::ReferenceExecutor::create();
92
93 // Stores the RBF interpolation matrix
94 std::shared_ptr<GinkgoMatrix> _rbfSystemMatrix;
95
97 std::shared_ptr<GinkgoMatrix> _matrixA;
98
100 std::shared_ptr<GinkgoMatrix> _matrixQ;
101
103 std::shared_ptr<gko::LinOp> _matrixQ_T;
104
106 std::shared_ptr<gko::LinOp> _matrixQ_TQ;
107
109 std::shared_ptr<gko::LinOp> _matrixQQ_T;
110
112 std::shared_ptr<GinkgoVector> _polynomialRhs;
113
115 std::shared_ptr<GinkgoVector> _subPolynomialContribution;
116
118 std::shared_ptr<GinkgoVector> _addPolynomialContribution;
119
121 std::shared_ptr<GinkgoMatrix> _matrixV;
122
124 std::shared_ptr<GinkgoVector> _rbfCoefficients;
125
126 std::shared_ptr<GinkgoVector> _polynomialContribution;
127
129 std::shared_ptr<GinkgoMatrix> _decompMatrixQ_T;
130
132 std::shared_ptr<GinkgoMatrix> _dQ_T_Rhs;
133
135 std::shared_ptr<GinkgoMatrix> _decompMatrixR;
136
138 std::shared_ptr<triangular> _triangularSolver;
139
141 // std::unique_ptr<QRSolver> _qrSolver;
142
143 // Solver used for iteratively solving linear systems of equations
144 std::shared_ptr<cg> _cgSolver = nullptr;
145 std::shared_ptr<gmres> _gmresSolver = nullptr;
146
147 std::shared_ptr<cg> _polynomialSolver = nullptr;
148
150
152
153 // 1x1 identity matrix used for AXPY operations
154 std::shared_ptr<GinkgoScalar> _scalarOne;
155 std::shared_ptr<GinkgoScalar> _scalarNegativeOne;
156
157 void _solveRBFSystem(const std::shared_ptr<GinkgoVector> &rhs) const;
158
159 std::shared_ptr<gko::stop::Iteration::Factory> _iterationCriterion;
160
161 std::shared_ptr<gko::stop::ResidualNorm<>::Factory> _residualCriterion;
162
163 std::shared_ptr<gko::stop::ResidualNorm<>::Factory> _absoluteResidualCriterion;
164
166};
167
168template <typename RADIAL_BASIS_FUNCTION_T>
169template <typename IndexContainer>
170GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs,
171 const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector<bool> deadAxis, Polynomial polynomial,
173 : _ginkgoParameter(ginkgoParameter)
174{
176 // We have to initialize Kokkos and Ginkgo here, as the initialization call allocates memory
177 // in the current setup, this will only initialize the device (and allocate memory) on the primary rank
179 PRECICE_INFO("Using Ginkgo solver {} on executor {} with max. iterations {} and residual reduction {}",
180 ginkgoParameter.solver,
181 ginkgoParameter.executor,
182 ginkgoParameter.maxIterations,
183 ginkgoParameter.residualNorm);
184 _deviceExecutor = create_device_executor(ginkgoParameter.executor, ginkgoParameter.enableUnifiedMemory);
185#ifdef PRECICE_WITH_OPENMP
186 if (_ginkgoParameter.nThreads > 0 && _ginkgoParameter.executor == "omp-executor")
187 omp_set_num_threads(_ginkgoParameter.nThreads);
188#endif
189 _solverType = solverTypeLookup.at(ginkgoParameter.solver);
191
192 PRECICE_CHECK(!(RADIAL_BASIS_FUNCTION_T::isStrictlyPositiveDefinite() && polynomial == Polynomial::ON), "The integrated polynomial (polynomial=\"on\") is not supported for the selected radial-basis function. Please select another radial-basis function or change the polynomial configuration.");
193 // Convert dead axis vector into an active axis array so that we can handle the reduction more easily
194 std::array<bool, 3> activeAxis({{false, false, false}});
195 std::transform(deadAxis.begin(), deadAxis.end(), activeAxis.begin(), [](const auto ax) { return !ax; });
196
197 const std::size_t deadDimensions = std::count(activeAxis.begin(), activeAxis.end(), false);
198 const std::size_t dimensions = 3;
199 const std::size_t polyparams = polynomial == Polynomial::ON ? 1 + dimensions - deadDimensions : 0;
200
201 // Add linear polynom degrees if polynomial requires this
202 const auto inputSize = inputIDs.size();
203 const auto outputSize = outputIDs.size();
204 const auto n = inputSize + polyparams;
205
206 PRECICE_ASSERT((inputMesh.getDimensions() == 3) || activeAxis[2] == false);
207 PRECICE_ASSERT((inputSize >= 1 + polyparams) || polynomial != Polynomial::ON, inputSize);
208
209 const std::size_t inputMeshSize = inputMesh.nVertices();
210 const std::size_t outputMeshSize = outputMesh.nVertices();
211 const std::size_t meshDim = inputMesh.vertex(0).getDimensions();
212
213 _scalarOne = gko::share(gko::initialize<GinkgoScalar>({1.0}, _deviceExecutor));
214 _scalarNegativeOne = gko::share(gko::initialize<GinkgoScalar>({-1.0}, _deviceExecutor));
215
216 // Now we fill the RBF system matrix on the GPU (or any other selected device)
217 precice::profiling::Event _allocCopyEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
218 _rbfCoefficients = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{n, 1}));
219 _allocCopyEvent.stop();
220 // Initial guess is required since uninitialized memory could lead to a never converging system
221 _rbfCoefficients->fill(0.0);
222
223 // We need to copy the input data into a CPU stored vector first and copy it to the GPU afterwards
224 // To allow for coalesced memory accesses on the GPU, we need to store them in transposed order IFF the backend is the GPU
225 // However, the CPU does not need that; in fact, it would make it slower
226 std::size_t inputVerticesM, inputVerticesN, outputVerticesM, outputVerticesN;
227
228 if ("cuda-executor" == ginkgoParameter.executor || "hip-executor" == ginkgoParameter.executor) {
229 inputVerticesM = meshDim;
230 inputVerticesN = inputMeshSize;
231 outputVerticesM = meshDim;
232 outputVerticesN = outputMeshSize;
233 } else {
234 inputVerticesM = inputMeshSize;
235 inputVerticesN = meshDim;
236 outputVerticesM = outputMeshSize;
237 outputVerticesN = meshDim;
238 }
239
240 auto inputVertices = gko::share(GinkgoMatrix::create(_hostExecutor, gko::dim<2>{inputVerticesM, inputVerticesN}));
241 auto outputVertices = gko::share(GinkgoMatrix::create(_hostExecutor, gko::dim<2>{outputVerticesM, outputVerticesN}));
242 for (std::size_t i = 0; i < inputMeshSize; ++i) {
243 for (std::size_t j = 0; j < meshDim; ++j) {
244 if ("cuda-executor" == ginkgoParameter.executor || "hip-executor" == ginkgoParameter.executor) {
245 inputVertices->at(j, i) = inputMesh.vertex(i).coord(j);
246 } else {
247 inputVertices->at(i, j) = inputMesh.vertex(i).coord(j);
248 }
249 }
250 }
251 for (std::size_t i = 0; i < outputMeshSize; ++i) {
252 for (std::size_t j = 0; j < meshDim; ++j) {
253 if ("cuda-executor" == ginkgoParameter.executor || "hip-executor" == ginkgoParameter.executor) {
254 outputVertices->at(j, i) = outputMesh.vertex(i).coord(j);
255 } else {
256 outputVertices->at(i, j) = outputMesh.vertex(i).coord(j);
257 }
258 }
259 }
260
261 _allocCopyEvent.start();
262
263 auto dInputVertices = gko::clone(_deviceExecutor, inputVertices);
264 auto dOutputVertices = gko::clone(_deviceExecutor, outputVertices);
265 inputVertices->clear();
266 outputVertices->clear();
267
268 _deviceExecutor->synchronize();
269
270 _rbfSystemMatrix = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{n, n}));
271 _matrixA = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{outputSize, n}));
272
273 _allocCopyEvent.stop();
274
275 if (polynomial == Polynomial::SEPARATE) {
276 const unsigned int separatePolyParams = 4 - std::count(activeAxis.begin(), activeAxis.end(), false);
277 _allocCopyEvent.start();
278 _matrixQ = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{n, separatePolyParams}));
279 _matrixV = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{outputSize, separatePolyParams}));
280 _allocCopyEvent.stop();
281
282 _matrixQ->fill(0.0);
283 _matrixV->fill(0.0);
284
285 precice::profiling::Event _assemblyEvent{"map.rbf.ginkgo.assembleMatrices"};
286 kernel::fill_polynomial_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixQ, dInputVertices, separatePolyParams);
287 kernel::fill_polynomial_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixV, dOutputVertices, separatePolyParams);
288 _assemblyEvent.stop();
289
290 _deviceExecutor->synchronize();
291
292 _matrixQ_T = gko::share(_matrixQ->transpose());
293
294 _allocCopyEvent.start();
295 _matrixQ_TQ = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{_matrixQ_T->get_size()[0], _matrixQ->get_size()[1]}));
296 _polynomialRhs = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ_T->get_size()[0], 1}));
297 _subPolynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], 1}));
298 _addPolynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixV->get_size()[0], 1}));
299 _allocCopyEvent.stop();
300
302
303 auto polynomialSolverFactory = cg::build()
304 .with_criteria(gko::stop::Iteration::build()
305 .with_max_iters(static_cast<std::size_t>(40))
306 .on(_deviceExecutor),
307 gko::stop::ResidualNorm<>::build()
308 .with_reduction_factor(1e-6)
309 .with_baseline(gko::stop::mode::initial_resnorm)
310 .on(_deviceExecutor))
311 .on(_deviceExecutor);
312
313 _polynomialSolver = polynomialSolverFactory->generate(_matrixQ_TQ);
314 }
315
316 // Launch RBF fill kernel on device
317 precice::profiling::Event _assemblyEvent{"map.rbf.ginkgo.assembleMatrices"};
318 precice::profiling::Event systemMatrixAssemblyEvent{"map.rbf.ginkgo.assembleSystemMatrix"};
319 kernel::create_rbf_system_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _rbfSystemMatrix, activeAxis, dInputVertices, dInputVertices, basisFunction,
320 basisFunction.getFunctionParameters(), Polynomial::ON == polynomial,
321 polyparams); // polynomial evaluates to true only if ON is set
322 _deviceExecutor->synchronize();
323 systemMatrixAssemblyEvent.stop();
324
325 precice::profiling::Event outputMatrixAssemblyEvent{"map.rbf.ginkgo.assembleOutputMatrix"};
326 kernel::create_rbf_system_matrix(_deviceExecutor, _ginkgoParameter.enableUnifiedMemory, _matrixA, activeAxis, dInputVertices, dOutputVertices, basisFunction,
327 basisFunction.getFunctionParameters(), Polynomial::ON == polynomial, polyparams);
328
329 // Wait for the kernels to finish
330 _deviceExecutor->synchronize();
331 outputMatrixAssemblyEvent.stop();
332 _assemblyEvent.stop();
333
334 dInputVertices->clear();
335 dOutputVertices->clear();
336
337 _iterationCriterion = gko::share(gko::stop::Iteration::build()
338 .with_max_iters(ginkgoParameter.maxIterations)
339 .on(_deviceExecutor));
340
341 _residualCriterion = gko::share(gko::stop::ResidualNorm<>::build()
342 .with_reduction_factor(ginkgoParameter.residualNorm)
343 .with_baseline(gko::stop::mode::initial_resnorm)
344 .on(_deviceExecutor));
345
346 // For cases where we reach a stationary solution such that the coupling data doesn't change (or map zero data)
347 _absoluteResidualCriterion = gko::share(gko::stop::ResidualNorm<>::build()
348 .with_reduction_factor(1e-30)
349 .with_baseline(gko::stop::mode::absolute)
350 .on(_deviceExecutor));
351
353
355 auto solverFactoryWithPreconditioner = [preconditionerType = _preconditionerType, executor = _deviceExecutor, &ginkgoParameter]() {
356 if (preconditionerType == GinkgoPreconditionerType::Jacobi) {
357 return cg::build().with_preconditioner(jacobi::build().with_max_block_size(ginkgoParameter.jacobiBlockSize).on(executor));
358 } else {
359 return cg::build().with_preconditioner(cholesky::build().on(executor));
360 }
361 }();
362
363 auto solverFactory = solverFactoryWithPreconditioner
365 .on(_deviceExecutor);
366
367 _cgSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
368 } else {
369 auto solverFactory = cg::build()
371 .on(_deviceExecutor);
372
373 _cgSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
374 }
375
376 } else if (_solverType == GinkgoSolverType::GMRES) {
377
379 auto solverFactoryWithPreconditioner = [preconditionerType = _preconditionerType, executor = _deviceExecutor, &ginkgoParameter]() {
380 if (preconditionerType == GinkgoPreconditionerType::Jacobi) {
381 return gmres::build().with_preconditioner(jacobi::build().with_max_block_size(ginkgoParameter.jacobiBlockSize).on(executor));
382 } else {
383 return gmres::build().with_preconditioner(cholesky::build().on(executor));
384 }
385 }();
386
387 auto solverFactory = solverFactoryWithPreconditioner
389 .on(_deviceExecutor);
390
391 _gmresSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
392 } else {
393 auto solverFactory = gmres::build()
395 .on(_deviceExecutor);
396
397 _gmresSolver = gko::share(solverFactory->generate(_rbfSystemMatrix));
398 }
399 } else if (_solverType == GinkgoSolverType::QR) {
400 const std::size_t M = _rbfSystemMatrix->get_size()[0];
401 const std::size_t N = _rbfSystemMatrix->get_size()[1];
402 _decompMatrixQ_T = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>(N, M)));
403 _decompMatrixR = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>(N, N)));
404
405 if ("cuda-executor" == ginkgoParameter.executor) {
406#ifdef PRECICE_WITH_CUDA
407 // _rbfSystemMatrix will be overridden into Q
408 computeQRDecompositionCuda(_deviceExecutor, _rbfSystemMatrix.get(), _decompMatrixR.get());
409#endif
410 } else if ("hip-executor" == ginkgoParameter.executor) {
411#ifdef PRECICE_WITH_HIP
412 // _rbfSystemMatrix will be overridden into Q
413 computeQRDecompositionHip(_deviceExecutor, _rbfSystemMatrix.get(), _decompMatrixR.get());
414#endif
415 } else {
416 PRECICE_UNREACHABLE("Not implemented");
417 }
419 _dQ_T_Rhs = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_decompMatrixQ_T->get_size()[0], 1}));
420 } else {
421 PRECICE_UNREACHABLE("Unknown solver type");
422 }
423}
424
425template <typename RADIAL_BASIS_FUNCTION_T>
426void GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::_solveRBFSystem(const std::shared_ptr<GinkgoVector> &rhs) const
427{
429 auto logger = gko::share(gko::log::Convergence<>::create(gko::log::Logger::all_events_mask));
430
431 _iterationCriterion->add_logger(logger);
432 _residualCriterion->add_logger(logger);
433 _absoluteResidualCriterion->add_logger(logger);
434
435 precice::profiling::Event solverEvent("map.rbf.ginkgo.solveSystemMatrix");
437 _cgSolver->apply(rhs, _rbfCoefficients);
438 } else if (_solverType == GinkgoSolverType::GMRES) {
439 _gmresSolver->apply(rhs, _rbfCoefficients);
440 }
441 solverEvent.stop();
442 PRECICE_INFO("The iterative solver stopped after {} iterations.", logger->get_num_iterations());
443
444// Only compute time-consuming statistics in debug mode
445#ifndef NDEBUG
446 auto dResidual = gko::initialize<GinkgoScalar>({0.0}, _deviceExecutor);
448 rhs->compute_norm2(dResidual);
449 auto residual = gko::clone(_hostExecutor, dResidual);
450 PRECICE_INFO("Ginkgo Solver Final Residual: {}", residual->at(0, 0));
451#endif
452
453 _iterationCriterion->clear_loggers();
454 _residualCriterion->clear_loggers();
455 _absoluteResidualCriterion->clear_loggers();
456}
457
458template <typename RADIAL_BASIS_FUNCTION_T>
459Eigen::MatrixXd GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::solveConsistent(const Eigen::MatrixXd &rhsValues, Polynomial polynomial)
460{
462
463 Eigen::MatrixXd outmatrix(getOutputSize(), rhsValues.cols());
464
465 for (int col = 0; col < rhsValues.cols(); col++) {
466 // Copy rhs vector onto GPU by creating a Ginkgo Vector
467 auto rhs = gko::share(GinkgoVector::create(_hostExecutor, gko::dim<2>{static_cast<unsigned long>(rhsValues.rows()), 1}));
468
469 for (Eigen::Index i = 0; i < rhsValues.rows(); ++i) {
470 rhs->at(i, 0) = rhsValues(i, col);
471 }
472
473 precice::profiling::Event _allocCopyEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
474 auto dRhs = gko::share(gko::clone(_deviceExecutor, rhs));
475 rhs->clear();
476 _allocCopyEvent.stop();
477
478 if (polynomial == Polynomial::SEPARATE) {
479 _allocCopyEvent.start();
480 _polynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ_TQ->get_size()[1], 1}));
481 _allocCopyEvent.stop();
482 _polynomialContribution->fill(0.0);
483
484 _matrixQ_T->apply(dRhs, _polynomialRhs);
486
488 dRhs->sub_scaled(_scalarOne, _subPolynomialContribution);
489 }
490
492 // Upper Trs U x = b
493 if ("cuda-executor" == _ginkgoParameter.executor) {
494#ifdef PRECICE_WITH_CUDA
495 solvewithQRDecompositionCuda(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dRhs.get());
496#endif
497 } else if ("hip-executor" == _ginkgoParameter.executor) {
498#ifdef PRECICE_WITH_HIP
499 solvewithQRDecompositionHip(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dRhs.get());
500#endif
501 } else {
502 PRECICE_UNREACHABLE("Not implemented");
503 }
504 } else {
505 _solveRBFSystem(dRhs);
506 }
507
508 dRhs->clear();
509
510 _allocCopyEvent.start();
511 auto dOutput = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixA->get_size()[0], _rbfCoefficients->get_size()[1]}));
512 _allocCopyEvent.stop();
513
514 _matrixA->apply(_rbfCoefficients, dOutput);
515
516 if (polynomial == Polynomial::SEPARATE) {
518 dOutput->add_scaled(_scalarOne, _addPolynomialContribution);
519 }
520
521 _allocCopyEvent.start();
522 auto output = gko::clone(_hostExecutor, dOutput);
523 _allocCopyEvent.stop();
524
525 for (Eigen::Index i = 0; i < outmatrix.rows(); ++i) {
526 outmatrix(i, col) = output->at(i, 0);
527 }
528 }
529
530 return outmatrix;
531}
532
533template <typename RADIAL_BASIS_FUNCTION_T>
534Eigen::MatrixXd GinkgoRadialBasisFctSolver<RADIAL_BASIS_FUNCTION_T>::solveConservative(const Eigen::MatrixXd &rhsValues, Polynomial polynomial)
535{
537 // Copy rhs vector onto GPU by creating a Ginkgo Vector
538 Eigen::MatrixXd outmatrix(getInputSize(), rhsValues.cols());
539
540 for (int col = 0; col < rhsValues.cols(); col++) {
541 auto rhs = gko::share(GinkgoVector::create(_hostExecutor, gko::dim<2>{static_cast<unsigned long>(rhsValues.rows()), 1}));
542
543 for (Eigen::Index i = 0; i < rhsValues.rows(); ++i) {
544 rhs->at(i, 0) = rhsValues(i, col);
545 }
546
547 precice::profiling::Event _allocCopyEvent{"map.rbf.ginkgo.memoryAllocAndCopy"};
548 auto dRhs = gko::share(gko::clone(_deviceExecutor, rhs));
549 rhs->clear();
550 _allocCopyEvent.stop();
551
552 auto dAu = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixA->get_size()[1], dRhs->get_size()[1]}));
553
554 _matrixA->transpose()->apply(dRhs, dAu);
555
557 if ("cuda-executor" == _ginkgoParameter.executor) {
558#ifdef PRECICE_WITH_CUDA
559 solvewithQRDecompositionCuda(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dAu.get());
560#endif
561 } else if ("hip-executor" == _ginkgoParameter.executor) {
562#ifdef PRECICE_WITH_HIP
563 solvewithQRDecompositionHip(_deviceExecutor, _decompMatrixR.get(), _rbfCoefficients.get(), _dQ_T_Rhs.get(), _decompMatrixQ_T.get(), dAu.get());
564#endif
565 } else {
566 PRECICE_UNREACHABLE("Not implemented");
567 }
568 } else {
569 _solveRBFSystem(dAu);
570 }
571
572 auto dOutput = gko::clone(_deviceExecutor, _rbfCoefficients);
573
574 if (polynomial == Polynomial::SEPARATE) {
575 auto dEpsilon = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixV->get_size()[1], dRhs->get_size()[1]}));
576 _matrixV->transpose()->apply(dRhs, dEpsilon);
577
578 auto dTmp = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[1], _rbfCoefficients->get_size()[1]}));
579 _matrixQ->transpose()->apply(dOutput, dTmp);
580
581 // epsilon -= tmp
582 dEpsilon->sub_scaled(_scalarOne, dTmp);
583
584 // Since this class is constructed for consistent mapping per default, we have to delete unused memory and initialize conservative variables
585 if (nullptr == _matrixQQ_T) {
586 _matrixQ_TQ->clear();
587 _deviceExecutor->synchronize();
588 _matrixQQ_T = gko::share(GinkgoMatrix::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], _matrixQ_T->get_size()[1]}));
589
591
592 auto polynomialSolverFactory = cg::build()
593 .with_criteria(gko::stop::Iteration::build()
594 .with_max_iters(static_cast<std::size_t>(40))
595 .on(_deviceExecutor),
596 gko::stop::ResidualNorm<>::build()
597 .with_reduction_factor(1e-6)
598 .with_baseline(gko::stop::mode::initial_resnorm)
599 .on(_deviceExecutor))
600 .on(_deviceExecutor);
601
602 _polynomialSolver = polynomialSolverFactory->generate(_matrixQQ_T);
603
604 _polynomialRhs->clear();
605 _deviceExecutor->synchronize();
606 }
607
608 _polynomialContribution = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQQ_T->get_size()[1], 1}));
609 _polynomialContribution->fill(0.0);
610
611 dEpsilon->scale(_scalarNegativeOne);
612
613 _polynomialRhs = gko::share(GinkgoVector::create(_deviceExecutor, gko::dim<2>{_matrixQ->get_size()[0], dEpsilon->get_size()[1]}));
614
615 _matrixQ->apply(dEpsilon, _polynomialRhs);
616
618
619 // out -= poly
620 dOutput->sub_scaled(_scalarOne, _polynomialContribution);
621 }
622
623 _allocCopyEvent.start();
624 auto output = gko::clone(_hostExecutor, dOutput);
625 _allocCopyEvent.stop();
626
627 for (Eigen::Index i = 0; i < outmatrix.rows(); ++i) {
628 outmatrix(i, col) = output->at(i, 0);
629 }
630 }
631
632 return outmatrix;
633}
634
635template <typename RADIAL_BASIS_FUNCTION_T>
637{
638 return _hostExecutor;
639}
640
641template <typename RADIAL_BASIS_FUNCTION_T>
643{
644 return _matrixA->get_size()[1];
645}
646
647template <typename RADIAL_BASIS_FUNCTION_T>
649{
650 return _matrixA->get_size()[0];
651}
652
653template <typename RADIAL_BASIS_FUNCTION_T>
655{
656 if (nullptr != _rbfSystemMatrix) {
657 _rbfSystemMatrix->clear();
658 }
659 if (nullptr != _matrixA) {
660 _matrixA->clear();
661 }
662 if (nullptr != _matrixV) {
663 _matrixV->clear();
664 }
665 if (nullptr != _matrixQ) {
666 _matrixQ->clear();
667 }
668 if (nullptr != _matrixQ_T) {
669 _matrixQ_T->clear();
670 }
671 if (nullptr != _matrixQ_TQ) {
672 _matrixQ_TQ->clear();
673 }
674 if (nullptr != _rbfCoefficients) {
675 _rbfCoefficients->clear();
676 }
677 if (nullptr != _polynomialRhs) {
678 _polynomialRhs->clear();
679 }
680 if (nullptr != _subPolynomialContribution) {
682 }
683 if (nullptr != _addPolynomialContribution) {
685 }
686 if (nullptr != _polynomialContribution) {
688 }
689}
690
691} // namespace mapping
692} // namespace precice
693
694#endif // PRECICE_NO_GINKGO
#define PRECICE_TRACE(...)
Definition LogMacros.hpp:92
#define PRECICE_INFO(...)
Definition LogMacros.hpp:14
#define PRECICE_CHECK(check,...)
Definition LogMacros.hpp:32
#define PRECICE_ASSERT(...)
Definition assertion.hpp:85
#define PRECICE_UNREACHABLE(...)
Definition assertion.hpp:93
static void initialize(int *argc, char ***argv)
Definition Device.cpp:12
This class provides a lightweight logger.
Definition Logger.hpp:17
GinkgoRadialBasisFctSolver(RADIAL_BASIS_FUNCTION_T basisFunction, const mesh::Mesh &inputMesh, const IndexContainer &inputIDs, const mesh::Mesh &outputMesh, const IndexContainer &outputIDs, std::vector< bool > deadAxis, Polynomial polynomial, MappingConfiguration::GinkgoParameter ginkgoParameter)
Assembles the system matrices and computes the decomposition of the interpolation matrix.
std::shared_ptr< GinkgoMatrix > _matrixQ
Polynomial matrix of the input mesh (for separate polynomial)
void _solveRBFSystem(const std::shared_ptr< GinkgoVector > &rhs) const
std::shared_ptr< gko::stop::Iteration::Factory > _iterationCriterion
MappingConfiguration::GinkgoParameter _ginkgoParameter
std::shared_ptr< gko::Executor > getReferenceExecutor() const
std::shared_ptr< GinkgoMatrix > _matrixV
Polynomial matrix of the output mesh (for separate polynomial)
std::shared_ptr< GinkgoVector > _rbfCoefficients
Stores the calculated coefficients of the RBF interpolation.
std::shared_ptr< GinkgoMatrix > _dQ_T_Rhs
Q^T * b of QR decomposition.
Eigen::MatrixXd solveConservative(const Eigen::MatrixXd &inputData, Polynomial polynomial)
Maps the given input data.
std::shared_ptr< GinkgoVector > _polynomialRhs
Right-hand side of the polynomial system.
std::shared_ptr< gko::LinOp > _matrixQ_TQ
Product Q^T*Q (to solve Q^TQx=Q^Tb)
std::shared_ptr< gko::stop::ResidualNorm<>::Factory > _residualCriterion
std::shared_ptr< GinkgoMatrix > _decompMatrixR
Matrix R of QR decomposition.
std::shared_ptr< GinkgoMatrix > _decompMatrixQ_T
Matrix Q^T of QR decomposition.
std::shared_ptr< GinkgoVector > _subPolynomialContribution
Subtraction of the polynomial contribution.
std::shared_ptr< gko::LinOp > _matrixQ_T
Transposed Polynomial matrix of the input mesh (for separate polynomial) (to solve Q^T*Q*x=Q^T*b)
Eigen::MatrixXd solveConsistent(const Eigen::MatrixXd &inputData, Polynomial polynomial)
Maps the given input data.
std::shared_ptr< gko::stop::ResidualNorm<>::Factory > _absoluteResidualCriterion
std::shared_ptr< GinkgoVector > _addPolynomialContribution
Addition of the polynomial contribution.
std::shared_ptr< GinkgoMatrix > _matrixA
Evaluation matrix (output x input)
std::shared_ptr< triangular > _triangularSolver
Backwards Solver.
std::shared_ptr< gko::LinOp > _matrixQQ_T
Product Q*Q^T.
Container and creator for meshes.
Definition Mesh.hpp:38
int getDimensions() const
Definition Mesh.cpp:99
std::size_t nVertices() const
Returns the number of vertices.
Definition Mesh.cpp:64
Vertex & vertex(VertexID id)
Mutable access to a vertex by VertexID.
Definition Mesh.cpp:42
double coord(int index) const
Returns a coordinate of a vertex.
Definition Vertex.hpp:126
int getDimensions() const
Returns spatial dimensionality of vertex.
Definition Vertex.cpp:7
void start()
Starts or restarts a stopped event.
Definition Event.cpp:28
void stop()
Stops a running event.
Definition Event.cpp:51
void fill_polynomial_matrix(std::shared_ptr< const gko::Executor > exec, bool unifiedMemory, gko::ptr_param< GinkgoMatrix > mtx, gko::ptr_param< const GinkgoMatrix > x, const unsigned int dims)
void create_rbf_system_matrix(std::shared_ptr< const gko::Executor > exec, bool unifiedMemory, gko::ptr_param< GinkgoMatrix > mtx, const std::array< bool, 3 > activeAxis, gko::ptr_param< GinkgoMatrix > supportPoints, gko::ptr_param< GinkgoMatrix > targetPoints, EvalFunctionType f, ::precice::mapping::RadialBasisParameters rbf_params, bool addPolynomial, unsigned int extraDims)
contains data mapping from points to meshes.
const std::map< std::string, GinkgoPreconditionerType > preconditionerTypeLookup
const std::map< std::string, GinkgoSolverType > solverTypeLookup
std::shared_ptr< gko::Executor > create_device_executor(const std::string &execName, bool enableUnifiedMemory)
Polynomial
How to handle the polynomial?
Main namespace of the precice library.
Wrapper struct that is used to transfer RBF-specific parameters to the GPU.