preCICE
Loading...
Searching...
No Matches
KokkosPUMKernels_Impl.hpp
Go to the documentation of this file.
1#include <KokkosBatched_LU_Decl.hpp>
2#include <KokkosBatched_Util.hpp>
3
4#include <KokkosBatched_ApplyPivot_Decl.hpp>
5#include <KokkosBatched_ApplyQ_Decl.hpp>
6#include <KokkosBatched_QR_WithColumnPivoting_Decl.hpp>
7#include <KokkosBatched_Trsv_Decl.hpp>
8#include <KokkosBlas2_gemv.hpp>
9
11#include "math/math.hpp"
12
13#include <functional>
14#include <optional>
15
18
20
21namespace impl {
22
23// Enum to use when computing the team size
29
30// Helper to compute the power of 2^x for the given n and the above,
31// i.e., either the flooring, ceiling, or the closest one
32// used only below to determine a reasonable team size
33inline int powerOfTwo(int n, const Pow2Mode mode)
34{
35 if (n < 1)
36 return 1;
37 // find highest power-of-two <= n
38 int pow2 = 1;
39 while (pow2 <= n) {
40 pow2 = pow2 * 2;
41 }
42 // now that we have the value larger, we look for the pow2 below n
43 int lower = pow2 / 2;
44
45 if (mode == Pow2Mode::Larger) {
46 return pow2;
47 } else if (mode == Pow2Mode::Smaller) {
48 return lower;
49 } else {
51 // decide whether n is closer to lower or upper (pow2)
52 if (n - lower < pow2 - n) {
53 return lower;
54 } else {
55 return pow2;
56 }
57 }
58}
59
60// Try to find a good team size based on the cluster size
61// avgWork is in our case simply the average cluster size, as it determines
62// the local matrix sizes
63template <typename ExecSpace, typename FunctorType, typename Policy>
64auto findTeamSize(int avgWork, const Pow2Mode mode, const FunctorType &functor, const Policy &policy)
65{
66 // If using OpenMP, Kokkos::AUTO works best
67 if constexpr (std::is_same_v<ExecSpace, Kokkos::HostSpace::execution_space>) {
68 return Kokkos::AUTO;
69 } else {
70 // Compute the power of two according to the configuration
71 int teamSize = powerOfTwo(avgWork, mode);
72 // Ensure minimum of one warp for the GPU to avoid partial-warp inefficiency
73 int warpSize = Kokkos::TeamPolicy<ExecSpace>::vector_length_max();
74 if (teamSize < warpSize) {
75 teamSize = warpSize;
76 }
77 // Query Kokkos for the max recommended team size for this functor
78 // (takes also memory constraints into account)
79 int maxRecommended = policy.team_size_recommended(functor, Kokkos::ParallelForTag());
80 if (teamSize > maxRecommended) {
81 teamSize = maxRecommended;
82 }
83 return teamSize;
84 }
85}
86
87// small helper function to make the compiler handle variables in lambdas which are only conditionally used
88template <typename... Args>
89KOKKOS_INLINE_FUNCTION constexpr void capture_conditional_variables(const Args &...) {}
90} // namespace impl
91
92// For within the kernels
93// The batch matrix has a default layout, which is consistent across kernels
// Both aliases wrap raw pointers into the flat batch storage; given the
// UnmanagedMemory trait they presumably do not own or deallocate the
// referenced memory (NOTE(review): confirm against the trait's definition).
94template <typename MemorySpace = ExecutionSpace>
95using BatchMatrix = Kokkos::View<double **, MemorySpace, UnmanagedMemory>;
96template <typename T = double *, typename MemorySpace = ExecutionSpace>
97using BatchVector = Kokkos::View<T, MemorySpace, UnmanagedMemory>;
98
// Computes the partition-of-unity weights: for every vertex of every cluster,
// evaluate the weight function w at the distance to the cluster center, clamp
// the value away from zero, and normalize so that the weights of all clusters
// covering a mesh vertex sum to one.
// The kernel additionally reads the cluster layout (offsets, globalIDs, mesh).
// Returns false if any mesh vertex ended up with a zero weight sum, i.e., it
// is not covered by any cluster (pure sanity check), true otherwise.
99template <typename MemorySpace>
100bool compute_weights(const int nCenters,
101 const int avgOutClusterSize,
102 const offset_1d_type nWeights,
103 const int nMeshVertices,
104 const int dim,
106 MeshView<MemorySpace> centers,
109 const CompactPolynomialC2 &w,
110 VectorView<MemorySpace> normalizedWeights)
111{
112 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
113 using TeamMember = typename TeamPolicy::member_type;
114
115 VectorView<MemorySpace> weightSum("weightSum", nMeshVertices);
116 Kokkos::deep_copy(weightSum, 0.0);
117 Kokkos::fence();
118
119 const auto rbf_params = w.getFunctionParameters();
120
121 // We launch one team per local system
122 auto kernel = KOKKOS_LAMBDA(const TeamMember &team)
123 {
124 const int batch = team.league_rank();
125 const auto begin = offsets(batch);
126 const auto end = offsets(batch + 1);
127
128 Kokkos::parallel_for(
129 Kokkos::TeamThreadRange(team, begin, end),
130 [&](offset_1d_type i) {
131 auto globalID = globalIDs(i);
132 double dist = 0.0;
133 for (int d = 0; d < dim; ++d) {
134 double diff = mesh(globalID, d) - centers(batch, d);
135 dist += diff * diff;
136 }
137 dist = Kokkos::sqrt(dist);
138 double val = w(dist, rbf_params);
139 // is NUMERICAL_ZERO_DIFFERENCE_DEVICE; clamp away from zero so the
139 // later normalization cannot divide by zero. Kokkos::max instead of
139 // std::max for device portability (consistent with Kokkos::sqrt above)
140 double res = Kokkos::max(val, 1.0e-14);
141 normalizedWeights(i) = res;
142
143 Kokkos::atomic_add(&weightSum(globalID), res);
144 }); // TeamThreadRange
145 };
146
147 // auto teamSize = impl::findTeamSize<typename MemorySpace::execution_space>(avgOutClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCenters, Kokkos::AUTO));
148 Kokkos::parallel_for("compute_weights", TeamPolicy(nCenters, Kokkos::AUTO), kernel);
149
150 // Check for output mesh vertices which are unassigned
151 // This check is a pure sanity check
152 bool hasZero = false;
153 Kokkos::parallel_reduce(
154 "check_zero",
155 Kokkos::RangePolicy<MemorySpace>(0, nMeshVertices),
156 KOKKOS_LAMBDA(int i, bool &local) {
157 if (weightSum(i) == 0.0)
158 local = true;
159 },
160 Kokkos::LOr<bool>(hasZero));
161
162 if (hasZero) {
163 return false;
164 }
165
166 // Now scale back the sum
167 Kokkos::parallel_for(
168 "scale_weights",
169 Kokkos::RangePolicy<MemorySpace>(0, nWeights),
170 KOKKOS_LAMBDA(const int i) {
171 const int id = globalIDs(i);
172 normalizedWeights(i) /= weightSum(id);
173 });
174
175 return true;
176}
177
// Computes, per cluster, an in-place column-pivoted QR decomposition of the
// local polynomial matrix [coords | 1] used for the linear polynomial term.
// The kernel reads the cluster layout (offsets, globalIDs, mesh) and writes
// the factorization into the flat views qrMatrix (factors), qrTau
// (Householder scalars) and qrP (pivots); the extra trailing entry of each
// cluster's qrP slice stores the re-computed numerical rank.
178template <typename MemorySpace>
179void do_batched_qr(int nCluster,
180 int dim,
181 int avgClusterSize,
182 int maxClusterSize,
189{
190 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
191 using MemberType = typename TeamPolicy::member_type;
192 using ScratchView = Kokkos::View<double *, typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
193
194 // First, we fill the entire matrix with ones such that we don't need to fill the 1 separately later
195 Kokkos::deep_copy(qrMatrix, 1.0);
196 Kokkos::fence();
197
198 auto kernel = KOKKOS_LAMBDA(const MemberType &team)
199 {
200 // Step 1: define some pointers we need
201 const int batch = team.league_rank();
202
203 // For the batch
204 const auto begin = offsets(batch);
205 const int verticesPerCluster = offsets(batch + 1) - begin; // the local cluster size
206 const int matrixCols = dim + 1; // our polyParams
207 const offset_1d_type matrixBegin = begin * matrixCols;
208
209 // Step 2: fill the polynomial matrix (one row per cluster vertex;
209 // the last column keeps the 1 written by the deep_copy above)
210 BatchMatrix<MemorySpace> qr(&qrMatrix(matrixBegin), verticesPerCluster, matrixCols);
211
212 Kokkos::parallel_for(
213 Kokkos::TeamThreadRange(team, verticesPerCluster),
214 [&](int i) {
215 auto globalID = globalIDs(i + begin);
216 // the 1 is already set in the last column
217 for (int d = 0; d < dim; ++d) {
218 qr(i, d) = mesh(globalID, d);
219 }
220 });
221
222 // Step 3: Compute the QR decomposition
223 const offset_1d_type tauBegin = batch * matrixCols;
224 // the 1 here is for the local rank and has nothing to do with the permutation itself
225 const offset_1d_type PBegin = batch * (matrixCols + 1);
226
227 BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
228 BatchVector<int *, MemorySpace> P(&qrP(PBegin), matrixCols);
229
230 // Essentially P.end() - 1 = matrixCols + 1 - 1
231 int &rank = qrP(PBegin + matrixCols);
232
233 // The scratch memory (shared memory for the device)
234 ScratchView work(team.team_scratch(0), 2 * verticesPerCluster);
235
236 // auto b = Kokkos::subview(work, std::pair<int, int>(0, n));
237 // auto res = Kokkos::subview(work, std::pair<int, int>(n, n + m));
238 // A Serial QR_WithColumnPivoting would probably be better suited here, but this is as of now not available
239 // The workload to put a Team on the whole matrix is most likely not a good balance
240 KokkosBatched::TeamVectorQR_WithColumnPivoting<MemberType,
241 KokkosBatched::Algo::QR::Unblocked>::invoke(team, qr, tau,
242 P, work,
243 rank);
244 // We have to define our own criterion for the rank, as the one provided is not stable enough
245 // A pivot is considered nonzero if its absolute value is strictly greater than threshold x |maxpivot|, where maxpivot is the biggest pivot.
246 // We use 1e-5 as threshold, which is much stricter than the Kokkos internal criterion
247 // The kokkos internal criterion can be found in KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp
248 // if diagonal value is smaller than threshold(10 * max_diag * ats::epsilon())
249 // note that the pivoted algorithm aborts once the rank deficiency is detected, thus, values larger than rank just contain rubbish
250 double threshold = 1e-5;
251 if (team.team_rank() == 0) {
252 const double maxp = Kokkos::abs(qr(0, 0)); // largest pivot
253 int r = 0;
254 for (int i = 0; i < rank; ++i) {
255 r += static_cast<int>(Kokkos::abs(qr(i, i)) > (threshold * maxp));
256 }
257 rank = r;
258 }
259 // parallel_for
260 };
261
262 // Required as workspace for the pivoted QR, see code comment
263 // workspace (norm and householder application, 2 * max(m,n) is needed)
264
265 auto scratchSize = ScratchView::shmem_size(2 * maxClusterSize);
266 auto teamSize = impl::findTeamSize<typename MemorySpace::execution_space>(avgClusterSize, impl::Pow2Mode::Smaller, kernel, TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(scratchSize)));
267 // const int VL = TeamPolicy::vector_length_max();
268 // The inner loop uses vector parallelism, so we should try to configure accordingly
269 std::unique_ptr<TeamPolicy> policy;
270 if constexpr (std::is_same_v<decltype(teamSize), Kokkos::AUTO_t>) {
271 policy = std::make_unique<TeamPolicy>(nCluster, Kokkos::AUTO);
272 } else {
// NOTE(review): team size 4 with vector length teamSize / 4 — presumably
// tuned for the TeamVector QR; confirm teamSize >= 4 is guaranteed here
273 policy = std::make_unique<TeamPolicy>(nCluster, 4, teamSize / 4);
274 }
275 policy->set_scratch_size(/* level = */ 0, Kokkos::PerTeam(scratchSize));
276 Kokkos::parallel_for("do_batched_qr", *policy, kernel);
277}
278
// Computes an exclusive prefix sum of per-cluster matrix sizes: for cluster i
// the local matrix has (src1(i+1)-src1(i)) rows and (src2(i+1)-src2(i))
// columns, and dst(i+1) receives the running total of entries, so dst can be
// used to index into the flat batched matrix storage.
// NOTE(review): dst(0) is not written here — presumably the view is
// zero-initialized on construction; confirm at the call sites.
279template <typename MemorySpace>
281 MatrixOffsetView<MemorySpace> dst, int nCluster)
282{
283 PRECICE_ASSERT(src1.extent(0) == src2.extent(0));
284 PRECICE_ASSERT(src2.extent(0) == dst.extent(0));
285 Kokkos::parallel_scan("compute_offsets", nCluster, KOKKOS_LAMBDA(const int i, offset_2d_type &update, const bool final) {
286 // Number of rows for local system i
287 int nrows = src1(i + 1) - src1(i);
288 // Number of columns for local system i
289 int ncols = src2(i + 1) - src2(i);
290
291 // Number of entries in the i-th local matrix
292 int localSize = nrows * ncols;
293
294 // Add to running sum
295 update += static_cast<offset_2d_type>(localSize);
296
297 // 'final == true' indicates we should write to matrixOffsets
298 if (final) {
299 // matrixOffsets(i+1) = partial sum up to i
300 dst(i + 1) = update;
301 }
302 // end parallel_scan
303 });
304}
305
// Assembles, per cluster, the dense n x n RBF interpolation matrix over the
// cluster's own input vertices: entry (r,c) is f(|x_r - x_c|). The cluster's
// coordinates are first staged into team scratch memory, then the matrix is
// written into the flat `matrices` view at matrixOffsets(batch).
306template <typename EvalFunctionType, typename MemorySpace>
308 int nCluster, // Number of local systems
309 int dim, // Dimension of points
310 int avgClusterSize,
311 int maxInClusterSize,
312 EvalFunctionType f,
313 const VectorOffsetView<MemorySpace> &inOffsets, // vertex offsets (length N+1)
314 const GlobalIDView<MemorySpace> &globalInIDs,
315 const MeshView<MemorySpace> &inCoords, // meshes
316 const MatrixOffsetView<MemorySpace> &matrixOffsets,
317 VectorView<MemorySpace> matrices) // 1D view of batched matrices
318{
319 using ExecSpace = typename MemorySpace::execution_space;
320 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
321 using MemberType = typename TeamPolicy::member_type;
322
323 using ScratchSpace = typename MemorySpace::scratch_memory_space;
324 using ScratchView1d = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
325 using ScratchMatrix = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
326
327 const auto rbf_params = f.getFunctionParameters();
328 auto kernel = KOKKOS_LAMBDA(const MemberType &team)
329 {
330 const int batch = team.league_rank();
331 // Ranges
332 const auto inBegin = inOffsets(batch);
333 const auto inEnd = inOffsets(batch + 1);
334 const int n = inEnd - inBegin;
335
// Stage the cluster's vertex coordinates in scratch (shared) memory,
// LayoutRight so one vertex's coordinates are contiguous
336 ScratchMatrix mesh(team.team_scratch(0), n, dim);
337
338 Kokkos::parallel_for(
339 Kokkos::TeamThreadRange(team, n), [&](int i) {
340 auto globalID = globalInIDs(i + inBegin);
341 for (int d = 0; d < dim; ++d) {
342 mesh(i, d) = inCoords(globalID, d);
343 }
344 });
345
346 // The matrix offset
347 const auto matrixBegin = matrixOffsets(batch);
348
349 team.team_barrier();
350
351 // Create an unmanaged 2D subview pointing into matrices
352 // This constructor: View(pointer, layout)
353 BatchMatrix<MemorySpace> localMatrix(&matrices(matrixBegin), n, n);
354
355 // Now fill localMatrix(r,c). We'll do a standard 2D nested parallel loop
356 Kokkos::parallel_for(
357 Kokkos::TeamThreadMDRange(team, n, n),
358 [=](int r, int c) {
359 // global indices in the original support/target arrays
360 // 1) Compute Euclidean distance
361 double dist = 0;
362 for (int d = 0; d < dim; ++d) {
363 double diff = mesh(r, d) - mesh(c, d);
364 dist += diff * diff;
365 }
366 dist = Kokkos::sqrt(dist);
367
368 // 2) Evaluate the RBF
369 double val = f(dist, rbf_params);
370
371 // 3) Store into localMatrix (2D); indexing (c, r) is equivalent to
371 // (r, c) here since the distance is symmetric in r and c
372 localMatrix(c, r) = val;
373 }); // TeamThreadMDRange
374 };
375
376 // We put the solution and the in data values into shared memory
377 auto inBytes = ScratchView1d::shmem_size(maxInClusterSize);
378
379 auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(/* level = */ 0, Kokkos::PerTeam(dim * inBytes)));
380 Kokkos::parallel_for("do_input_assembly", TeamPolicy(nCluster, teamSize).set_scratch_size(/* level = */ 0, Kokkos::PerTeam(dim * inBytes)), kernel);
381}
382
383// TODO: Using Kokkos::LayoutRight for the Coords performs a bit better for the assembly,
384// but it might deteriorate performance related to the polynomials. Especially for the gemv
385// for the polynomial contributions etc
386// For the full GPU porting, the Layout can only be LayoutRight, as we don't access the coordinates
387// coalesced, but rather first pick what we need
// Assembles, per cluster, the rectangular (nrows x ncols) RBF evaluation
// matrix between the cluster's target vertices (rows) and its input vertices
// (columns): entry (r,c) is f(|x_target_r - x_in_c|). The matrix is written
// into the flat `matrices` view at matrixOffsets(batch).
388template <typename EvalFunctionType, typename MemorySpace>
390 int nCluster, // Number of local systems
391 int dim, // Dimension of points
392 int avgClusterSize,
393 EvalFunctionType f,
394 const VectorOffsetView<MemorySpace> &inOffsets, // vertex offsets (length N+1)
395 const GlobalIDView<MemorySpace> &globalInIDs,
396 const MeshView<MemorySpace> &inCoords, // meshes
397 const VectorOffsetView<MemorySpace> &targetOffsets,
398 const GlobalIDView<MemorySpace> &globalTargetIDs,
399 const MeshView<MemorySpace> &targetCoords,
400 const MatrixOffsetView<MemorySpace> &matrixOffsets,
401 VectorView<MemorySpace> matrices) // 1D view of batched matrices
402{
403 using ExecSpace = typename MemorySpace::execution_space;
404 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
405 using MemberType = typename TeamPolicy::member_type;
406
407 const auto rbf_params = f.getFunctionParameters();
408 auto kernel = KOKKOS_LAMBDA(const MemberType &team)
409 {
410 const int batch = team.league_rank();
411 // Ranges
412 const auto inBegin = inOffsets(batch);
413 const auto inEnd = inOffsets(batch + 1);
414 const auto targetBegin = targetOffsets(batch);
415 const auto targetEnd = targetOffsets(batch + 1);
416
417 // For our batched matrix, this results in
418 const int nrows = targetEnd - targetBegin;
419 const int ncols = inEnd - inBegin;
420
421 // The matrix offset
422 const auto matrixBegin = matrixOffsets(batch);
423 // const size_t matrixEnd = matrixOffsets(batch + 1);
424
425 // Create an unmanaged 2D subview pointing into matrices
426 // This constructor: View(pointer, layout)
427 BatchMatrix<MemorySpace> localMatrix(&matrices(matrixBegin), nrows, ncols);
428
429 // Now fill localMatrix(r,c). We'll do a standard 2D nested parallel loop
430 Kokkos::parallel_for(
431 Kokkos::TeamThreadMDRange(team, nrows, ncols),
432 [=](int r, int c) {
433 // global indices in the original support/target arrays
434 offset_1d_type targetIdx = targetBegin + r;
435 offset_1d_type inIdx = inBegin + c;
436
437 auto globalIn = globalInIDs(inIdx);
438 auto globalTarget = globalTargetIDs(targetIdx);
439 // 1) Compute Euclidean distance
440 double dist = 0;
441 for (int d = 0; d < dim; ++d) {
442 double diff = inCoords(globalIn, d) - targetCoords(globalTarget, d);
443 dist += diff * diff;
444 }
445 dist = Kokkos::sqrt(dist);
446
447 // 2) Evaluate the RBF
448 double val = f(dist, rbf_params);
449
450 // 3) Store into localMatrix (2D)
451 localMatrix(r, c) = val;
452 }); // TeamThreadMDRange
453 };
454
455 auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO));
456 Kokkos::parallel_for("do_batched_assembly", TeamPolicy(nCluster, teamSize), kernel);
457}
458
// Computes, per cluster, an in-place LU decomposition of the square local RBF
// system matrix stored in the flat `matrices` view. The matrix dimension n is
// recovered as the square root of the cluster's flat slot size, i.e., the
// local matrices are assumed to be square.
// NOTE(review): TeamLU performs no pivoting — presumably acceptable for
// these RBF system matrices; confirm conditioning assumptions.
459template <typename MemorySpace>
461 int nCluster,
462 int avgClusterSize,
463 const MatrixOffsetView<MemorySpace> &matrixOffsets,
465{
466 using ExecSpace = typename MemorySpace::execution_space;
467 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
468 using MemberType = typename TeamPolicy::member_type;
469
470 auto kernel = KOKKOS_LAMBDA(const MemberType &team)
471 {
472 const int i = team.league_rank();
473 auto start = matrixOffsets(i);
474 auto end = matrixOffsets(i + 1);
// n from the flat slot size; floating-point sqrt of an exact square —
// assumes n is small enough that the cast truncation is exact
475 int n = static_cast<int>(Kokkos::sqrt(end - start));
476
477 BatchMatrix<MemorySpace> A(&matrices(start), n, n);
478
479 KokkosBatched::TeamLU<MemberType, KokkosBatched::Algo::LU::Blocked>::invoke(team, A);
480 // Parallel end
481 };
482
483 // Using Kokkos::AUTO resulted in the best performance for all cases
484 auto teamSize = impl::findTeamSize<ExecSpace>(avgClusterSize, impl::Pow2Mode::Larger, kernel, TeamPolicy(nCluster, Kokkos::AUTO));
485 Kokkos::parallel_for("do_batched_lu", TeamPolicy(nCluster, teamSize), kernel);
486}
487
// Per-cluster solve-and-evaluate kernel. For each cluster it:
//  1. gathers the cluster's right-hand-side values into scratch memory,
//  2. if `polynomial`, solves the precomputed QR system (Q^T b, triangular
//     solve, pivot application) for the linear polynomial coefficients and
//     subtracts the polynomial portion from the in-data,
//  3. solves the RBF system via forward/backward triangular solves against
//     the prefactored (LU) cluster matrix,
//  4. evaluates the output either with a stored evaluation matrix (gemv, if
//     `evaluation_op_available`) or by computing the RBF values on the fly,
//     and atomically accumulates the weighted result into the global output.
492template <bool polynomial, bool evaluation_op_available, typename EvalFunctionType, typename MemorySpace>
494 int nCluster,
495 int dim,
496 int avgInClusterSize,
497 int maxInClusterSize,
498 int maxOutClusterSize,
499 EvalFunctionType f,
500 const VectorOffsetView<MemorySpace> &rhsOffsets,
501 const GlobalIDView<MemorySpace> &globalRhsIDs,
503 const MatrixOffsetView<MemorySpace> &matrixOffsets,
504 const VectorView<MemorySpace> &matrices,
505 const VectorView<MemorySpace> &normalizedWeights,
506 const MatrixOffsetView<MemorySpace> &evalOffsets,
507 const VectorView<MemorySpace> &evalMat,
508 const VectorOffsetView<MemorySpace> &outOffsets,
509 const GlobalIDView<MemorySpace> &globalOutIDs,
511 // For the polynomial required in addition
512 const MeshView<MemorySpace> &inMesh,
513 const MeshView<MemorySpace> &outMesh,
514 const VectorView<MemorySpace> &qrMatrix,
515 const VectorView<MemorySpace> &qrTau,
516 const PivotView<MemorySpace> &qrP)
517{
518 using ExecSpace = typename MemorySpace::execution_space;
519 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
520 using MemberType = typename TeamPolicy::member_type;
521
522 using ScratchSpace = typename MemorySpace::scratch_memory_space;
523 // Layout is important for how we use these matrices: we need to ensure that cols are contiguous in memory
524 // We use the scratch memory for the input data and potentially for workspace required for the
525 // polynomial or for the input mesh in case we have to compute the evaluation on the fly
526 using ScratchView1d = Kokkos::View<double *[1], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
527 using ScratchView4d = Kokkos::View<double *[4], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
528 using ScratchVector = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
529 using ScratchMatrix = std::conditional_t<!evaluation_op_available || polynomial, ScratchView4d, ScratchView1d>;
530 using ScratchMesh = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
531
532 // TODO: Add checks for the matrices (extent() > 0 with the template params)
533 // PRECICE_ASSERT()
534
535 const auto rbf_params = f.getFunctionParameters();
536 // We define the lambda here such that we can query the recommended team size from Kokkos
537 // Launch policy is then handled below
538 auto kernel = KOKKOS_LAMBDA(const MemberType &team)
539 {
540 // Required for correct capturing (mostly by device compilers), as these variables are only conditionally used further down
541 impl::capture_conditional_variables(dim, qrMatrix, qrTau, qrP, inMesh, outMesh, evalOffsets, evalMat, globalOutIDs, f, rbf_params, normalizedWeights, out);
542
543 // Step 1: Define some pointers
544 // TODO: We could potentially remove the rhsOffsets here and use a sqrt instead
545 const int batch = team.league_rank();
546 const auto inBegin = rhsOffsets(batch);
547 const int inSize = rhsOffsets(batch + 1) - inBegin;
548 const auto outBegin = outOffsets(batch);
549 const int outSize = outOffsets(batch + 1) - outBegin;
550
551 // Step 2: Allocate shared memory for the team and fill it with the inData of this cluster
552 // The scratch memory (shared memory for the device)
553 ScratchMatrix work(team.team_scratch(0), Kokkos::max(4, inSize));
554 auto in = Kokkos::subview(work, std::pair<int, int>(0, inSize), 0);
555
556 Kokkos::parallel_for(
557 Kokkos::TeamThreadRange(team, inSize), [&](int i) {
558 auto globalID = globalRhsIDs(i + inBegin);
559 in(i) = rhs(globalID);
560 });
561 team.team_barrier();
562
// Polynomial coefficients (up to 3 linear terms + constant); every
// thread holds its own copy, filled from the shared solve below
563 Kokkos::Array<double, 4> qrCoeffs = {0., 0., 0., 0.};
564
565 // Step 3: Solve the polynomial QR system, if we have one
566 if constexpr (polynomial) {
567
568 // Step 3a: Backup the current in data, since we solve the QR in place
569 // In principle, we need a vector here (just as in), but the ApplyQ routine expects a Rank2 matrix,
570 // so we have to stick to this particular syntax (keeping it rank 2 with one column)
571 auto in_cp = Kokkos::subview(work, std::pair<int, int>(0, inSize), std::pair<int, int>(1, 2));
572 Kokkos::parallel_for(
573 Kokkos::TeamThreadRange(team, inSize), [&](int i) { in_cp(i, 0) = in(i); });
574 team.team_barrier();
575
576 // Step 3b: Define pointers and matrices
577 const int matrixCols = dim + 1;
578 const offset_1d_type qrBegin = inBegin * matrixCols;
579 const offset_1d_type tauBegin = batch * matrixCols;
580 const offset_1d_type PBegin = batch * (matrixCols + 1);
581 const int rank = qrP(PBegin + matrixCols);
582
583 BatchMatrix<MemorySpace> qr(&qrMatrix(qrBegin), inSize, matrixCols);
584 BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
585 BatchVector<int *, MemorySpace> P(&qrP(PBegin), matrixCols);
586
587 // Step 3c: Apply Q on the left of in, i.e., y = Q^T * in
588 if (team.team_rank() == 0) {
589
590 // tmp size might be insufficient: there was no size requirement specified for the workspace
591 // however, it needs to be contiguous
592 auto tmp = Kokkos::subview(work, Kokkos::ALL, 2);
593 KokkosBatched::ApplyQ<MemberType,
594 KokkosBatched::Side::Left,
595 KokkosBatched::Trans::Transpose,
596 KokkosBatched::Mode::Serial,
597 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in_cp, tmp);
598
599 // Step 3d: Solve triangular solve R z = y
600 auto in_r = Kokkos::subview(in_cp, std::pair<int, int>(0, rank), 0);
601 auto R = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));
602
603 KokkosBatched::Trsv<
604 MemberType,
605 KokkosBatched::Uplo::Upper,
606 KokkosBatched::Trans::NoTranspose,
607 KokkosBatched::Diag::NonUnit,
608 KokkosBatched::Mode::Serial,
609 KokkosBatched::Algo::Trsv::Unblocked>::invoke(team, 1.0, R, in_r);
610 }
611
612 team.team_barrier();
613
614 // Step 3e: Copy the result over into memory for each thread
615 for (int r = 0; r < rank; ++r) {
616 qrCoeffs[r] = in_cp(r, 0);
617 }
618
619 // Step 3f: Apply pivoting x = P z
620 // There are also convenience routines for the pivoting, but we let every thread just
621 // apply the pivoting on its own, as it is more compact and the routine doesn't allow
622 // using an Array The below is the equivalent of the following (but qrCoeffs would need to be a view):
623 // KokkosBatched::TeamVectorApplyPivot<MemberType, Side::Left, Direct::Backward>::invoke(team, P, qrCoeffs);
624 for (int i = (matrixCols - 1); i >= 0; --i) {
625 Kokkos::kokkos_swap(qrCoeffs[i], qrCoeffs[i + P(i)]);
626 }
627 }
628
629 // Step 3g: Subtract polynomial portion from the input data: in -= Q * p
630 // In case we have to compute the evaluation operator further down, we
631 // also pull the in mesh into local shared memory
632 // threading over inSize
633 // This vector uses memory we have in principle managed by "work" (for the QR solve as tmp),
634 // but its layout is different (LayoutRight to have the coordinates aligned in memory).
635 // We have to declare it here outside the "if" to be able to use it further down then.
636 // The memory here might point to null (or rather the end of "work"), in case polynomial = false
637 // and the evaluation_op is available but then it also remains unused. We make it here an optional
638 // to prevent Kokkos throwing errors about out-of-bounds accesses, the view is instantiated in the "if"
639 // branch below
640 std::optional<ScratchMesh> localInMesh;
641
642 if constexpr (!evaluation_op_available || polynomial) {
643 localInMesh.emplace(&work(0, 1), inSize, dim);
644
645 Kokkos::parallel_for(
646 Kokkos::TeamThreadRange(team, inSize),
647 [&](int i) {
648 auto globalID = globalRhsIDs(i + inBegin);
649 // The "1"/constant term is the last value in the result
650 // dim is here matrixCols - 1
651 double sum = qrCoeffs[dim];
652 // ... and the linear polynomial
653 for (int d = 0; d < dim; ++d) {
654 sum += inMesh(globalID, d) * qrCoeffs[d];
655 // Put it in shared memory as we later need it in the output evaluation
656 localInMesh->operator()(i, d) = inMesh(globalID, d);
657 }
658 in(i) -= sum;
659 });
660 team.team_barrier();
661 }
662
663 // Step 4: Solve the LU decomposition
664 // The lu inplace lu decomposition computed with KokkosBatched
665 // There is also a convenience routine for the LU solve, but it
666 // uses Trsm under the hood, which is quite a bit slower
667 auto matStart = matrixOffsets(batch);
668 BatchMatrix<MemorySpace> A(&matrices(matStart), inSize, inSize);
669
670 // Forward substitution: solve L * y = b and
671 KokkosBatched::Trsv<
672 MemberType,
673 KokkosBatched::Uplo::Lower,
674 KokkosBatched::Trans::NoTranspose,
675 KokkosBatched::Diag::Unit,
676 KokkosBatched::Mode::Team,
677 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
678
679 team.team_barrier();
680
681 // Backward substitution: solve U * x = y
682 KokkosBatched::Trsv<
683 MemberType,
684 KokkosBatched::Uplo::Upper,
685 KokkosBatched::Trans::NoTranspose,
686 KokkosBatched::Diag::NonUnit,
687 KokkosBatched::Mode::Team,
688 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
689 team.team_barrier();
690
691 // Step 5: Apply the output operator
692 // If we have the evaluation operator, we use it (might be slower though)
693 if constexpr (evaluation_op_available) {
694 // Step 5a: Allocate and zero out a local result vector (more of a safety feature)
695 ScratchVector res(team.team_scratch(1), outSize);
696 Kokkos::parallel_for(
697 Kokkos::TeamThreadRange(team, outSize),
698 [&](int i) { res(i) = 0; });
699 team.team_barrier();
700
701 // Step 5b: Multiply by the evaluation operator
702 // the evaluation matrix
703 auto startEval = evalOffsets(batch);
704 BatchMatrix<MemorySpace> eval(&evalMat(startEval), outSize, inSize);
705 // res := 1.0 * eval * b + 0.0 * res
706 KokkosBlas::Experimental::Gemv<
707 KokkosBlas::Mode::Team,
708 KokkosBlas::Algo::Gemv::Blocked>::invoke(team, 'N', 1.0, eval, in, 0.0, res);
709
710 team.team_barrier();
711
712 // Step 5c: write the weighted result back to the global vector
713 // potentially applying the polynomial term alongside
714 Kokkos::parallel_for(
715 Kokkos::TeamThreadRange(team, outSize),
716 [&](int i) {
717 auto globalID = globalOutIDs(i + outBegin);
718 double sum = res(i);
719 // Add polynomial portion to the output data: out += V * p
720 if constexpr (polynomial) {
721 // The "1"/constant term is the last value in the result
722 // dim is here matrixCols - 1
723 sum += qrCoeffs[dim];
724 // ... and the linear polynomial
725 for (int d = 0; d < dim; ++d) {
726 sum += outMesh(globalID, d) * qrCoeffs[d];
727 }
728 }
729 auto w = normalizedWeights(i + outBegin);
730 Kokkos::atomic_add(&out(globalID), sum * w);
731 }); // TeamThreadRange
732 } else {
733 // Alternative approach: do all in one go:
734 // Step 5a: each thread takes care of one output vertex
735 Kokkos::parallel_for(
736 Kokkos::TeamThreadRange(team, outSize), [&](int r) {
737 auto globalID = globalOutIDs(r + outBegin);
738
739 // we first extract the vertex coordinates
740 Kokkos::Array<double, 3> outVertex = {0., 0., 0.};
741 for (int d = 0; d < dim; ++d) {
742 outVertex[d] = outMesh(globalID, d);
743 }
744
745 // Step 5b: The matrix vector multiplication res = A * in
746 // Accumulate partial dot product in a thread-parallel manner
747 double sum = 0.0;
748 Kokkos::parallel_reduce(
749 Kokkos::ThreadVectorRange(team, inSize),
750 [&](int c, double &localSum) {
751 // compute the local output coefficients
752 double dist = 0;
753 for (int d = 0; d < dim; ++d) {
754 double diff = outVertex[d] - localInMesh->operator()(c, d);
755 dist += diff * diff;
756 }
757 dist = Kokkos::sqrt(dist);
758 // Evaluate the RBF
759 double val = f(dist, rbf_params);
760 localSum += val * in(c);
761 },
762 sum); // ThreadVectorRange
763
764 // Step 5c: if we have the polynomial as well, we have to apply it here
765 if constexpr (polynomial) {
766 // The "1"/constant term is the last value in the result
767 // dim is here matrixCols - 1
768 sum += qrCoeffs[dim];
769 // ... and the linear polynomial
770 for (int d = 0; d < dim; ++d) {
771 sum += outVertex[d] * qrCoeffs[d];
772 }
773 }
774 // Step 5d: Store final (weighted result) in the global out vector
775 auto w = normalizedWeights(r + outBegin);
776 Kokkos::atomic_add(&out(globalID), sum * w);
777 }); // End TeamThreadRange
778 } // end if-merged-evaluation
779 // End Team parallel loop
780 };
781
782 // We allocate one vector for indata and once for outdata
783 // We need at least four entries for the polynomial, seems unlikely for maxInClusterSize to be lower
784 // but we should be certain
785 auto inBytes = ScratchVector::shmem_size(std::max(4, maxInClusterSize));
786 auto outBytes = ScratchVector::shmem_size(maxOutClusterSize);
787 if (!evaluation_op_available || polynomial) {
788 // and additional storage if we have the polynomial
789 inBytes = 4 * inBytes;
790 }
791 // We use the outBytes only for the per-cluster output vector, which
792 // we don't need if we evaluate everything on the fly
793 if (!evaluation_op_available) {
794 outBytes = 0;
795 }
796
797 // We put the solution and the in data values into shared memory
798 // TODO: Avoid the duplicate memory definitions here, currently needed as we
799 // cannot change the Kokkos::AUTO to the actual recommendation later
800 auto tmpPol = TeamPolicy(nCluster, Kokkos::AUTO)
801 .set_scratch_size(
802 /* level = */ 0, Kokkos::PerTeam(inBytes))
803 .set_scratch_size(
804 /* level = */ 1, Kokkos::PerTeam(outBytes));
805
806 auto teamSize = impl::findTeamSize<ExecSpace>(avgInClusterSize, impl::Pow2Mode::Smaller, kernel, tmpPol);
807
808 auto policy = TeamPolicy(nCluster, teamSize)
809 .set_scratch_size(
810 /* level = */ 0, Kokkos::PerTeam(inBytes))
811 .set_scratch_size(
812 /* level = */ 1, Kokkos::PerTeam(outBytes));
813
814 Kokkos::parallel_for("do_batched_solve", policy, kernel);
815}
816
817// Currently not used at all, solves the QR decomposition used for the polynomial
818// but cannot be applied standalone, as we subtract the polynomial part on a per-cluster
819// basis from the inData. Still useful for development purposes
820template <typename MemorySpace>
821void do_qr_solve(int nCluster,
822 int dim,
823 int maxInClusterSize,
825 GlobalIDView<MemorySpace> globalInIDs,
831 const VectorView<MemorySpace> weights,
833 GlobalIDView<MemorySpace> globalOutIDs,
835 MeshView<MemorySpace> outMesh)
836{
837 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
838 using MemberType = typename TeamPolicy::member_type;
839 using ScratchView = Kokkos::View<double *[2], typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
840
841 // Used for the inData solution vector and the QR solving
842 auto scratchSize = ScratchView::shmem_size(2 * maxInClusterSize);
843 Kokkos::parallel_for("do_qr_solve", TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(
844 /* level = */ 0, Kokkos::PerTeam(scratchSize)),
845 KOKKOS_LAMBDA(const MemberType &team) {
846 // Step 1: Some pointers we need
847 const int batch = team.league_rank();
848 // Ranges
849 const auto inBegin = inOffsets(batch);
850 const auto inEnd = inOffsets(batch + 1);
851 const int inSize = inEnd - inBegin;
852 const auto outBegin = outOffsets(batch);
853 const auto outEnd = outOffsets(batch + 1);
854 const int outSize = outEnd - outBegin;
855
856 // Step 2: Collect the inData
857 ScratchView tmp(team.team_scratch(0), inSize, 2);
858 auto in = Kokkos::subview(tmp, Kokkos::ALL, 0);
859 auto work = Kokkos::subview(tmp, Kokkos::ALL, 1); // work size is just a guess at the moment
860
861 Kokkos::parallel_for(
862 Kokkos::TeamThreadRange(team, inSize),
863 [&](int i) {
864 auto globalID = globalInIDs(i + inBegin);
865 in(i) = inData(globalID);
866 });
867
868 team.team_barrier();
869
870 // Step 3: Gather the QR data structures
871 const int matrixCols = dim + 1;
872 const offset_1d_type matrixBegin = inBegin * matrixCols;
873 const offset_1d_type tauBegin = batch * matrixCols;
874 const offset_1d_type PBegin = batch * (matrixCols + 1);
875 const int rank = qrP(PBegin + matrixCols);
876
877 BatchMatrix<MemorySpace> qr(&qrMatrix(matrixBegin), inSize, matrixCols);
878 BatchVector<double *, MemorySpace> tau(&qrTau(tauBegin), matrixCols);
879 BatchVector<int *, MemorySpace> P(&qrP(PBegin), matrixCols);
880
881 // Step 4: Solve the linear least-square system A x = b, in our case Q R P^T x = in
882
883 // Step 4a: Apply Q on the left of in, i.e., y = Q^T * in
884 KokkosBatched::TeamVectorApplyQ<MemberType,
885 KokkosBatched::Side::Left,
886 KokkosBatched::Trans::Transpose,
887 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in, work);
888 team.team_barrier();
889
890 auto in_r = Kokkos::subview(in, std::pair<int, int>(0, rank));
891 auto R = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));
892
893 // Step 4b: Solve triangular solve R z = y
894 KokkosBatched::Trsv<
895 MemberType,
896 KokkosBatched::Uplo::Upper,
897 KokkosBatched::Trans::NoTranspose,
898 KokkosBatched::Diag::NonUnit,
899 KokkosBatched::Mode::Team,
900 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, R, in_r);
901 team.team_barrier();
902
903 auto res = Kokkos::subview(in, std::pair<int, int>(0, matrixCols));
904
905 // Steo 4c: zero out the entries which are not within the rank region
906 Kokkos::parallel_for(
907 Kokkos::TeamThreadRange(team, rank, matrixCols), [&](int i) { res(i) = 0; });
908
909 team.team_barrier();
910
911 // Step 4d: Apply pivoting x = P z
912 KokkosBatched::TeamVectorApplyPivot<MemberType,
913 KokkosBatched::Side::Left,
914 KokkosBatched::Direct::Backward>::invoke(team, P, res);
915 team.team_barrier();
916
917 // Step 5: Subtract polynomial portion from the input data: in -= Q * p
918 // threading over inSize
919 Kokkos::parallel_for(
920 Kokkos::TeamThreadRange(team, inBegin, inEnd),
921 [&](int i) {
922 auto globalID = globalInIDs(i);
923
924 // The "1"/constant term is the last value in the result
925 double tmp = res(matrixCols - 1);
926 // ... and the linear polynomial
927 for (int d = 0; d < dim; ++d)
928 tmp += inMesh(globalID, d) * res(d);
929
930 Kokkos::atomic_sub(&inData(globalID), tmp);
931 });
932 // no barrier needed here
933
934 // Step 6: Add polynomial portion to the output data: out += V * p
935 // threading over outSize
936 Kokkos::parallel_for(
937 Kokkos::TeamThreadRange(team, outBegin, outEnd),
938 [&](offset_1d_type i) {
939 auto globalID = globalOutIDs(i);
940
941 // The "1"/constant term is the last value in the result
942 double tmp = res(matrixCols - 1);
943 // ... and the linear polynomial
944 for (int d = 0; d < dim; ++d)
945 tmp += outMesh(globalID, d) * res(d);
946
947 // and the weight as usual
948 double w = weights(i);
949 Kokkos::atomic_add(&outData(globalID), tmp * w);
950 });
951 // end parallel_for
952 });
953}
954} // namespace precice::mapping::kernel
#define PRECICE_ASSERT(...)
Definition assertion.hpp:85
Wendland radial basis function with compact support.
RadialBasisParameters getFunctionParameters() const
auto findTeamSize(int avgWork, const Pow2Mode mode, const FunctorType &functor, const Policy &policy)
int powerOfTwo(int n, const Pow2Mode mode)
KOKKOS_INLINE_FUNCTION constexpr void capture_conditional_variables(const Args &...)
void do_batched_qr(int nCluster, int dim, int avgClusterSize, int maxClusterSize, VectorOffsetView< MemorySpace > inOffsets, GlobalIDView< MemorySpace > globalInIDs, MeshView< MemorySpace > inMesh, VectorView< MemorySpace > qrMatrix, VectorView< MemorySpace > qrTau, PivotView< MemorySpace > qrP)
void do_batched_assembly(int nCluster, int dim, int avgClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &inOffsets, const GlobalIDView< MemorySpace > &globalInIDs, const MeshView< MemorySpace > &inCoords, const VectorOffsetView< MemorySpace > &targetOffsets, const GlobalIDView< MemorySpace > &globalTargetIDs, const MeshView< MemorySpace > &targetCoords, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
bool compute_weights(const int nCluster, const int avgOutClusterSize, const offset_1d_type nWeights, const int nMeshVertices, const int dim, VectorOffsetView< MemorySpace > offsets, MeshView< MemorySpace > centers, GlobalIDView< MemorySpace > globalIDs, MeshView< MemorySpace > mesh, const CompactPolynomialC2 &w, VectorView< MemorySpace > normalizedWeights)
void do_input_assembly(int nCluster, int dim, int avgClusterSize, int maxInClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &inOffsets, const GlobalIDView< MemorySpace > &globalInIDs, const MeshView< MemorySpace > &inCoords, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
void do_qr_solve(int nCluster, int dim, int maxInClusterSize, VectorOffsetView< MemorySpace > inOffsets, GlobalIDView< MemorySpace > globalInIDs, VectorView< MemorySpace > inData, MeshView< MemorySpace > inMesh, VectorView< MemorySpace > qrMatrix, VectorView< MemorySpace > qrTau, PivotView< MemorySpace > qrP, const VectorView< MemorySpace > weights, VectorOffsetView< MemorySpace > outOffsets, GlobalIDView< MemorySpace > globalOutIDs, VectorView< MemorySpace > outData, MeshView< MemorySpace > outMesh)
void do_batched_solve(int nCluster, int dim, int avgInClusterSize, int maxInClusterSize, int maxOutClusterSize, EvalFunctionType f, const VectorOffsetView< MemorySpace > &rhsOffsets, const GlobalIDView< MemorySpace > &globalRhsIDs, VectorView< MemorySpace > rhs, const MatrixOffsetView< MemorySpace > &matrixOffsets, const VectorView< MemorySpace > &matrices, const VectorView< MemorySpace > &normalizedWeights, const MatrixOffsetView< MemorySpace > &evalOffsets, const VectorView< MemorySpace > &evalMat, const VectorOffsetView< MemorySpace > &outOffsets, const GlobalIDView< MemorySpace > &globalOutIDs, VectorView< MemorySpace > out, const MeshView< MemorySpace > &inMesh, const MeshView< MemorySpace > &outMesh, const VectorView< MemorySpace > &qrMatrix, const VectorView< MemorySpace > &qrTau, const PivotView< MemorySpace > &qrP)
void compute_offsets(const VectorOffsetView< MemorySpace > src1, const VectorOffsetView< MemorySpace > src2, MatrixOffsetView< MemorySpace > dst, int nCluster)
void do_batched_lu(int nCluster, int avgClusterSize, const MatrixOffsetView< MemorySpace > &matrixOffsets, VectorView< MemorySpace > matrices)
Kokkos::View< T, MemorySpace, UnmanagedMemory > BatchVector
Kokkos::View< double **, MemorySpace, UnmanagedMemory > BatchMatrix
ExecutionSpace::size_type offset_2d_type
Kokkos::View< int *, MemorySpace > PivotView
Kokkos::View< offset_2d_type *, MemorySpace > MatrixOffsetView
Kokkos::View< offset_1d_type *, MemorySpace > VectorOffsetView
Kokkos::View< VertexID *, MemorySpace > GlobalIDView
ExecutionSpace::size_type offset_1d_type
Kokkos::View< double *, MemorySpace > VectorView
Kokkos::View< double **, Kokkos::LayoutRight, MemorySpace > MeshView
constexpr T pow_int(const T base)
Computes the power of a given number by an integral exponent given at compile time,...
Definition math.hpp:22
provides Mesh, Data and primitives.
Wrapper struct that is used to transfer RBF-specific parameters to the GPU.