190 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
191 using MemberType =
typename TeamPolicy::member_type;
192 using ScratchView = Kokkos::View<double *, typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
195 Kokkos::deep_copy(qrMatrix, 1.0);
198 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
201 const int batch = team.league_rank();
204 const auto begin = offsets(batch);
205 const int verticesPerCluster = offsets(batch + 1) - begin;
206 const int matrixCols = dim + 1;
212 Kokkos::parallel_for(
213 Kokkos::TeamThreadRange(team, verticesPerCluster),
215 auto globalID = globalIDs(i + begin);
217 for (
int d = 0; d < dim; ++d) {
218 qr(i, d) =
mesh(globalID, d);
231 int &rank = qrP(PBegin + matrixCols);
234 ScratchView work(team.team_scratch(0), 2 * verticesPerCluster);
240 KokkosBatched::TeamVectorQR_WithColumnPivoting<MemberType,
241 KokkosBatched::Algo::QR::Unblocked>::invoke(team, qr, tau,
250 double threshold = 1e-5;
251 if (team.team_rank() == 0) {
252 const double maxp = Kokkos::abs(qr(0, 0));
254 for (
int i = 0; i < rank; ++i) {
255 r +=
static_cast<int>(Kokkos::abs(qr(i, i)) > (threshold * maxp));
265 auto scratchSize = ScratchView::shmem_size(2 * maxClusterSize);
269 std::unique_ptr<TeamPolicy> policy;
270 if constexpr (std::is_same_v<
decltype(teamSize), Kokkos::AUTO_t>) {
271 policy = std::make_unique<TeamPolicy>(nCluster, Kokkos::AUTO);
273 policy = std::make_unique<TeamPolicy>(nCluster, 4, teamSize / 4);
275 policy->set_scratch_size( 0, Kokkos::PerTeam(scratchSize));
276 Kokkos::parallel_for(
"do_batched_qr", *policy,
kernel);
311 int maxInClusterSize,
319 using ExecSpace =
typename MemorySpace::execution_space;
320 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
321 using MemberType =
typename TeamPolicy::member_type;
323 using ScratchSpace =
typename MemorySpace::scratch_memory_space;
324 using ScratchView1d = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
325 using ScratchMatrix = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
327 const auto rbf_params = f.getFunctionParameters();
328 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
330 const int batch = team.league_rank();
332 const auto inBegin = inOffsets(batch);
333 const auto inEnd = inOffsets(batch + 1);
334 const int n = inEnd - inBegin;
336 ScratchMatrix
mesh(team.team_scratch(0), n, dim);
338 Kokkos::parallel_for(
339 Kokkos::TeamThreadRange(team, n), [&](
int i) {
340 auto globalID = globalInIDs(i + inBegin);
341 for (
int d = 0; d < dim; ++d) {
342 mesh(i, d) = inCoords(globalID, d);
347 const auto matrixBegin = matrixOffsets(batch);
356 Kokkos::parallel_for(
357 Kokkos::TeamThreadMDRange(team, n, n),
362 for (
int d = 0; d < dim; ++d) {
363 double diff =
mesh(r, d) -
mesh(c, d);
366 dist = Kokkos::sqrt(dist);
369 double val = f(dist, rbf_params);
372 localMatrix(c, r) = val;
377 auto inBytes = ScratchView1d::shmem_size(maxInClusterSize);
380 Kokkos::parallel_for(
"do_input_assembly", TeamPolicy(nCluster, teamSize).set_scratch_size( 0, Kokkos::PerTeam(dim * inBytes)),
kernel);
496 int avgInClusterSize,
497 int maxInClusterSize,
498 int maxOutClusterSize,
518 using ExecSpace =
typename MemorySpace::execution_space;
519 using TeamPolicy = Kokkos::TeamPolicy<ExecSpace>;
520 using MemberType =
typename TeamPolicy::member_type;
522 using ScratchSpace =
typename MemorySpace::scratch_memory_space;
526 using ScratchView1d = Kokkos::View<double *[1], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
527 using ScratchView4d = Kokkos::View<double *[4], Kokkos::LayoutLeft, ScratchSpace, UnmanagedMemory>;
528 using ScratchVector = Kokkos::View<double *, ScratchSpace, UnmanagedMemory>;
529 using ScratchMatrix = std::conditional_t<!evaluation_op_available || polynomial, ScratchView4d, ScratchView1d>;
530 using ScratchMesh = Kokkos::View<double **, Kokkos::LayoutRight, ScratchSpace, UnmanagedMemory>;
535 const auto rbf_params = f.getFunctionParameters();
538 auto kernel = KOKKOS_LAMBDA(
const MemberType &team)
541 impl::capture_conditional_variables(dim, qrMatrix, qrTau, qrP, inMesh, outMesh, evalOffsets, evalMat, globalOutIDs, f, rbf_params, normalizedWeights, out);
545 const int batch = team.league_rank();
546 const auto inBegin = rhsOffsets(batch);
547 const int inSize = rhsOffsets(batch + 1) - inBegin;
548 const auto outBegin = outOffsets(batch);
549 const int outSize = outOffsets(batch + 1) - outBegin;
553 ScratchMatrix work(team.team_scratch(0), Kokkos::max(4, inSize));
554 auto in = Kokkos::subview(work, std::pair<int, int>(0, inSize), 0);
556 Kokkos::parallel_for(
557 Kokkos::TeamThreadRange(team, inSize), [&](
int i) {
558 auto globalID = globalRhsIDs(i + inBegin);
559 in(i) = rhs(globalID);
563 Kokkos::Array<double, 4> qrCoeffs = {0., 0., 0., 0.};
566 if constexpr (polynomial) {
571 auto in_cp = Kokkos::subview(work, std::pair<int, int>(0, inSize), std::pair<int, int>(1, 2));
572 Kokkos::parallel_for(
573 Kokkos::TeamThreadRange(team, inSize), [&](
int i) { in_cp(i, 0) = in(i); });
577 const int matrixCols = dim + 1;
581 const int rank = qrP(PBegin + matrixCols);
588 if (team.team_rank() == 0) {
592 auto tmp = Kokkos::subview(work, Kokkos::ALL, 2);
593 KokkosBatched::ApplyQ<MemberType,
594 KokkosBatched::Side::Left,
595 KokkosBatched::Trans::Transpose,
596 KokkosBatched::Mode::Serial,
597 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in_cp, tmp);
600 auto in_r = Kokkos::subview(in_cp, std::pair<int, int>(0, rank), 0);
601 auto R = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));
605 KokkosBatched::Uplo::Upper,
606 KokkosBatched::Trans::NoTranspose,
607 KokkosBatched::Diag::NonUnit,
608 KokkosBatched::Mode::Serial,
609 KokkosBatched::Algo::Trsv::Unblocked>::invoke(team, 1.0, R, in_r);
615 for (
int r = 0; r < rank; ++r) {
616 qrCoeffs[r] = in_cp(r, 0);
624 for (
int i = (matrixCols - 1); i >= 0; --i) {
625 Kokkos::kokkos_swap(qrCoeffs[i], qrCoeffs[i + P(i)]);
640 std::optional<ScratchMesh> localInMesh;
642 if constexpr (!evaluation_op_available || polynomial) {
643 localInMesh.emplace(&work(0, 1), inSize, dim);
645 Kokkos::parallel_for(
646 Kokkos::TeamThreadRange(team, inSize),
648 auto globalID = globalRhsIDs(i + inBegin);
651 double sum = qrCoeffs[dim];
653 for (
int d = 0; d < dim; ++d) {
654 sum += inMesh(globalID, d) * qrCoeffs[d];
656 localInMesh->operator()(i, d) = inMesh(globalID, d);
667 auto matStart = matrixOffsets(batch);
673 KokkosBatched::Uplo::Lower,
674 KokkosBatched::Trans::NoTranspose,
675 KokkosBatched::Diag::Unit,
676 KokkosBatched::Mode::Team,
677 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
684 KokkosBatched::Uplo::Upper,
685 KokkosBatched::Trans::NoTranspose,
686 KokkosBatched::Diag::NonUnit,
687 KokkosBatched::Mode::Team,
688 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, A, in);
693 if constexpr (evaluation_op_available) {
695 ScratchVector res(team.team_scratch(1), outSize);
696 Kokkos::parallel_for(
697 Kokkos::TeamThreadRange(team, outSize),
698 [&](
int i) { res(i) = 0; });
703 auto startEval = evalOffsets(batch);
706 KokkosBlas::Experimental::Gemv<
707 KokkosBlas::Mode::Team,
708 KokkosBlas::Algo::Gemv::Blocked>::invoke(team,
'N', 1.0, eval, in, 0.0, res);
714 Kokkos::parallel_for(
715 Kokkos::TeamThreadRange(team, outSize),
717 auto globalID = globalOutIDs(i + outBegin);
720 if constexpr (polynomial) {
723 sum += qrCoeffs[dim];
725 for (
int d = 0; d < dim; ++d) {
726 sum += outMesh(globalID, d) * qrCoeffs[d];
729 auto w = normalizedWeights(i + outBegin);
730 Kokkos::atomic_add(&out(globalID), sum * w);
735 Kokkos::parallel_for(
736 Kokkos::TeamThreadRange(team, outSize), [&](
int r) {
737 auto globalID = globalOutIDs(r + outBegin);
740 Kokkos::Array<double, 3> outVertex = {0., 0., 0.};
741 for (
int d = 0; d < dim; ++d) {
742 outVertex[d] = outMesh(globalID, d);
748 Kokkos::parallel_reduce(
749 Kokkos::ThreadVectorRange(team, inSize),
750 [&](
int c,
double &localSum) {
753 for (
int d = 0; d < dim; ++d) {
754 double diff = outVertex[d] - localInMesh->operator()(c, d);
757 dist = Kokkos::sqrt(dist);
759 double val = f(dist, rbf_params);
760 localSum += val * in(c);
765 if constexpr (polynomial) {
768 sum += qrCoeffs[dim];
770 for (
int d = 0; d < dim; ++d) {
771 sum += outVertex[d] * qrCoeffs[d];
775 auto w = normalizedWeights(r + outBegin);
776 Kokkos::atomic_add(&out(globalID), sum * w);
785 auto inBytes = ScratchVector::shmem_size(std::max(4, maxInClusterSize));
786 auto outBytes = ScratchVector::shmem_size(maxOutClusterSize);
787 if (!evaluation_op_available || polynomial) {
789 inBytes = 4 * inBytes;
793 if (!evaluation_op_available) {
800 auto tmpPol = TeamPolicy(nCluster, Kokkos::AUTO)
802 0, Kokkos::PerTeam(inBytes))
804 1, Kokkos::PerTeam(outBytes));
808 auto policy = TeamPolicy(nCluster, teamSize)
810 0, Kokkos::PerTeam(inBytes))
812 1, Kokkos::PerTeam(outBytes));
814 Kokkos::parallel_for(
"do_batched_solve", policy,
kernel);
823 int maxInClusterSize,
837 using TeamPolicy = Kokkos::TeamPolicy<MemorySpace>;
838 using MemberType =
typename TeamPolicy::member_type;
839 using ScratchView = Kokkos::View<double *[2], typename MemorySpace::scratch_memory_space, UnmanagedMemory>;
842 auto scratchSize = ScratchView::shmem_size(2 * maxInClusterSize);
843 Kokkos::parallel_for(
"do_qr_solve", TeamPolicy(nCluster, Kokkos::AUTO).set_scratch_size(
844 0, Kokkos::PerTeam(scratchSize)),
845 KOKKOS_LAMBDA(
const MemberType &team) {
847 const int batch = team.league_rank();
849 const auto inBegin = inOffsets(batch);
850 const auto inEnd = inOffsets(batch + 1);
851 const int inSize = inEnd - inBegin;
852 const auto outBegin = outOffsets(batch);
853 const auto outEnd = outOffsets(batch + 1);
854 const int outSize = outEnd - outBegin;
857 ScratchView tmp(team.team_scratch(0), inSize, 2);
858 auto in = Kokkos::subview(tmp, Kokkos::ALL, 0);
859 auto work = Kokkos::subview(tmp, Kokkos::ALL, 1);
861 Kokkos::parallel_for(
862 Kokkos::TeamThreadRange(team, inSize),
864 auto globalID = globalInIDs(i + inBegin);
865 in(i) = inData(globalID);
871 const int matrixCols = dim + 1;
875 const int rank = qrP(PBegin + matrixCols);
884 KokkosBatched::TeamVectorApplyQ<MemberType,
885 KokkosBatched::Side::Left,
886 KokkosBatched::Trans::Transpose,
887 KokkosBatched::Algo::ApplyQ::Unblocked>::invoke(team, qr, tau, in, work);
890 auto in_r = Kokkos::subview(in, std::pair<int, int>(0, rank));
891 auto R = Kokkos::subview(qr, std::pair<int, int>(0, rank), std::pair<int, int>(0, rank));
896 KokkosBatched::Uplo::Upper,
897 KokkosBatched::Trans::NoTranspose,
898 KokkosBatched::Diag::NonUnit,
899 KokkosBatched::Mode::Team,
900 KokkosBatched::Algo::Trsv::Blocked>::invoke(team, 1.0, R, in_r);
903 auto res = Kokkos::subview(in, std::pair<int, int>(0, matrixCols));
906 Kokkos::parallel_for(
907 Kokkos::TeamThreadRange(team, rank, matrixCols), [&](
int i) { res(i) = 0; });
912 KokkosBatched::TeamVectorApplyPivot<MemberType,
913 KokkosBatched::Side::Left,
914 KokkosBatched::Direct::Backward>::invoke(team, P, res);
919 Kokkos::parallel_for(
920 Kokkos::TeamThreadRange(team, inBegin, inEnd),
922 auto globalID = globalInIDs(i);
925 double tmp = res(matrixCols - 1);
927 for (
int d = 0; d < dim; ++d)
928 tmp += inMesh(globalID, d) * res(d);
930 Kokkos::atomic_sub(&inData(globalID), tmp);
936 Kokkos::parallel_for(
937 Kokkos::TeamThreadRange(team, outBegin, outEnd),
939 auto globalID = globalOutIDs(i);
942 double tmp = res(matrixCols - 1);
944 for (
int d = 0; d < dim; ++d)
945 tmp += outMesh(globalID, d) * res(d);
948 double w = weights(i);
949 Kokkos::atomic_add(&outData(globalID), tmp * w);