Skip to content

Commit

Permalink
Batched - Dense Serial QR: fixing issue with work array
Browse files Browse the repository at this point in the history
We did not pass the stride of the work array to internal routines
and we are not enforcing a contiguous memory allocation either so
when using subviews to pass the work array, we run into troubles.

Signed-off-by: Luc Berger-Vergiat <[email protected]>
  • Loading branch information
lucbv committed Dec 4, 2024
1 parent 21a1fbd commit f4dfbe9
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 114 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ struct SerialApplyLeftHouseholderInternal {
/* */ ValueType* u2, const int u2s,
/* */ ValueType* a1t, const int a1ts,
/* */ ValueType* A2, const int as0, const int as1,
/* */ ValueType* w1t) {
typedef ValueType value_type;
/* */ ValueType* w1t, const int ws0) {
using value_type = ValueType;

/// u2 m x 1
/// a1t 1 x n
Expand All @@ -54,15 +54,15 @@ struct SerialApplyLeftHouseholderInternal {
for (int j = 0; j < n; ++j) {
value_type tmp = a1t[j * a1ts];
for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits<value_type>::conj(u2[i * u2s]) * A2[i * as0 + j * as1];
w1t[j] = tmp * inv_tau; // /= (*tau);
w1t[j * ws0] = tmp * inv_tau; // /= (*tau);
}

// a1t -= w1t (axpy)
for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j];
for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j * ws0];

// A2 -= u2 w1t (ger)
for (int j = 0; j < n; ++j)
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j];
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j * ws0];

return 0;
}
Expand All @@ -74,8 +74,8 @@ struct SerialApplyRightHouseholderInternal {
/* */ ValueType* u2, const int u2s,
/* */ ValueType* a1, const int a1s,
/* */ ValueType* A2, const int as0, const int as1,
/* */ ValueType* w1) {
typedef ValueType value_type;
/* */ ValueType* w1, const int ws0) {
using value_type = ValueType;
/// u2 n x 1
/// a1 m x 1
/// A2 m x n
Expand All @@ -93,15 +93,16 @@ struct SerialApplyRightHouseholderInternal {
for (int i = 0; i < m; ++i) {
value_type tmp = a1[i * a1s];
for (int j = 0; j < n; ++j) tmp += A2[i * as0 + j * as1] * u2[j * u2s];
w1[i] = tmp * inv_tau; // \= (*tau);
w1[i * ws0] = tmp * inv_tau; // \= (*tau);
}

// a1 -= w1 (axpy)
for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i];
for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i * ws0];

// A2 -= w1 * u2' (ger with conjugate)
for (int j = 0; j < n; ++j)
for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits<ValueType>::conj(u2[j * u2s]);
for (int i = 0; i < m; ++i)
A2[i * as0 + j * as1] -= w1[i * ws0] * Kokkos::ArithTraits<ValueType>::conj(u2[j * u2s]);

return 0;
}
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Left, Trans::NoTranspose, Algo::Ap
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_LeftForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.stride_0());
}

template <>
Expand All @@ -42,7 +42,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Left, Trans::Transpose, Algo::Appl
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_LeftBackwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.stride_0());
}

template <>
Expand All @@ -51,7 +51,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ<Side::Right, Trans::NoTranspose, Algo::A
const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) {
return SerialApplyQ_RightForwardInternal::invoke(B.extent(0), B.extent(1), A.extent(1), A.data(), A.stride_0(),
A.stride_1(), t.data(), t.stride_0(), B.data(), B.stride_0(),
B.stride_1(), w.data());
B.stride_1(), w.data(), w.strid_0());
}

} // namespace KokkosBatched
Expand Down
13 changes: 6 additions & 7 deletions batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct SerialApplyQ_LeftForwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -73,7 +73,7 @@ struct SerialApplyQ_LeftForwardInternal {
/// -----------------------------------------------------
// left apply householder to partitioned B1 and B2
SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0,
bs1, w);
bs1, w, ws);

/// -----------------------------------------------------
A_part2x2.mergeToABR(A_part3x3);
Expand All @@ -90,7 +90,7 @@ struct SerialApplyQ_LeftBackwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -127,8 +127,7 @@ struct SerialApplyQ_LeftBackwardInternal {
/// -----------------------------------------------------
// left apply householder to partitioned B1 and B2
SerialApplyLeftHouseholderInternal::invoke(m_A2, n, tau, A_part3x3.A21, as0, B_part3x1.A1, bs1, B_part3x1.A2, bs0,
bs1, w);

bs1, w, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand All @@ -144,7 +143,7 @@ struct SerialApplyQ_RightForwardInternal {
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *B, const int bs0, const int bs1,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A that includes a series of householder vectors,
Expand Down Expand Up @@ -181,7 +180,7 @@ struct SerialApplyQ_RightForwardInternal {
/// -----------------------------------------------------
// right apply householder to partitioned B1 and B2
SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, B_part1x3.A2,
bs0, bs1, w);
bs0, bs1, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct SerialQR_FormQ_Internal {
/* */ ValueType* A, const int as0, const int as1,
/* */ ValueType* t, const int ts,
/* */ ValueType* Q, const int qs0, const int qs1,
/* */ ValueType* w, const bool is_Q_zero = false) {
typedef ValueType value_type;
/* */ ValueType* w, const int ws, const bool is_Q_zero = false) {
using value_type = ValueType;

/// Given a matrix A that includes QR factorization
/// it forms a unitary matrix Q
Expand All @@ -58,7 +58,7 @@ struct SerialQR_FormQ_Internal {
SerialSetIdentityInternal::invoke(m, m, Q, qs0, qs1);
}

return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w);
return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w, ws);
}
};

Expand Down
2 changes: 1 addition & 1 deletion batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ template <typename AViewType, typename tViewType, typename wViewType>
KOKKOS_INLINE_FUNCTION int SerialQR<Algo::QR::Unblocked>::invoke(const AViewType &A, const tViewType &t,
const wViewType &w) {
return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(),
t.stride_0(), w.data());
t.stride_0(), w.data(), w.stride_0());
}

} // namespace KokkosBatched
Expand Down
4 changes: 2 additions & 2 deletions batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct SerialQR_Internal {
const int n, // n = NumCols(A)
/* */ ValueType *A, const int as0, const int as1,
/* */ ValueType *t, const int ts,
/* */ ValueType *w) {
/* */ ValueType *w, const int ws) {
typedef ValueType value_type;

/// Given a matrix A, it computes QR decomposition of the matrix
Expand Down Expand Up @@ -69,7 +69,7 @@ struct SerialQR_Internal {

// left apply householder to A22
SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1,
A_part3x3.A22, as0, as1, w);
A_part3x3.A22, as0, as1, w, ws);
/// -----------------------------------------------------
A_part2x2.mergeToATL(A_part3x3);
t_part2x1.mergeToAT(t_part3x1);
Expand Down
11 changes: 6 additions & 5 deletions batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,12 @@ struct SerialSVDInternal {
if (n - i > 1) {
KokkosBatched::SerialApplyLeftHouseholderInternal::invoke<value_type>(
m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1),
As0, As1, work);
As0, As1, work, 1);
}
if (U) {
KokkosBatched::SerialApplyRightHouseholderInternal::invoke<value_type>(
m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work);
KokkosBatched::SerialApplyRightHouseholderInternal::invoke<value_type>(m, m - i - 1, &tau, &SVDIND(A, i + 1, i),
As0, &SVDIND(U, 0, i), Us0,
&SVDIND(U, 0, i + 1), Us0, Us1, work, 1);
}
// Zero out A subdiag explicitly (NOTE: may not be necessary...)
for (int j = i + 1; j < m; j++) {
Expand All @@ -193,12 +194,12 @@ struct SerialSVDInternal {
if (m - i > 1) {
KokkosBatched::SerialApplyRightHouseholderInternal::invoke<value_type>(
m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0,
&SVDIND(A, i + 1, i + 2), As0, As1, work);
&SVDIND(A, i + 1, i + 2), As0, As1, work, 1);
}
if (Vt) {
KokkosBatched::SerialApplyLeftHouseholderInternal::invoke<value_type>(
n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0,
Vts1, work);
Vts1, work, 1);
}
// Zero out A superdiag row explicitly
for (int j = i + 2; j < n; j++) {
Expand Down
6 changes: 3 additions & 3 deletions batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,11 @@ struct ApplyQ {
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t,
const BViewType &B, const wViewType &w) {
int r_val = 0;
if (std::is_same<ArgMode, Mode::Serial>::value) {
if constexpr (std::is_same_v<ArgMode, Mode::Serial>) {
r_val = SerialApplyQ<ArgSide, ArgTrans, ArgAlgo>::invoke(A, t, B, w);
} else if (std::is_same<ArgMode, Mode::Team>::value) {
} else if constexpr (std::is_same_v<ArgMode, Mode::Team>) {
r_val = TeamApplyQ<MemberType, ArgSide, ArgTrans, ArgAlgo>::invoke(member, A, t, B, w);
} else if (std::is_same<ArgMode, Mode::Team>::value) {
} else if constexpr (std::is_same_v<ArgMode, Mode::TeamVector>) {
r_val = TeamVectorApplyQ<MemberType, ArgSide, ArgTrans, ArgAlgo>::invoke(member, A, t, B, w);
}
return r_val;
Expand Down
Loading

0 comments on commit f4dfbe9

Please sign in to comment.