// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_SPARSEDENSEPRODUCT_H
#define EIGEN_SPARSEDENSEPRODUCT_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

template <>
struct product_promote_storage_type<Sparse, Dense, OuterProduct> {
  typedef Sparse ret;
};
template <>
struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
  typedef Sparse ret;
};

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType,
          typename AlphaType = typename DenseResType::Scalar,
          int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
          bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
struct sparse_time_dense_product_impl;

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, true> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
  typedef evaluator<Lhs> LhsEval;
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    LhsEval lhsEval(lhs);

    Index n = lhs.outerSize();
#ifdef EIGEN_HAS_OPENMP
    Index threads = Eigen::nbThreads();
#endif

    for (Index c = 0; c < rhs.cols(); ++c) {
#ifdef EIGEN_HAS_OPENMP
      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
      // It basically represents the minimal amount of work to be done to be worth it.
      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
      } else
#endif
      {
        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
      }
    }
  }

  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
                         const typename Res::Scalar& alpha, Index i, Index col) {
    // Two accumulators, which breaks the dependency chain on the accumulator
    // and allows more instruction-level parallelism in the following loop
    typename Res::Scalar tmp_a(0);
    typename Res::Scalar tmp_b(0);
    for (LhsInnerIterator it(lhsEval, i); it; ++it) {
      tmp_a += it.value() * rhs.coeff(it.index(), col);
      ++it;
      if (it) {
        tmp_b += it.value() * rhs.coeff(it.index(), col);
      }
    }
    res.coeffRef(i, col) += alpha * (tmp_a + tmp_b);
  }
};
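
// Illustrative usage sketch (not part of the library; the variables below are
// hypothetical user code). A row-major sparse matrix times a dense vector is
// handled by the RowMajor specialization above:
//
//   #include <Eigen/Sparse>
//   Eigen::SparseMatrix<double, Eigen::RowMajor> S(1000, 1000);
//   // ... fill S, e.g. via setFromTriplets ...
//   Eigen::VectorXd x = Eigen::VectorXd::Random(1000);
//   Eigen::VectorXd y = Eigen::VectorXd::Zero(1000);
//   y.noalias() += S * x;  // accumulates alpha * S * x with alpha = 1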
// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
// -> let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators
// template<typename T1, typename T2/*, int Lhs/RhsStorageOrder*/>
// struct ScalarBinaryOpTraits<T1, Ref<T2> >
// {
//   enum {
//     Defined = 1
//   };
//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
// };

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;
  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
    LhsEval lhsEval(lhs);
    for (Index c = 0; c < rhs.cols(); ++c) {
      for (Index j = 0; j < lhs.outerSize(); ++j) {
        // typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
        for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
      }
    }
  }
};

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      RowMajor, false> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef evaluator<Lhs> LhsEval;
  typedef typename LhsEval::InnerIterator LhsInnerIterator;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    Index n = lhs.rows();
    LhsEval lhsEval(lhs);

#ifdef EIGEN_HAS_OPENMP
    Index threads = Eigen::nbThreads();
    // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
    // It basically represents the minimal amount of work to be done to be worth it.
    if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
    } else
#endif
    {
      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
    }
  }

  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
                         Index i) {
    typename Res::RowXpr res_i(res.row(i));
    for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
  }
};

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
                                      ColMajor, false> {
  typedef internal::remove_all_t<SparseLhsType> Lhs;
  typedef internal::remove_all_t<DenseRhsType> Rhs;
  typedef internal::remove_all_t<DenseResType> Res;
  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;

  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                  const typename Res::Scalar& alpha) {
    evaluator<Lhs> lhsEval(lhs);
    for (Index j = 0; j < lhs.outerSize(); ++j) {
      typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
      for (LhsInnerIterator it(lhsEval, j); it; ++it) res.row(it.index()) += (alpha * it.value()) * rhs_j;
    }
  }
};

template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
                                      const AlphaType& alpha) {
  sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType>::run(lhs, rhs, res, alpha);
}

}  // end namespace internal

namespace internal {

template <typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> > {
  typedef typename Product<Lhs, Rhs>::Scalar Scalar;

  template <typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
    typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
    typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == 0) ? 1 : Lhs::RowsAtCompileTime>::type RhsNested;
    LhsNested lhsNested(lhs);
    RhsNested rhsNested(rhs);
    internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
  }
};

template <typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType>
    : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> {};
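
// Illustrative usage sketch (not part of the library; user-side code only,
// sizes chosen arbitrarily). A column-major sparse matrix times a dense block
// goes through the ColMajor kernels above, and a triangular view on the sparse
// operand reuses the same path via the SparseTriangularShape specialization:
//
//   Eigen::SparseMatrix<double> S(100, 100);  // ColMajor by default
//   // ... fill S ...
//   Eigen::MatrixXd D = Eigen::MatrixXd::Random(100, 4);
//   Eigen::MatrixXd R1 = S * D;                                 // SparseShape * DenseShape
//   Eigen::MatrixXd R2 = S.triangularView<Eigen::Lower>() * D;  // SparseTriangularShape * DenseShape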
template <typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> > {
  typedef typename Product<Lhs, Rhs>::Scalar Scalar;

  template <typename Dst>
  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
    typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
    typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == 0) ? 1 : Lhs::RowsAtCompileTime>::type RhsNested;
    LhsNested lhsNested(lhs);
    RhsNested rhsNested(rhs);

    // transpose everything
    Transpose<Dst> dstT(dst);
    internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
  }
};

template <typename Lhs, typename Rhs, int ProductType>
struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType>
    : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> {};

template <typename LhsT, typename RhsT, bool NeedToTranspose>
struct sparse_dense_outer_product_evaluator {
 protected:
  typedef std::conditional_t<NeedToTranspose, RhsT, LhsT> Lhs1;
  typedef std::conditional_t<NeedToTranspose, LhsT, RhsT> ActualRhs;
  typedef Product<LhsT, RhsT, DefaultProduct> ProdXprType;

  // if the actual left-hand side is a dense vector,
  // then build a sparse-view so that we can seamlessly iterate over it.
  typedef std::conditional_t<is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1,
                             SparseView<Lhs1> >
      ActualLhs;
  typedef std::conditional_t<is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1 const&,
                             SparseView<Lhs1> >
      LhsArg;

  typedef evaluator<ActualLhs> LhsEval;
  typedef evaluator<ActualRhs> RhsEval;
  typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator;
  typedef typename ProdXprType::Scalar Scalar;

 public:
  enum { Flags = NeedToTranspose ? RowMajorBit : 0, CoeffReadCost = HugeCost };

  class InnerIterator : public LhsIterator {
   public:
    InnerIterator(const sparse_dense_outer_product_evaluator& xprEval, Index outer)
        : LhsIterator(xprEval.m_lhsXprImpl, 0),
          m_outer(outer),
          m_empty(false),
          m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind())) {}

    EIGEN_STRONG_INLINE Index outer() const { return m_outer; }
    EIGEN_STRONG_INLINE Index row() const { return NeedToTranspose ? m_outer : LhsIterator::index(); }
    EIGEN_STRONG_INLINE Index col() const { return NeedToTranspose ? LhsIterator::index() : m_outer; }

    EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; }
    EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); }

   protected:
    Scalar get(const RhsEval& rhs, Index outer, Dense = Dense()) const { return rhs.coeff(outer); }
    Scalar get(const RhsEval& rhs, Index outer, Sparse = Sparse()) {
      typename RhsEval::InnerIterator it(rhs, outer);
      if (it && it.index() == 0 && it.value() != Scalar(0)) return it.value();
      m_empty = true;
      return Scalar(0);
    }

    Index m_outer;
    bool m_empty;
    Scalar m_factor;
  };

  sparse_dense_outer_product_evaluator(const Lhs1& lhs, const ActualRhs& rhs)
      : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) {
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

  // transpose case
  sparse_dense_outer_product_evaluator(const ActualRhs& rhs, const Lhs1& lhs)
      : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) {
    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
  }

 protected:
  const LhsArg m_lhs;
  evaluator<ActualLhs> m_lhsXprImpl;
  evaluator<ActualRhs> m_rhsXprImpl;
};

// sparse * dense outer product
template <typename Lhs, typename Rhs>
struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape>
    : sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> {
  typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> Base;

  typedef Product<Lhs, Rhs> XprType;
  typedef typename XprType::PlainObject PlainObject;

  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {}
};

template <typename Lhs, typename Rhs>
struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape>
    : sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> {
  typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> Base;

  typedef Product<Lhs, Rhs> XprType;
  typedef typename XprType::PlainObject PlainObject;

  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {}
};
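
// Illustrative usage sketch (not part of the library; user-side code only).
// The outer product of a sparse vector with a dense vector stays sparse (see
// product_promote_storage_type at the top of this file) and is evaluated
// lazily through the product_evaluator specializations above:
//
//   Eigen::SparseVector<double> sv(100);
//   sv.insert(3) = 2.0;
//   Eigen::VectorXd dv = Eigen::VectorXd::Random(100);
//   Eigen::SparseMatrix<double> outer = sv * dv.transpose();  // sparse * dense outer product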
}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_SPARSEDENSEPRODUCT_H