From 679da1743d70b71b9d23f6d07d6e3c34d08e1b1b Mon Sep 17 00:00:00 2001 From: Stefano Zampini <stefano.zampini@gmail.com> Date: Sun, 18 Oct 2020 18:37:22 +0300 Subject: [PATCH] Implementing fast assembly with COO support --- src/petsc4Foam/solvers/petscSolver.C | 594 +++++++++++++++++---------- 1 file changed, 380 insertions(+), 214 deletions(-) diff --git a/src/petsc4Foam/solvers/petscSolver.C b/src/petsc4Foam/solvers/petscSolver.C index d7e6cdb..c22d93b 100644 --- a/src/petsc4Foam/solvers/petscSolver.C +++ b/src/petsc4Foam/solvers/petscSolver.C @@ -7,6 +7,7 @@ ------------------------------------------------------------------------------- Copyright (C) 2018-2020 OpenCFD Ltd. Copyright (C) 2019-2020 Simone Bna + Copyright (C) 2021 Stefano Zampini ------------------------------------------------------------------------------- License This file is part of OpenFOAM. @@ -473,92 +474,206 @@ void Foam::petscSolver::buildMat MatSetOptionsPrefix(Amat, prefix_.c_str()); MatSetFromOptions(Amat); - // On-processor non-zero entries, per row. - // Initialise with 1 (includes diagonal). - integerList nonZero_(nrows_, 1); - lowNonZero.resize(nrows_, 0); - - // Number of on-processor non-zeros - // set the matrix options - - if (!is_sbaij(Amat)) + // Query for fast COO support + PetscErrorCode (*f)(Mat,PetscInt,const PetscInt[],const PetscInt[]) = NULL; + PetscObjectQueryFunction((PetscObject)Amat,"MatSetPreallocationCOO_C",&f); + if (f) { - for (const label celli : upp) + const globalIndex globalNumbering_(nrows_); + + // Number of non-zeros from interfaces (on-processor or off-processor) + const label nReq = Pstream::nRequests(); + label nProcValues = 0; + labelList globalCells + ( + identity(globalNumbering_.range()) + ); + forAll(interfaces, patchi) { - ++nonZero_[celli]; + const lduInterface* intf = interfaces.set(patchi); + + if (intf) + { + if (isA<cyclicAMILduInterface>(*intf)) + { + FatalErrorInFunction + << "cyclicAMI is not supported" << nl + << exit(FatalError); + } + nProcValues += lduAddr.patchAddr(patchi).size(); + interfaces[patchi].initInternalFieldTransfer + ( + Pstream::commsTypes::nonBlocking, + globalCells + ); + } } - } + PetscInt toti = nrows_+low.size()+upp.size()+nProcValues; - for (const label celli : low) - { - ++nonZero_[celli]; - ++lowNonZero[celli]; - } + PetscInt *coo_i, *coo_j; + PetscMalloc1(toti, &coo_i); + PetscMalloc1(toti, &coo_j); - maxLowNonZeroPerRow = max(lowNonZero); + PetscInt *ptr_i = coo_i; + PetscInt *ptr_j = coo_j; + label off = globalNumbering_.localStart(); + for (label celli = off; celli < off + nrows_; ++celli) + { + const PetscInt globRow = celli; + *ptr_i++ = globRow; + *ptr_j++ = globRow; + } + for (label idx=0; idx < upp.size(); ++idx) + { + PetscInt globRow = low[idx] + off; + PetscInt globCol = upp[idx] + off; + *ptr_i++ = globRow; + *ptr_j++ = globCol; + } + for (label idx=0; idx < low.size(); ++idx) + { + PetscInt globRow = upp[idx] + off; + PetscInt globCol = low[idx] + off; + *ptr_i++ = globRow; + *ptr_j++ = globCol; + } + if (Pstream::parRun()) + { + Pstream::waitRequests(nReq); + } + forAll(interfaces, patchi) + { + if (interfaces.set(patchi)) + { + // Processor-local values + const labelUList& faceCells = lduAddr.patchAddr(patchi); + labelField nbrCells + ( + interfaces[patchi].internalFieldTransfer + ( + Pstream::commsTypes::nonBlocking, + globalCells + ) + ); - // Off-processor non-zero entries, per row. + if (faceCells.size() != nbrCells.size()) + { + FatalErrorInFunction + << "Mismatch in interface sizes (AMI?)" << nl + << "Have " << faceCells.size() << " != " + << nbrCells.size() << nl + << exit(FatalError); + } - integerList nonZeroNeigh_(nrows_, 0); + forAll(faceCells, i) + { + PetscInt globRow = faceCells[i] + off; + PetscInt globCol = nbrCells[i]; + *ptr_i++ = globRow; + *ptr_j++ = globCol; + } + } + } - // Number of non-zeros from interfaces (on-processor or off-processor) - forAll(interfaces, patchi) + // Preallocate with COO information + // Use interface call to get the timing logged +#if PETSC_VERSION_GE(3,15,0) + MatSetPreallocationCOO(Amat,toti,coo_i,coo_j); +#else + (*f)(Amat,toti,coo_i,coo_j); +#endif + PetscFree(coo_i); + PetscFree(coo_j); + } + else { - const lduInterface* intf = interfaces.set(patchi); + // On-processor non-zero entries, per row. + // Initialise with 1 (includes diagonal). + integerList nonZero_(nrows_, 1); + lowNonZero.resize(nrows_, 0); - if (intf) - { - const labelUList& faceCells = lduAddr.patchAddr(patchi); + // Number of on-processor non-zeros + // set the matrix options - if (isA<cyclicAMILduInterface>(*intf)) + if (!is_sbaij(Amat)) + { + for (const label celli : upp) { - FatalErrorInFunction - << "cyclicAMI is not supported" << nl - << exit(FatalError); + ++nonZero_[celli]; } - else if (isA<cyclicLduInterface>(*intf)) + } + + for (const label celli : low) + { + ++nonZero_[celli]; + ++lowNonZero[celli]; + } + + maxLowNonZeroPerRow = max(lowNonZero); + + // Off-processor non-zero entries, per row. + + integerList nonZeroNeigh_(nrows_, 0); + + // Number of non-zeros from interfaces (on-processor or off-processor) + forAll(interfaces, patchi) + { + const lduInterface* intf = interfaces.set(patchi); + + if (intf) { - // Cyclic is on-processor - for (const label celli : faceCells) + const labelUList& faceCells = lduAddr.patchAddr(patchi); + + if (isA<cyclicAMILduInterface>(*intf)) { - ++nonZero_[celli]; + FatalErrorInFunction + << "cyclicAMI is not supported" << nl + << exit(FatalError); } - } - else - { - // Off-processor - for (const label celli : faceCells) + else if (isA<cyclicLduInterface>(*intf)) { - ++nonZeroNeigh_[celli]; + // Cyclic is on-processor + for (const label celli : faceCells) + { + ++nonZero_[celli]; + } + } + else + { + // Off-processor + for (const label celli : faceCells) + { + ++nonZeroNeigh_[celli]; + } } } } - } - // preallocate the matrix - MatXAIJSetPreallocation - ( - Amat, - 1, - nonZero_.data(), // on-processor - nonZeroNeigh_.data(), // off-processor - nonZero_.data(), - nonZeroNeigh_.data() - ); + // preallocate the matrix + MatXAIJSetPreallocation + ( + Amat, + 1, + nonZero_.data(), // on-processor + nonZeroNeigh_.data(), // off-processor + nonZero_.data(), + nonZeroNeigh_.data() + ); - // The general AIJ preallocator does not handle SELL - MatSeqSELLSetPreallocation - ( - Amat, - 0, nonZero_.data() // on-processor - ); + // The general AIJ preallocator does not handle SELL + MatSeqSELLSetPreallocation + ( + Amat, + 0, nonZero_.data() // on-processor + ); - MatMPISELLSetPreallocation - ( - Amat, - 0, nonZero_.data(), // on-processor - 0, nonZeroNeigh_.data() // off-processor - ); + MatMPISELLSetPreallocation + ( + Amat, + 0, nonZero_.data(), // on-processor + 0, nonZeroNeigh_.data() // off-processor + ); + } // set the matrix options if (matrix_.symmetric()) @@ -607,167 +722,218 @@ void Foam::petscSolver::updateMat // Local degrees-of-freedom i.e. number of local rows const label nrows_ = lduAddr.size(); - // Number of internal faces (connectivity) - const label nIntFaces_ = upp.size(); - - const globalIndex globalNumbering_(nrows_); - - // The diagonal - { - PetscInt globRow = globalNumbering_.toGlobal(0); - - for (label celli = 0; celli < nrows_; ++celli) - { - PetscScalar val = diagVal[celli]; - - MatSetValues - ( - Amat, - 1, &globRow, // row - 1, &globRow, // col - &val, - INSERT_VALUES - ); - ++globRow; - } - } - - // Connections to higher numbered cells (on-processor) (optimized form) - PetscInt globRow; - PetscInt globCols[maxLowNonZeroPerRow]; - PetscScalar globVals[maxLowNonZeroPerRow]; - label kidx = 0; - - for (label idx=0; idx < nrows_; ++idx) - { - if (lowNonZero[idx] > 0) - { - globRow = globalNumbering_.toGlobal(low[kidx]); - - for (label jidx=0; jidx<lowNonZero[idx]; ++jidx) - { - // The row is sorted, col is unsorted - globCols[jidx] = globalNumbering_.toGlobal(upp[kidx]); - globVals[jidx] = uppVal[kidx]; - ++kidx; - } - - MatSetValues - ( - Amat, - 1, &globRow, - lowNonZero[idx], globCols, - globVals, - INSERT_VALUES - ); - } - } - - if (!is_sbaij(Amat)) - { - // Connections to lower numbered cells (on-processor) - for (label idx=0; idx < nIntFaces_; ++idx) - { - // The row is unsorted, col is sorted - PetscInt globRow = globalNumbering_.toGlobal(upp[idx]); - PetscInt globCol = globalNumbering_.toGlobal(low[idx]); - PetscScalar val = lowVal[idx]; - - MatSetValues - ( - Amat, - 1, &globRow, - 1, &globCol, - &val, - INSERT_VALUES - ); - } - } - - labelList globalCells - ( - identity - ( - globalNumbering_.localSize(), - globalNumbering_.localStart() - ) - ); - - // Connections to neighbouring processors + // Query for fast COO support + PetscErrorCode (*f)(Mat,const PetscScalar[],InsertMode) = NULL; + PetscObjectQueryFunction((PetscObject)Amat,"MatSetValuesCOO_C",&f); + if (f) { - // Initialise transfer of global cells - forAll(interfaces, patchi) - { - if (interfaces.set(patchi)) - { - interfaces[patchi].initInternalFieldTransfer - ( - Pstream::commsTypes::nonBlocking, - globalCells - ); - } - } - - if (Pstream::parRun()) - { - Pstream::waitRequests(); - } - - forAll(interfaces, patchi) - { - if (interfaces.set(patchi)) - { - // Processor-local values + label nProcValues = 0; + forAll(interfaces, patchi) + { + const lduInterface* intf = interfaces.set(patchi); + + if (intf) + { + nProcValues += lduAddr.patchAddr(patchi).size(); + } + } + PetscInt toti = nrows_+low.size()+upp.size()+nProcValues; + + PetscScalar *coo_v; + PetscMalloc1(toti,&coo_v); + PetscArraycpy(coo_v,diagVal.cdata(),nrows_); + PetscArraycpy(coo_v+nrows_,uppVal.cdata(),upp.size()); + PetscArraycpy(coo_v+nrows_+upp.size(),lowVal.cdata(),low.size()); + if (nProcValues) + { + PetscScalar *ptr_v = coo_v+nrows_+low.size()+upp.size(); + forAll(interfaces, patchi) + { + const lduInterface* intf = interfaces.set(patchi); + + if (intf) + { const labelUList& faceCells = lduAddr.patchAddr(patchi); - const scalarField& bCoeffs = interfaceBouCoeffs_[patchi]; - - labelField nbrCells - ( - interfaces[patchi].internalFieldTransfer - ( - Pstream::commsTypes::nonBlocking, - globalCells - ) - ); - - - if (faceCells.size() != nbrCells.size()) - { - FatalErrorInFunction - << "Mismatch in interface sizes (AMI?)" << nl - << "Have " << faceCells.size() << " != " - << nbrCells.size() << nl - << exit(FatalError); - } - - const label off = globalNumbering_.localStart(); - // NB:opposite sign since we using this sides' // coefficients (from discretising our face; not the // neighbouring, reversed face) - forAll(faceCells, i) { - PetscInt globRow = faceCells[i] + off; // row is sorted - PetscInt globCol = nbrCells[i]; // <- already global - PetscScalar val = -bCoeffs[i]; - - MatSetValues - ( - Amat, - 1, &globRow, - 1, &globCol, - &(val), - INSERT_VALUES - ); + *ptr_v++ = -bCoeffs[i]; } - } - } + } + } + + } +#if PETSC_VERSION_GE(3,15,0) + MatSetValuesCOO(Amat,coo_v,INSERT_VALUES); +#else + (*f)(Amat,coo_v,INSERT_VALUES); +#endif + PetscFree(coo_v); + } else { + // Number of internal faces (connectivity) + const label nIntFaces_ = upp.size(); + + const globalIndex globalNumbering_(nrows_); + + // The diagonal + { + PetscInt globRow = globalNumbering_.toGlobal(0); + + for (label celli = 0; celli < nrows_; ++celli) + { + PetscScalar val = diagVal[celli]; + + MatSetValues + ( + Amat, + 1, &globRow, // row + 1, &globRow, // col + &val, + INSERT_VALUES + ); + ++globRow; + } + } + + // Connections to higher numbered cells (on-processor) (optimized form) + PetscInt globRow; + PetscInt globCols[maxLowNonZeroPerRow]; + PetscScalar globVals[maxLowNonZeroPerRow]; + label kidx = 0; + + for (label idx=0; idx < nrows_; ++idx) + { + if (lowNonZero[idx] > 0) + { + globRow = globalNumbering_.toGlobal(low[kidx]); + + for (label jidx=0; jidx<lowNonZero[idx]; ++jidx) + { + // The row is sorted, col is unsorted + globCols[jidx] = globalNumbering_.toGlobal(upp[kidx]); + globVals[jidx] = uppVal[kidx]; + ++kidx; + } + + MatSetValues + ( + Amat, + 1, &globRow, + lowNonZero[idx], globCols, + globVals, + INSERT_VALUES + ); + } + } + + if (!is_sbaij(Amat)) + { + // Connections to lower numbered cells (on-processor) + for (label idx=0; idx < nIntFaces_; ++idx) + { + // The row is unsorted, col is sorted + PetscInt globRow = globalNumbering_.toGlobal(upp[idx]); + PetscInt globCol = globalNumbering_.toGlobal(low[idx]); + PetscScalar val = lowVal[idx]; + + MatSetValues + ( + Amat, + 1, &globRow, + 1, &globCol, + &val, + INSERT_VALUES + ); + } + } + + labelList globalCells + ( + identity + ( + globalNumbering_.localSize(), + globalNumbering_.localStart() + ) + ); + + // Connections to neighbouring processors + { + // Initialise transfer of global cells + forAll(interfaces, patchi) + { + if (interfaces.set(patchi)) + { + interfaces[patchi].initInternalFieldTransfer + ( + Pstream::commsTypes::nonBlocking, + globalCells + ); + } + } + + if (Pstream::parRun()) + { + Pstream::waitRequests(); + } + + forAll(interfaces, patchi) + { + if (interfaces.set(patchi)) + { + // Processor-local values + const labelUList& faceCells = lduAddr.patchAddr(patchi); + + const scalarField& bCoeffs = interfaceBouCoeffs_[patchi]; + + labelField nbrCells + ( + interfaces[patchi].internalFieldTransfer + ( + Pstream::commsTypes::nonBlocking, + globalCells + ) + ); + + + if (faceCells.size() != nbrCells.size()) + { + FatalErrorInFunction + << "Mismatch in interface sizes (AMI?)" << nl + << "Have " << faceCells.size() << " != " + << nbrCells.size() << nl + << exit(FatalError); + } + + const label off = globalNumbering_.localStart(); + + // NB:opposite sign since we using this sides' + // coefficients (from discretising our face; not the + // neighbouring, reversed face) + + forAll(faceCells, i) + { + PetscInt globRow = faceCells[i] + off; // row is sorted + PetscInt globCol = nbrCells[i]; // <- already global + PetscScalar val = -bCoeffs[i]; + + MatSetValues + ( + Amat, + 1, &globRow, + 1, &globCol, + &(val), + INSERT_VALUES + ); + } + } + } + } + MatAssemblyBegin(Amat, MAT_FINAL_ASSEMBLY); + MatAssemblyEnd(Amat, MAT_FINAL_ASSEMBLY); } - - MatAssemblyBegin(Amat, MAT_FINAL_ASSEMBLY); - MatAssemblyEnd(Amat, MAT_FINAL_ASSEMBLY); } -- GitLab