diff --git a/etc/caseDicts/profiling/parallel.cfg b/etc/caseDicts/profiling/parallel.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..b61a963c6ea6d18c0b6453f7a2bdd188e7a0424e
--- /dev/null
+++ b/etc/caseDicts/profiling/parallel.cfg
@@ -0,0 +1,23 @@
+/*--------------------------------*- C++ -*----------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     | Version:  v2306
+    \\  /    A nd           | Website:  www.openfoam.com
+     \\/     M anipulation  |
+-------------------------------------------------------------------------------
+Description
+    Configuration for profiling parallel (MPI) timings
+
+\*---------------------------------------------------------------------------*/
+
+type    parProfiling;
+libs    (utilityFunctionObjects);
+
+// Level of detail to report (0: summary, 1: per-proc times, 2: times/counts)
+detail  0;
+
+// Report stats on exit only (instead of every time step)
+executeControl  onEnd;
+writeControl    none;
+
+// ************************************************************************* //
diff --git a/src/OpenFOAM/global/profiling/profilingPstream.C b/src/OpenFOAM/global/profiling/profilingPstream.C
index c46348bcc670d47e7a1d6ebc8bb2ce58a299f17e..588d3e1cde09c16c0b6e7e8627d386ad57abc95c 100644
--- a/src/OpenFOAM/global/profiling/profilingPstream.C
+++ b/src/OpenFOAM/global/profiling/profilingPstream.C
@@ -5,7 +5,7 @@
     \\  /    A nd           | www.openfoam.com
      \\/     M anipulation  |
 -------------------------------------------------------------------------------
-    Copyright (C) 2019-2020 OpenCFD Ltd.
+    Copyright (C) 2019-2023 OpenCFD Ltd.
 -------------------------------------------------------------------------------
 License
     This file is part of OpenFOAM.
@@ -31,10 +31,11 @@ License
 
 std::unique_ptr<Foam::cpuTime> Foam::profilingPstream::timer_(nullptr);
 
-Foam::profilingPstream::timingList Foam::profilingPstream::times_(Zero);
-
 bool Foam::profilingPstream::suspend_(false);
 
+Foam::profilingPstream::timingList Foam::profilingPstream::times_(double(0));
+Foam::profilingPstream::countList Foam::profilingPstream::counts_(uint64_t(0));
+
 
 // * * * * * * * * * * * * * * * * Constructors  * * * * * * * * * * * * * * //
 
@@ -52,7 +53,7 @@ Foam::profilingPstream::~profilingPstream()
 }
 
 
-// * * * * * * * * * * * * * * * Member Functions  * * * * * * * * * * * * * //
+// * * * * * * * * * * * * * Static Member Functions * * * * * * * * * * * * //
 
 void Foam::profilingPstream::enable()
 {
@@ -63,7 +64,8 @@ void Foam::profilingPstream::enable()
     else
     {
         timer_.reset(new cpuTime);
-        times_ = Zero;
+        times_ = double(0);
+        counts_ = uint64_t(0);
     }
 
     suspend_ = false;
@@ -77,4 +79,16 @@ void Foam::profilingPstream::disable() noexcept
 }
 
 
+double Foam::profilingPstream::elapsedTime()
+{
+    // Accumulate the per-category times in storage order
+    double sum = 0;
+    for (auto iter = times_.begin(); iter != times_.end(); ++iter)
+    {
+        sum += *iter;
+    }
+    return sum;
+}
+
+
 // ************************************************************************* //
diff --git a/src/OpenFOAM/global/profiling/profilingPstream.H b/src/OpenFOAM/global/profiling/profilingPstream.H
index c088d988e0594d8e6d2e3345269295e37ea697d1..961aa8859685704d3c7ae497d57b04c0419d8ed4 100644
--- a/src/OpenFOAM/global/profiling/profilingPstream.H
+++ b/src/OpenFOAM/global/profiling/profilingPstream.H
@@ -5,7 +5,7 @@
     \\  /    A nd           | www.openfoam.com
      \\/     M anipulation  |
 -------------------------------------------------------------------------------
-    Copyright (C) 2019-2022 OpenCFD Ltd.
+    Copyright (C) 2019-2023 OpenCFD Ltd.
 -------------------------------------------------------------------------------
 License
     This file is part of OpenFOAM.
@@ -57,20 +57,26 @@ public:
 
     // Public Types
 
-        //- Enumeration within times array
-        enum timingType
+        //- The enumerated timing categories (for times and counts arrays)
+        enum timingType : unsigned
         {
-            GATHER = 0,
-            SCATTER,
-            BROADCAST,
+            BROADCAST = 0,
             REDUCE,
+            PROBE,
+            REQUEST,
             WAIT,
+            GATHER,
+            SCATTER,
             ALL_TO_ALL,
-            OTHER
+            OTHER,
+            nCategories  // Dimensioning size
         };
 
-        //- The timing values
-        typedef FixedList<double, 7> timingList;
+        //- Fixed-size container for timing values
+        typedef FixedList<double, timingType::nCategories> timingList;
+
+        //- Fixed-size container for timing counts
+        typedef FixedList<uint64_t, timingType::nCategories> countList;
 
 
 private:
@@ -80,12 +86,15 @@ private:
         //- The timer to use
         static std::unique_ptr<cpuTime> timer_;
 
-        //- The timing values
-        static timingList times_;
-
         //- Is timer in a suspend state?
         static bool suspend_;
 
+        //- The accumulated values for various timing categories
+        static timingList times_;
+
+        //- The number of timings recorded for each timing category
+        static countList counts_;
+
 
 public:
 
@@ -125,18 +134,27 @@ public:
             return !suspend_ && bool(timer_);
         }
 
+        //- The sum of the accumulated times over all timing categories
+        static double elapsedTime();
+
         //- Access to the timing information
-        static timingList& times() noexcept
-        {
-            return times_;
-        }
+        static timingList& times() noexcept { return times_; }
+
+        //- Access to the timing counts
+        static countList& counts() noexcept { return counts_; }
 
-        //- Access to the timing information at given index
+        //- Access to the timing information for given timing category
         static double times(const timingType idx)
         {
             return times_[idx];
         }
 
+        //- Access to the count for given timing category
+        static uint64_t counts(const timingType idx)
+        {
+            return counts_[idx];
+        }
+
         //- Update timer prior to measurement
         static void beginTiming()
         {
@@ -152,21 +170,10 @@ public:
             if (active())
             {
                 times_[idx] += timer_->cpuTimeIncrement();
+                ++counts_[idx];
             }
         }
 
-        //- Add time increment to \em gather time
-        static void addGatherTime()
-        {
-            addTime(timingType::GATHER);
-        }
-
-        //- Add time increment to \em scatter time
-        static void addScatterTime()
-        {
-            addTime(timingType::SCATTER);
-        }
-
         //- Add time increment to \em broadcast time
         static void addBroadcastTime()
         {
@@ -179,12 +186,36 @@ public:
             addTime(timingType::REDUCE);
         }
 
+        //- Add time increment to \em probe time
+        static void addProbeTime()
+        {
+            addTime(timingType::PROBE);
+        }
+
+        //- Add time increment to \em request time
+        static void addRequestTime()
+        {
+            addTime(timingType::REQUEST);
+        }
+
         //- Add time increment to \em wait time
         static void addWaitTime()
         {
             addTime(timingType::WAIT);
         }
 
+        //- Add time increment to \em gather time
+        static void addGatherTime()
+        {
+            addTime(timingType::GATHER);
+        }
+
+        //- Add time increment to \em scatter time
+        static void addScatterTime()
+        {
+            addTime(timingType::SCATTER);
+        }
+
         //- Add time increment to \em allToAll time
         static void addAllToAllTime()
         {
diff --git a/src/Pstream/mpi/UIPstreamRead.C b/src/Pstream/mpi/UIPstreamRead.C
index e7bb303da076704ac6bdb0258d1b65b4ebd8f26e..37cb441414442e37181d905194e8ed362696076c 100644
--- a/src/Pstream/mpi/UIPstreamRead.C
+++ b/src/Pstream/mpi/UIPstreamRead.C
@@ -61,8 +61,7 @@ void Foam::UIPstream::bufferIPCrecv()
         );
         MPI_Get_count(&status, MPI_BYTE, &messageSize_);
 
-        // Assume these are from gathers ...
-        profilingPstream::addGatherTime();
+        profilingPstream::addProbeTime();
 
         recvBuf_.resize(messageSize_);
 
@@ -206,7 +205,7 @@ Foam::label Foam::UIPstream::read
             return 0;
         }
 
-        profilingPstream::addWaitTime();
+        profilingPstream::addRequestTime();
 
         if (debug)
         {
diff --git a/src/Pstream/mpi/UOPstreamWrite.C b/src/Pstream/mpi/UOPstreamWrite.C
index a971799ac976d27377f3bf4341ae502280822bf0..c14404d0773513697aa2765fa5ea2e913fadf221 100644
--- a/src/Pstream/mpi/UOPstreamWrite.C
+++ b/src/Pstream/mpi/UOPstreamWrite.C
@@ -179,7 +179,7 @@ bool Foam::UOPstream::write
             );
         }
 
-        profilingPstream::addWaitTime();
+        profilingPstream::addRequestTime();
 
         if (debug)
         {
diff --git a/src/Pstream/mpi/UPstream.C b/src/Pstream/mpi/UPstream.C
index f56d5631e97eaa93dae8a4f4c264b4fe6f6489f1..a84cdddf82aaabb34510e87f461d21b587e6310c 100644
--- a/src/Pstream/mpi/UPstream.C
+++ b/src/Pstream/mpi/UPstream.C
@@ -822,6 +822,8 @@ Foam::UPstream::probeMessage
     if (UPstream::commsTypes::blocking == commsType)
     {
         // Blocking
+        profilingPstream::beginTiming();
+
         if
         (
             MPI_Probe
@@ -837,11 +839,15 @@ Foam::UPstream::probeMessage
                 << "MPI_Probe returned with error"
                 << Foam::abort(FatalError);
         }
+
+        profilingPstream::addProbeTime();
         flag = 1;
     }
     else
     {
         // Non-blocking
+        profilingPstream::beginTiming();
+
         if
         (
             MPI_Iprobe
@@ -858,6 +864,8 @@ Foam::UPstream::probeMessage
                 << "MPI_Iprobe returned with error"
                 << Foam::abort(FatalError);
         }
+
+        profilingPstream::addRequestTime();
     }
 
     if (flag)
diff --git a/src/Pstream/mpi/UPstreamWrappingTemplates.C b/src/Pstream/mpi/UPstreamWrappingTemplates.C
index b93f9d447b5dee2e0937fba65f3c199c4752f37a..65e85bb8ea39af1d279d6c0d44ebbc33bc7f2a6a 100644
--- a/src/Pstream/mpi/UPstreamWrappingTemplates.C
+++ b/src/Pstream/mpi/UPstreamWrappingTemplates.C
@@ -158,14 +158,14 @@ void Foam::PstreamDetail::allReduce
         error::printStack(Pout);
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -198,11 +198,15 @@ void Foam::PstreamDetail::allReduce
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -224,9 +228,9 @@ void Foam::PstreamDetail::allReduce
                 << UList<Type>(values, count)
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addReduceTime();
+        profilingPstream::addReduceTime();
+    }
 }
 
 
@@ -283,14 +287,14 @@ void Foam::PstreamDetail::allToAll
         return;
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -326,11 +330,15 @@ void Foam::PstreamDetail::allToAll
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -355,9 +363,9 @@ void Foam::PstreamDetail::allToAll
                 << " For " << sendData
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addAllToAllTime();
+        profilingPstream::addAllToAllTime();
+    }
 }
 
 
@@ -438,14 +446,14 @@ void Foam::PstreamDetail::allToAllv
         return;
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -482,11 +490,15 @@ void Foam::PstreamDetail::allToAllv
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -512,9 +524,10 @@ void Foam::PstreamDetail::allToAllv
                 << " recvCounts " << recvCounts
                 << Foam::abort(FatalError);
         }
+
+        profilingPstream::addAllToAllTime();
     }
 
-    profilingPstream::addAllToAllTime();
 }
 
 
@@ -929,14 +942,14 @@ void Foam::PstreamDetail::gather
         error::printStack(Pout);
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -972,11 +985,15 @@ void Foam::PstreamDetail::gather
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -1001,9 +1018,9 @@ void Foam::PstreamDetail::gather
                 << " recvCount " << recvCount
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addGatherTime();
+        profilingPstream::addGatherTime();
+    }
 }
 
 
@@ -1055,14 +1072,14 @@ void Foam::PstreamDetail::scatter
         error::printStack(Pout);
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -1098,11 +1115,15 @@ void Foam::PstreamDetail::scatter
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -1127,9 +1148,9 @@ void Foam::PstreamDetail::scatter
                 << " recvCount " << recvCount
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addScatterTime();
+        profilingPstream::addScatterTime();
+    }
 }
 
 
@@ -1200,8 +1221,6 @@ void Foam::PstreamDetail::gatherv
             << Foam::abort(FatalError);
     }
 
-    profilingPstream::beginTiming();
-
     // Ensure send/recv consistency on master
     if (UPstream::master(comm) && !recvCounts[0])
     {
@@ -1214,6 +1233,8 @@ void Foam::PstreamDetail::gatherv
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -1250,11 +1271,15 @@ void Foam::PstreamDetail::gatherv
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -1280,9 +1305,9 @@ void Foam::PstreamDetail::gatherv
                 << " recvCounts " << recvCounts
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addGatherTime();
+        profilingPstream::addGatherTime();
+    }
 }
 
 
@@ -1352,14 +1377,14 @@ void Foam::PstreamDetail::scatterv
             << Foam::abort(FatalError);
     }
 
-    profilingPstream::beginTiming();
-
     bool handled(false);
 
 #if defined(MPI_VERSION) && (MPI_VERSION >= 3)
     // MPI-3 : eg, openmpi-1.7 (2013) and later
     if (immediate)
     {
+        profilingPstream::beginTiming();
+
         handled = true;
         MPI_Request request;
 
@@ -1396,11 +1421,15 @@ void Foam::PstreamDetail::scatterv
         {
             *requestID = PstreamGlobals::push_request(request);
         }
+
+        profilingPstream::addRequestTime();
     }
 #endif
 
     if (!handled)
     {
+        profilingPstream::beginTiming();
+
         if (req) req->reset();
         if (requestID) *requestID = -1;
 
@@ -1426,9 +1455,9 @@ void Foam::PstreamDetail::scatterv
                 << " sendOffsets " << sendOffsets
                 << Foam::abort(FatalError);
         }
-    }
 
-    profilingPstream::addScatterTime();
+        profilingPstream::addScatterTime();
+    }
 }
 
 
diff --git a/src/functionObjects/utilities/parProfiling/parProfiling.C b/src/functionObjects/utilities/parProfiling/parProfiling.C
index ab0de02f6b548fb35b78d312a707c1191b0a4178..5af73afd99d687b45530c3040c33c3a3d3ed8983 100644
--- a/src/functionObjects/utilities/parProfiling/parProfiling.C
+++ b/src/functionObjects/utilities/parProfiling/parProfiling.C
@@ -5,7 +5,7 @@
     \\  /    A nd           | www.openfoam.com
      \\/     M anipulation  |
 -------------------------------------------------------------------------------
-    Copyright (C) 2019-2022 OpenCFD Ltd.
+    Copyright (C) 2019-2023 OpenCFD Ltd.
 -------------------------------------------------------------------------------
 License
     This file is part of OpenFOAM.
@@ -61,8 +61,10 @@ Foam::functionObjects::parProfiling::parProfiling
     const dictionary& dict
 )
 :
-    functionObject(name)
+    functionObject(name),
+    detailLevel_(0)
 {
+    dict.readIfPresent("detail", detailLevel_);
     profilingPstream::enable();
 }
 
@@ -75,121 +77,365 @@ Foam::functionObjects::parProfiling::~parProfiling()
 }
 
 
-// * * * * * * * * * * * * * * * Member Functions  * * * * * * * * * * * * * //
+// * * * * * * * * * * * * * * * Local Functions * * * * * * * * * * * * * * //
 
-void Foam::functionObjects::parProfiling::report()
+namespace Foam
+{
+
+// Extract entry 'index' from each per-processor slice of allValues into result
+template<class Type>
+inline static void extractValues
+(
+    UList<Type>& result,
+    const int index,
+    const UList<Type>& allValues
+)
 {
-    if (!profilingPstream::active())
+    if (result.empty())
     {
         return;
     }
 
-    // (Time, Processor) for each of: min/max/sum
-    typedef FixedList<Tuple2<double, int>, 3> statData;
-    typedef FixedList<statData, 3> statDataTimes;
+    const label numProc = result.size();  // One result entry per slice
+    const Type* values = allValues.cbegin();
+    const label stride = allValues.size() / numProc;  // Entries per slice
 
-    // Reduction: if x and y are unequal assign value.
-    auto statsEqOp = [](statDataTimes& xStats, const statDataTimes& yStats)
+    if (!values || !stride || index < 0 || index >= stride)
     {
-        forAll(xStats, i)
+        result = Type(0);  // No data, or index is not within a slice
+        return;
+    }
+
+    for (label proci = 0; proci < numProc; ++proci, values += stride)
+    {
+        result[proci] = values[index];
+    }
+}
+
+
+// Apply a unary function to each per-processor slice of allValues and store
+// its return value; result is zero-filled when there is nothing to slice
+template<class Type, class Extract>
+inline static void extractValues
+(
+    UList<Type>& result,
+    const UList<Type>& allValues,
+    const Extract& extract
+)
+{
+    const label nSlices = result.size();
+    if (!nSlices)
+    {
+        return;  // No output requested
+    }
+
+    const Type* const base = allValues.cbegin();
+    const label sliceLen = allValues.size() / nSlices;
+
+    if (base == nullptr || sliceLen == 0)
+    {
+        result = Type(0);
+        return;
+    }
+
+    for (label slicei = 0; slicei < nSlices; ++slicei)
+    {
+        result[slicei] = extract(base + slicei*sliceLen);
+    }
+}
+
+
+inline static void printTimingDetail(const UList<double>& values)
+{
+    const label numProc = values.size();  // Number of entries to print
+
+    if (numProc)  // Print nothing at all for an empty list
+    {
+        Info<< indent << "    times   " << numProc << '(';
+
+        for (label proci = 0; proci < numProc; ++proci)
         {
-            statData& x = xStats[i];
-            const statData& y = yStats[i];
-
-            // 0: min, 1: max, 2: total (or avg)
-            if (x[0].first() > y[0].first())
-            {
-                x[0] = y[0];
-            }
-            if (x[1].first() < y[1].first())
-            {
-                x[1] = y[1];
-            }
-            x[2].first() += y[2].first();
+            if (proci) Info<< ' ';  // Separator before all but the first
+            Info<< values[proci];
         }
-    };
 
-    statDataTimes times;
+        Info<< ')' << nl;  // Close the list
+    }
+}
+
+
+inline static void printTimingDetail(const UList<uint64_t>& values)
+{
+    const label numProc = values.size();  // Number of entries to print
 
-    // Master time
+    if (numProc)  // Print nothing at all for an empty list
     {
-        const double total =
-        (
-            profilingPstream::times(profilingPstream::REDUCE)
-          + profilingPstream::times(profilingPstream::GATHER)
-          + profilingPstream::times(profilingPstream::SCATTER)
-            // Include broadcast with reduce instead of all-to-all
-          + profilingPstream::times(profilingPstream::BROADCAST)
-        );
+        // Write the entries directly to the underlying std::ostream, since
+        // Ostream would convert the uint64_t values to Foam::label
+
+        auto& os = Info.stdStream();
+
+        Info<< indent << "    counts  " << numProc << '(';
+
+        for (label proci = 0; proci < numProc; ++proci)
+        {
+            if (proci) os << ' ';  // Separator before all but the first
+            os << values[proci];
+        }
+
+        Info<< ')' << nl;  // Close the list
+    }
+}
+
+} // End namespace Foam
+
+
+// * * * * * * * * * * * * * * * Member Functions  * * * * * * * * * * * * * //
+
+void Foam::functionObjects::parProfiling::report()
+{
+    const label numProc = (UPstream::parRun() ? UPstream::nProcs() : 1);
 
-        times[0] = Tuple2<double, int>(total, Pstream::myProcNo());
+    if (!profilingPstream::active() || numProc < 2)
+    {
+        return;
     }
 
-    // All time
+    // Use mpiGather on all values and perform the combinations
+    // and statistics locally. This reduces the overall number of MPI
+    // calls. For detailed output we need this information anyhow.
+
+    // NB: profilingPstream uses a FixedList for timings(), counts()
+    // so the sizes are guaranteed to be consistent and identical
+    // everywhere.
+
+    List<double> allTimes;
+    List<uint64_t> allCounts;
+
+    // Avoid disturbing the counts
+    profilingPstream::suspend();
+
     {
-        const double total =
+        // The timings
+        const auto& procTimes = profilingPstream::times();
+
+        if (Pstream::master())
+        {
+            allTimes.resize(numProc * procTimes.size());
+        }
+
+        UPstream::mpiGather
         (
-            profilingPstream::times(profilingPstream::WAIT)
-          + profilingPstream::times(profilingPstream::ALL_TO_ALL)
-          + profilingPstream::times(profilingPstream::OTHER)
+            procTimes.cdata_bytes(), // Send
+            procTimes.size_bytes(),  // Num send per proc
+            allTimes.data_bytes(),   // Recv
+            procTimes.size_bytes(),  // Num recv per proc
+            UPstream::commWorld()
         );
-
-        times[1] = Tuple2<double, int>(total, Pstream::myProcNo());
     }
 
-    // Other time
+    if (detailLevel_ > 1)
     {
-        const double total =
+        // The counts
+        const auto& procCounts = profilingPstream::counts();
+
+        if (Pstream::master())
+        {
+            allCounts.resize(numProc * procCounts.size());
+        }
+
+        UPstream::mpiGather
         (
-            profilingPstream::times(profilingPstream::OTHER)
+            procCounts.cdata_bytes(), // Send
+            procCounts.size_bytes(),  // Num send per proc
+            allCounts.data_bytes(),   // Recv
+            procCounts.size_bytes(),  // Num recv per proc
+            UPstream::commWorld()
         );
-
-        times[2] = Tuple2<double, int>(total, Pstream::myProcNo());
     }
 
-    profilingPstream::suspend();
+    profilingPstream::resume();
 
-    Pstream::combineGather(times, statsEqOp);
 
-    profilingPstream::resume();
+    // (Time, Processor) for each of: min/max/sum(avg)
+    typedef FixedList<Tuple2<double, int>, 3> statData;
+
+    // Extract min/max/average
+    auto calcStats = [](const UList<double>& data) -> statData
+    {
+        statData stats;
+        stats = Tuple2<double, int>((data.empty() ? 0 : data[0]), 0);
+
+        const label np = data.size();
+        for (label proci = 1; proci < np; ++proci)
+        {
+            Tuple2<double, int> tup(data[proci], proci);
+
+            // 0: min, 1: max, 2: total(avg)
+            if (stats[0].first() > tup.first()) stats[0] = tup;
+            if (stats[1].first() < tup.first()) stats[1] = tup;
+            stats[2].first() += tup.first();
+        }
+
+        // From total -> average value
+        if (np) { stats[2].first() /= np; }
+
+        return stats;
+    };
+
+
+    const auto printTimingStats =
+        [&](Ostream& os, const char* tag, const statData& stats)
+        {
+            os  << indent << tag << ": avg = " << stats[2].first()
+                << ", min = " << stats[0].first()
+                << " (proc " << stats[0].second() << ')'
+                << ", max = " << stats[1].first()
+                << " (proc " << stats[1].second() << ')'
+                << nl;
+        };
 
 
     if (Pstream::master())
     {
+        statData stats;
+        List<double> extractedTimes(numProc);
+        List<uint64_t> extractedCounts;
+
+        if (detailLevel_ > 1)
+        {
+            extractedCounts.resize(numProc);
+        }
+
         Info<< type() << ':' << nl
             << incrIndent;
 
+        // Total times
+        {
+            extractValues
+            (
+                extractedTimes,
+                allTimes,
+                [=](const double values[])
+                {
+                    double total = 0;
+                    for (unsigned i = 0; i < profilingPstream::nCategories; ++i)
+                    {
+                        total += values[i];
+                    }
+                    return total;
+                }
+            );
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "total     ", stats);
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+        }
+
+        // all-all
+        {
+            const int index = int(profilingPstream::ALL_TO_ALL);
+
+            extractValues(extractedTimes, index, allTimes);
+            extractValues(extractedCounts, index, allCounts);
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "all-all   ", stats);
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
+        }
+
+        // broadcast
+        {
+            const int index = int(profilingPstream::BROADCAST);
+
+            extractValues(extractedTimes, index, allTimes);
+            extractValues(extractedCounts, index, allCounts);
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "broadcast ", stats);
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
+        }
+
+        // probe
         {
-            const statData& stats = times[0];
-            double avg = stats[2].first()/Pstream::nProcs();
-
-            Info<< indent << "reduce    : avg = " << avg << 's' << nl
-                << indent << "            min = " << stats[0].first()
-                << "s (processor " << stats[0].second() << ')' << nl
-                << indent << "            max = " << stats[1].first()
-                << "s (processor " << stats[1].second() << ')' << nl;
+            const int index = int(profilingPstream::PROBE);
+
+            extractValues(extractedTimes, index, allTimes);
+            extractValues(extractedCounts, index, allCounts);
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "probe     ", stats);
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
         }
 
+        // Reduce/gather/scatter times
         {
-            const statData& stats = times[1];
-            double avg = stats[2].first()/Pstream::nProcs();
-
-            Info<< indent << "all-all   : avg = " << avg << 's' << nl
-                << indent << "            min = " << stats[0].first()
-                << "s (processor " << stats[0].second() << ')' << nl
-                << indent << "            max = " << stats[1].first()
-                << "s (processor " << stats[1].second() << ')' << nl;
+            // const int index = int(profilingPstream::REDUCE);
+
+            extractValues
+            (
+                extractedTimes,
+                allTimes,
+                [=](const double values[])
+                {
+                    return
+                    (
+                        values[profilingPstream::REDUCE]
+                      + values[profilingPstream::GATHER]
+                      + values[profilingPstream::SCATTER]
+                    );
+                }
+            );
+            extractValues
+            (
+                extractedCounts,
+                allCounts,
+                [=](const uint64_t values[])
+                {
+                    return
+                    (
+                        values[profilingPstream::REDUCE]
+                      + values[profilingPstream::GATHER]
+                      + values[profilingPstream::SCATTER]
+                    );
+                }
+            );
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "reduce    ", stats);
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
         }
 
+        // request
         {
-            const statData& stats = times[2];
-            double avg = stats[2].first()/Pstream::nProcs();
-
-            Info<< indent << "other     : avg = " << avg << 's' << nl
-                << indent << "            min = " << stats[0].first()
-                << "s (processor " << stats[0].second() << ')' << nl
-                << indent << "            max = " << stats[1].first()
-                << "s (processor " << stats[1].second() << ')' << nl;
+            const int index = int(profilingPstream::REQUEST);
+
+            extractValues(extractedTimes, index, allTimes);
+            extractValues(extractedCounts, index, allCounts);
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "request   ", stats);
+
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
+        }
+
+        // wait
+        {
+            const int index = int(profilingPstream::WAIT);
+
+            extractValues(extractedTimes, index, allTimes);
+            extractValues(extractedCounts, index, allCounts);
+            stats = calcStats(extractedTimes);
+
+            printTimingStats(Info(), "wait      ", stats);
+
+            if (detailLevel_ > 0) printTimingDetail(extractedTimes);
+            if (detailLevel_ > 1) printTimingDetail(extractedCounts);
         }
 
         Info<< decrIndent;
diff --git a/src/functionObjects/utilities/parProfiling/parProfiling.H b/src/functionObjects/utilities/parProfiling/parProfiling.H
index 0529bc01fb254fdd34936c44d4ce0fb94b123f42..4a3f37eb2adbbca7e198036b24976a5563a71879 100644
--- a/src/functionObjects/utilities/parProfiling/parProfiling.H
+++ b/src/functionObjects/utilities/parProfiling/parProfiling.H
@@ -5,7 +5,7 @@
     \\  /    A nd           | www.openfoam.com
      \\/     M anipulation  |
 -------------------------------------------------------------------------------
-    Copyright (C) 2019-2022 OpenCFD Ltd.
+    Copyright (C) 2019-2023 OpenCFD Ltd.
 -------------------------------------------------------------------------------
 License
     This file is part of OpenFOAM.
@@ -43,6 +43,7 @@ Usage
         // Report stats on exit only (instead of every time step)
         executeControl  onEnd;
         writeControl    none;
+        detail          0;
     }
     \endverbatim
 
@@ -60,10 +61,6 @@ SourceFiles
 
 namespace Foam
 {
-
-// Forward Declarations
-class Time;
-
 namespace functionObjects
 {
 
@@ -75,7 +72,15 @@ class parProfiling
 :
     public functionObject
 {
-    // Private Member Functions
+    // Private Data
+
+        //- The level of detail
+        //  0: summary, 1: per-proc times, 2: per-proc times/counts
+        int detailLevel_;
+
+public:
+
+    // Generated Methods
 
         //- No copy construct
         parProfiling(const parProfiling&) = delete;
@@ -84,8 +89,6 @@ class parProfiling
         void operator=(const parProfiling&) = delete;
 
 
-public:
-
     //- Runtime type information
     TypeName("parProfiling");
 
@@ -116,7 +119,7 @@ public:
         //- Do nothing
         virtual bool write();
 
-        //- Report
+        //- Disables profilingPstream
         virtual bool end();
 };
 
diff --git a/tutorials/compressible/rhoSimpleFoam/squareBend/system/controlDict b/tutorials/compressible/rhoSimpleFoam/squareBend/system/controlDict
index 3761a08a3d507fa1c263197b324d1688cbc4a54e..77a23d28a5a37a547aa1f5ea08b25e169e920d94 100644
--- a/tutorials/compressible/rhoSimpleFoam/squareBend/system/controlDict
+++ b/tutorials/compressible/rhoSimpleFoam/squareBend/system/controlDict
@@ -54,6 +54,7 @@ functions
     // #include "sampleCellCentres"
     #include "isentropicTotalPressure"
     #include "wallHeatFlux"
+    #include "profiling"
 }
 
 
diff --git a/tutorials/compressible/rhoSimpleFoam/squareBend/system/profiling b/tutorials/compressible/rhoSimpleFoam/squareBend/system/profiling
new file mode 100644
index 0000000000000000000000000000000000000000..9e1a10e62c2fa01b2b969c81b639188a402f53ad
--- /dev/null
+++ b/tutorials/compressible/rhoSimpleFoam/squareBend/system/profiling
@@ -0,0 +1,10 @@
+// -*- C++ -*-
+
+profiling
+{
+    #includeEtc "caseDicts/profiling/parallel.cfg"
+    detail  2;  // 2: per-processor times and counts
+}
+
+
+// ************************************************************************* //
diff --git a/tutorials/incompressible/simpleFoam/motorBike/system/profiling b/tutorials/incompressible/simpleFoam/motorBike/system/profiling
index 7efc5a84417c28479a7cc79b295645a10f1ef3bc..b762c1cd79b39ed1beb4d8e67e13bb16df94480a 100644
--- a/tutorials/incompressible/simpleFoam/motorBike/system/profiling
+++ b/tutorials/incompressible/simpleFoam/motorBike/system/profiling
@@ -2,14 +2,8 @@
 
 profiling
 {
-    type  parProfiling;
-
-    libs  (utilityFunctionObjects);
-
-    // Report stats on exit only (instead of every time step)
-    executeControl  onEnd;
-    writeControl    none;
+    #includeEtc "caseDicts/profiling/parallel.cfg"
+    detail  1;  // 1: per-processor times
 }
 
-
 // ************************************************************************* //
diff --git a/tutorials/incompressible/simpleFoam/windAroundBuildings/system/profiling b/tutorials/incompressible/simpleFoam/windAroundBuildings/system/profiling
index 7efc5a84417c28479a7cc79b295645a10f1ef3bc..b762c1cd79b39ed1beb4d8e67e13bb16df94480a 100644
--- a/tutorials/incompressible/simpleFoam/windAroundBuildings/system/profiling
+++ b/tutorials/incompressible/simpleFoam/windAroundBuildings/system/profiling
@@ -2,14 +2,8 @@
 
 profiling
 {
-    type  parProfiling;
-
-    libs  (utilityFunctionObjects);
-
-    // Report stats on exit only (instead of every time step)
-    executeControl  onEnd;
-    writeControl    none;
+    #includeEtc "caseDicts/profiling/parallel.cfg"
+    detail  1;  // 1: per-processor times
 }
 
-
 // ************************************************************************* //