Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/dotnet/runtime.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKoundinya Veluri <kouvel@users.noreply.github.com>2021-07-13 18:59:20 +0300
committerGitHub <noreply@github.com>2021-07-13 18:59:20 +0300
commit4ff3762a2e08245a5f005beffeb42cca5c4b6c10 (patch)
treee2f3e18cee966f687bad55934a347f68b1641ae2 /src/coreclr/vm
parentb2a670b5421af99edad62f8cb5ae172cd8d05030 (diff)
Update spin-wait pause/yield normalization (#55295)
Update spin-wait pause/yield normalization - Modified the measurement to use much less time and to remeasure periodically to reduce CPU usage during startup - Each measurement does a low-microsecond-level measurement of pause/yield times - A small amount of history of recent measurements is retained and, for now, the lowest of those measurements is used for normalization - Measurements are done lazily, and another measurement is taken at most once every few seconds - Added a profiling event that includes info about a measurement and the established value from recent measurements that is used for normalization
Diffstat (limited to 'src/coreclr/vm')
-rw-r--r--src/coreclr/vm/CMakeLists.txt2
-rw-r--r--src/coreclr/vm/ClrEtwAll.man27
-rw-r--r--src/coreclr/vm/ClrEtwAllMeta.lst8
-rw-r--r--src/coreclr/vm/comsynchronizable.cpp17
-rw-r--r--src/coreclr/vm/comsynchronizable.h2
-rw-r--r--src/coreclr/vm/ecalllist.h2
-rw-r--r--src/coreclr/vm/eventtrace.cpp6
-rw-r--r--src/coreclr/vm/finalizerthread.cpp5
-rw-r--r--src/coreclr/vm/threads.cpp9
-rw-r--r--src/coreclr/vm/yieldprocessornormalized.cpp305
10 files changed, 301 insertions, 82 deletions
diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt
index 9c2cb3df0b7..f31e5a3ca12 100644
--- a/src/coreclr/vm/CMakeLists.txt
+++ b/src/coreclr/vm/CMakeLists.txt
@@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON
versionresilienthashcode.cpp
virtualcallstub.cpp
win32threadpool.cpp
- yieldprocessornormalized.cpp
zapsig.cpp
)
@@ -389,6 +388,7 @@ set(VM_SOURCES_WKS
threadsuspend.cpp
typeparse.cpp
weakreferencenative.cpp
+ yieldprocessornormalized.cpp
${VM_SOURCES_GDBJIT}
)
diff --git a/src/coreclr/vm/ClrEtwAll.man b/src/coreclr/vm/ClrEtwAll.man
index 45895f16fce..0eed049c17b 100644
--- a/src/coreclr/vm/ClrEtwAll.man
+++ b/src/coreclr/vm/ClrEtwAll.man
@@ -438,7 +438,13 @@
<opcode name="Profiler" message="$(string.RuntimePublisher.ProfilerOpcodeMessage)" symbol="CLR_PROFILER_OPCODE" value="11"/>
</opcodes>
</task>
- <!--Next available ID is 37-->
+ <task name="YieldProcessorMeasurement" symbol="CLR_YIELD_PROCESSOR_MEASUREMENT_TASK"
+ value="37" eventGUID="{B4AFC324-DECE-4B02-86DC-AAB8F22BC1B1}"
+ message="$(string.RuntimePublisher.YieldProcessorMeasurementTaskMessage)">
+ <opcodes>
+ </opcodes>
+ </task>
+ <!--Next available ID is 38-->
</tasks>
<!--Maps-->
<maps>
@@ -2916,6 +2922,19 @@
</Settings>
</UserData>
</template>
+
+ <template tid="YieldProcessorMeasurement">
+ <data name="ClrInstanceID" inType="win:UInt16"/>
+ <data name="NsPerYield" inType="win:Double"/>
+ <data name="EstablishedNsPerYield" inType="win:Double"/>
+ <UserData>
+ <Settings xmlns="myNs">
+ <ClrInstanceID> %1 </ClrInstanceID>
+ <NsPerYield> %2 </NsPerYield>
+ <EstablishedNsPerYield> %3 </EstablishedNsPerYield>
+ </Settings>
+ </UserData>
+ </template>
</templates>
<events>
@@ -3313,6 +3332,10 @@
keywords ="ThreadingKeyword" opcode="Wait"
task="ThreadPoolWorkerThread"
symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/>
+
+ <event value="58" version="0" level="win:Informational" template="YieldProcessorMeasurement"
+ keywords="ThreadingKeyword" task="YieldProcessorMeasurement" opcode="win:Info"
+ symbol="YieldProcessorMeasurement" message="$(string.RuntimePublisher.YieldProcessorMeasurementEventMessage)"/>
<!-- CLR private ThreadPool events -->
<event value="60" version="0" level="win:Verbose" template="ThreadPoolWorkingThreadCount"
@@ -8049,6 +8072,7 @@
<string id="RuntimePublisher.WorkerThreadRetirementRetireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.WorkerThreadRetirementUnretireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreadCount=%2;%nClrInstanceID=%3" />
+ <string id="RuntimePublisher.YieldProcessorMeasurementEventMessage" value="ClrInstanceID=%1;%nNsPerYield=%2;%nEstablishedNsPerYield=%3" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentSampleEventMessage" value="Throughput=%1;%nClrInstanceID=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentAdjustmentEventMessage" value="AverageThroughput=%1;%nNewWorkerThreadCount=%2;%nReason=%3;%nClrInstanceID=%4" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentStatsEventMessage" value="Duration=%1;%nThroughput=%2;%nThreadWave=%3;%nThroughputWave=%4;%nThroughputErrorEstimate=%5;%nAverageThroughputErrorEstimate=%6;%nThroughputRatio=%7;%nConfidence=%8;%nNewControlSetting=%9;%nNewThreadWaveMagnitude=%10;%nClrInstanceID=%11" />
@@ -8334,6 +8358,7 @@
<string id="RuntimePublisher.JitInstrumentationDataTaskMessage" value="JitInstrumentationData" />
<string id="RuntimePublisher.ExecutionCheckpointTaskMessage" value="ExecutionCheckpoint" />
<string id="RuntimePublisher.ProfilerTaskMessage" value="Profiler" />
+ <string id="RuntimePublisher.YieldProcessorMeasurementTaskMessage" value="YieldProcessorMeasurement" />
<string id="RundownPublisher.EEStartupTaskMessage" value="Runtime" />
<string id="RundownPublisher.MethodTaskMessage" value="Method" />
diff --git a/src/coreclr/vm/ClrEtwAllMeta.lst b/src/coreclr/vm/ClrEtwAllMeta.lst
index 4ac4fe405d9..9c5738ef43d 100644
--- a/src/coreclr/vm/ClrEtwAllMeta.lst
+++ b/src/coreclr/vm/ClrEtwAllMeta.lst
@@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2
nostack:Type:::BulkType
-###################
-# Threadpool events
-###################
+#################################
+# Threading and Threadpool events
+#################################
nomac:WorkerThreadCreation:::WorkerThreadCreate
noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate
nomac:WorkerThreadCreation:::WorkerThreadTerminate
@@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
+nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement
+nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement
##################
# Exception events
diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp
index 39f00d06741..15a33c711e7 100644
--- a/src/coreclr/vm/comsynchronizable.cpp
+++ b/src/coreclr/vm/comsynchronizable.cpp
@@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread)
}
FCIMPLEND
-INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
+FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
{
- QCALL_CONTRACT;
-
- INT32 optimalMaxNormalizedYieldsPerSpinIteration;
-
- BEGIN_QCALL;
-
- // RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
- EnsureYieldProcessorNormalizedInitialized();
- optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;
-
- END_QCALL;
+ FCALL_CONTRACT;
- return optimalMaxNormalizedYieldsPerSpinIteration;
+ return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration();
}
+FCIMPLEND
FCIMPL1(void, ThreadNative::SpinWait, int iterations)
{
diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h
index e9968201b8b..cfab18d9010 100644
--- a/src/coreclr/vm/comsynchronizable.h
+++ b/src/coreclr/vm/comsynchronizable.h
@@ -86,7 +86,7 @@ public:
UINT64 QCALLTYPE GetProcessDefaultStackSize();
static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th);
- static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
+ static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration);
static FCDECL1(void, SpinWait, int iterations);
static BOOL QCALLTYPE YieldThread();
static FCDECL0(Object*, GetCurrentThread);
diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h
index f77dc75c80b..ea3f65d7291 100644
--- a/src/coreclr/vm/ecalllist.h
+++ b/src/coreclr/vm/ecalllist.h
@@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs)
#endif // FEATURE_COMINTEROP
FCFuncElement("Interrupt", ThreadNative::Interrupt)
FCFuncElement("Join", ThreadNative::Join)
- QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
+ FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber)
FCFuncEnd()
diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp
index ac7be2a9439..aded74deda6 100644
--- a/src/coreclr/vm/eventtrace.cpp
+++ b/src/coreclr/vm/eventtrace.cpp
@@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
{
ETW::TypeSystemLog::OnKeywordsChanged();
}
+
+ if (g_fEEStarted && !g_fEEShutDown)
+ {
+ // Emit the YieldProcessor measured values at the beginning of the trace
+ YieldProcessorNormalization::FireMeasurementEvents();
+ }
}
// Individual callbacks for each EventPipe provider.
diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp
index 1e4dbf913c8..e8370315e66 100644
--- a/src/coreclr/vm/finalizerthread.cpp
+++ b/src/coreclr/vm/finalizerthread.cpp
@@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
{
GetFinalizerThread()->SetBackground(TRUE);
- {
- GCX_PREEMP();
- EnsureYieldProcessorNormalizedInitialized();
- }
-
while (!fQuitFinalizer)
{
// This will apply any policy for swallowing exceptions during normal
diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp
index 2c55f8770b0..c6485b86d59 100644
--- a/src/coreclr/vm/threads.cpp
+++ b/src/coreclr/vm/threads.cpp
@@ -1150,8 +1150,6 @@ void InitThreadManager()
}
CONTRACTL_END;
- InitializeYieldProcessorNormalizedCrst();
-
// All patched helpers should fit into one page.
// If you hit this assert on retail build, there is most likely problem with BBT script.
_ASSERTE_ALL_BUILDS("clr/src/VM/threads.cpp", (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart > (ptrdiff_t)0);
@@ -7194,6 +7192,7 @@ BOOL Thread::HaveExtraWorkForFinalizer()
|| Thread::CleanupNeededForFinalizedThread()
|| (m_DetachCount > 0)
|| SystemDomain::System()->RequireAppDomainCleanup()
+ || YieldProcessorNormalization::IsMeasurementScheduled()
|| ThreadStore::s_pThreadStore->ShouldTriggerGCForDeadThreads();
}
@@ -7240,6 +7239,12 @@ void Thread::DoExtraWorkForFinalizer()
// If there were any TimerInfos waiting to be released, they'll get flushed now
ThreadpoolMgr::FlushQueueOfTimerInfos();
+ if (YieldProcessorNormalization::IsMeasurementScheduled())
+ {
+ GCX_PREEMP();
+ YieldProcessorNormalization::PerformMeasurement();
+ }
+
ThreadStore::s_pThreadStore->TriggerGCForDeadThreadsIfNecessary();
}
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 91547923310..2c51e73b678 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -2,17 +2,33 @@
// The .NET Foundation licenses this file to you under the MIT license.
#include "common.h"
+#include "yieldprocessornormalized.h"
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+#ifndef CROSSGEN_COMPILE
-void InitializeYieldProcessorNormalizedCrst()
+#include "finalizerthread.h"
+
+enum class NormalizationState : UINT8
{
- WRAPPER_NO_CONTRACT;
- s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
-}
+ Uninitialized,
+ Initialized,
+ Failed
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-static void InitializeYieldProcessorNormalized()
+static unsigned int DetermineMeasureDurationUs()
{
CONTRACTL
{
@@ -22,92 +38,271 @@ static void InitializeYieldProcessorNormalized()
}
CONTRACTL_END;
- CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+ _ASSERTE(s_normalizationState != NormalizationState::Failed);
- if (s_isYieldProcessorNormalizedInitialized)
+ // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+ // if the overhead seems high relative to the measure duration.
+ unsigned int measureDurationUs = 1;
+ LARGE_INTEGER li;
+ QueryPerformanceCounter(&li);
+ UINT64 startTicks = li.QuadPart;
+ QueryPerformanceCounter(&li);
+ UINT64 elapsedTicks = li.QuadPart - startTicks;
+ if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
{
- return;
+ measureDurationUs *= 4;
+ }
+ return measureDurationUs;
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_PREEMPTIVE;
}
+ CONTRACTL_END;
- // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
- // Intel post-Skylake processor: measured typically 125-150 cycles per yield
- const int MeasureDurationMs = 10;
- const int NsPerSecond = 1000 * 1000 * 1000;
+ _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+ int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
+ UINT64 ticksPerS = s_performanceCounterTicksPerS;
+ UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
LARGE_INTEGER li;
- if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+ QueryPerformanceCounter(&li);
+ UINT64 startTicks = li.QuadPart;
+
+ for (int i = 0; i < yieldCount; ++i)
{
- // High precision clock not available or clock resolution is too low, resort to defaults
- s_isYieldProcessorNormalizedInitialized = true;
- return;
+ System_YieldProcessor();
}
- ULONGLONG ticksPerSecond = li.QuadPart;
- // Measure the nanosecond delay per yield
- ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
- unsigned int yieldCount = 0;
QueryPerformanceCounter(&li);
- ULONGLONG startTicks = li.QuadPart;
- ULONGLONG elapsedTicks;
- do
- {
- // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
- // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
- // low microsecond range.
- for (int i = 0; i < 1000; ++i)
+ UINT64 elapsedTicks = li.QuadPart - startTicks;
+ while (elapsedTicks < measureDurationTicks)
+ {
+ int nextYieldCount =
+ Max(4,
+ elapsedTicks == 0
+ ? yieldCount / 4
+ : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+ for (int i = 0; i < nextYieldCount; ++i)
{
System_YieldProcessor();
}
- yieldCount += 1000;
QueryPerformanceCounter(&li);
- ULONGLONG nowTicks = li.QuadPart;
- elapsedTicks = nowTicks - startTicks;
- } while (elapsedTicks < measureDurationTicks);
- double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
- if (nsPerYield < 1)
+ elapsedTicks = li.QuadPart - startTicks;
+ yieldCount += nextYieldCount;
+ }
+
+ // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+ const double MinNsPerYield = 0.1;
+
+ // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+ // really take this long. Limit the maximum to keep the recorded values reasonable.
+ const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+ return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_PREEMPTIVE;
+ }
+ CONTRACTL_END;
+
+ _ASSERTE(s_isMeasurementScheduled);
+
+ double latestNsPerYield;
+ if (s_normalizationState == NormalizationState::Initialized)
{
- nsPerYield = 1;
+ if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+ {
+ return;
+ }
+
+ int nextMeasurementIndex = s_nextMeasurementIndex;
+ latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+ AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
+ if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+ {
+ nextMeasurementIndex = 0;
+ }
+ s_nextMeasurementIndex = nextMeasurementIndex;
}
+ else if (s_normalizationState == NormalizationState::Uninitialized)
+ {
+ LARGE_INTEGER li;
+ if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+ {
+ // High precision clock not available or clock resolution is too low, resort to defaults
+ s_normalizationState = NormalizationState::Failed;
+ return;
+ }
+ s_performanceCounterTicksPerS = li.QuadPart;
+
+ unsigned int measureDurationUs = DetermineMeasureDurationUs();
+ for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+ {
+ latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+ AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+ if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
+ {
+ AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+ }
- // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
- // value is naturally limited to MinNsPerNormalizedYield.
- int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
- if (yieldsPerNormalizedYield < 1)
+ if (i < NsPerYieldMeasurementCount - 1)
+ {
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+ }
+ }
+ }
+ else
{
- yieldsPerNormalizedYield = 1;
+ _ASSERTE(s_normalizationState == NormalizationState::Failed);
+ return;
}
- _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+
+ double establishedNsPerYield = s_nsPerYieldMeasurements[0];
+ for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+ {
+ double nsPerYield = s_nsPerYieldMeasurements[i];
+ if (nsPerYield < establishedNsPerYield)
+ {
+ establishedNsPerYield = nsPerYield;
+ }
+ }
+ if (establishedNsPerYield != s_establishedNsPerYield)
+ {
+ AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+ }
+
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+ // Calculate the number of yields required to span the duration of a normalized yield
+ unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+ _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+ s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
- int optimalMaxNormalizedYieldsPerSpinIteration =
- (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
- if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+ s_optimalMaxNormalizedYieldsPerSpinIteration =
+ Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+ _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+ GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+ s_previousNormalizationTimeMs = GetTickCount();
+ s_normalizationState = NormalizationState::Initialized;
+ s_isMeasurementScheduled = false;
+}
+
+#endif // !CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+ CONTRACTL
{
- optimalMaxNormalizedYieldsPerSpinIteration = 1;
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_ANY;
}
+ CONTRACTL_END;
- g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
- g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
- s_isYieldProcessorNormalizedInitialized = true;
+#ifndef CROSSGEN_COMPILE
+ NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+ if (normalizationState == NormalizationState::Initialized)
+ {
+ if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+ {
+ return;
+ }
+ }
+ else if (normalizationState == NormalizationState::Uninitialized)
+ {
+ }
+ else
+ {
+ _ASSERTE(normalizationState == NormalizationState::Failed);
+ return;
+ }
- GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+ // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
+ if (s_isMeasurementScheduled || !g_fEEStarted)
+ {
+ return;
+ }
+
+ s_isMeasurementScheduled = true;
+ FinalizerThread::EnableFinalization();
+#endif // !CROSSGEN_COMPILE
}
-void EnsureYieldProcessorNormalizedInitialized()
+#ifndef CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::FireMeasurementEvents()
{
CONTRACTL
{
NOTHROW;
GC_NOTRIGGER;
- MODE_PREEMPTIVE;
+ MODE_ANY;
}
CONTRACTL_END;
- if (!s_isYieldProcessorNormalizedInitialized)
+ if (!EventEnabledYieldProcessorMeasurement())
{
- InitializeYieldProcessorNormalized();
+ return;
}
+
+ // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+ // recorded information, so try to enumerate the array with some care.
+ double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
+ int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
+ for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+ {
+ double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+ if (nsPerYield != 0) // the array may not be fully initialized yet
+ {
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+ }
+
+ if (++nextIndex >= NsPerYieldMeasurementCount)
+ {
+ nextIndex = 0;
+ }
+ }
+}
+
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
+{
+ WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+ return VolatileLoadWithoutBarrier(valueRef);
+#else
+ return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
}
+
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+ WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+ *valueRef = value;
+#else
+ InterlockedExchangeT(valueRef, value);
+#endif
+}
+
+#endif // !CROSSGEN_COMPILE