github.com/dotnet/runtime.git
-rw-r--r--  src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs        |  23
-rw-r--r--  src/coreclr/inc/yieldprocessornormalized.h                                        | 120
-rw-r--r--  src/coreclr/utilcode/yieldprocessornormalized.cpp                                 |  16
-rw-r--r--  src/coreclr/vm/CMakeLists.txt                                                     |   2
-rw-r--r--  src/coreclr/vm/ClrEtwAll.man                                                      |  27
-rw-r--r--  src/coreclr/vm/ClrEtwAllMeta.lst                                                  |   8
-rw-r--r--  src/coreclr/vm/comsynchronizable.cpp                                              |  17
-rw-r--r--  src/coreclr/vm/comsynchronizable.h                                                |   2
-rw-r--r--  src/coreclr/vm/ecalllist.h                                                        |   2
-rw-r--r--  src/coreclr/vm/eventtrace.cpp                                                     |   6
-rw-r--r--  src/coreclr/vm/finalizerthread.cpp                                                |   5
-rw-r--r--  src/coreclr/vm/threads.cpp                                                        |   9
-rw-r--r--  src/coreclr/vm/yieldprocessornormalized.cpp                                       | 305
-rw-r--r--  src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs   |   4
-rw-r--r--  src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs             |   4
15 files changed, 412 insertions, 138 deletions
diff --git a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
index c1a968ed9f9..83be00cf5e0 100644
--- a/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
+++ b/src/coreclr/System.Private.CoreLib/src/System/Threading/Thread.CoreCLR.cs
@@ -322,33 +322,14 @@ namespace System.Threading
[MethodImpl(MethodImplOptions.InternalCall)]
public extern bool Join(int millisecondsTimeout);
- private static int s_optimalMaxSpinWaitsPerSpinIteration;
-
- [DllImport(RuntimeHelpers.QCall)]
- private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();
-
/// <summary>
/// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
/// appropriate for the processor.
/// </summary>
internal static int OptimalMaxSpinWaitsPerSpinIteration
{
- get
- {
- int optimalMaxSpinWaitsPerSpinIteration = s_optimalMaxSpinWaitsPerSpinIteration;
- return optimalMaxSpinWaitsPerSpinIteration != 0 ? optimalMaxSpinWaitsPerSpinIteration : CalculateOptimalMaxSpinWaitsPerSpinIteration();
- }
- }
-
- [MethodImpl(MethodImplOptions.NoInlining)]
- private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
- {
- // This is done lazily because the first call to the function below in the process triggers a measurement that
- // takes a nontrivial amount of time if the measurement has not already been done in the background.
- // See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
- s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
- Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
- return s_optimalMaxSpinWaitsPerSpinIteration;
+ [MethodImpl(MethodImplOptions.InternalCall)]
+ get;
}
[MethodImpl(MethodImplOptions.InternalCall)]
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index ba349bb83ad..121e60b0333 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -12,14 +12,59 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#endif
#define YieldProcessor Dont_Use_YieldProcessor
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+#define DISABLE_COPY(T) \
+ T(const T &) = delete; \
+ T &operator =(const T &) = delete
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+#define DISABLE_CONSTRUCT_COPY(T) \
+ T() = delete; \
+ DISABLE_COPY(T)
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
+class YieldProcessorNormalization
+{
+public:
+ static const unsigned int TargetNsPerNormalizedYield = 37;
+ static const unsigned int TargetMaxNsPerSpinIteration = 272;
+
+ // These are maximums for the computed values for normalization, based on how they are calculated
+ static const unsigned int MaxYieldsPerNormalizedYield = TargetNsPerNormalizedYield * 10;
+ static const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+ TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;
+
+private:
+ static bool s_isMeasurementScheduled;
+
+ static unsigned int s_yieldsPerNormalizedYield;
+ static unsigned int s_optimalMaxNormalizedYieldsPerSpinIteration;
+
+public:
+ static bool IsMeasurementScheduled()
+ {
+ return s_isMeasurementScheduled;
+ }
+
+ static void PerformMeasurement();
+
+private:
+ static void ScheduleMeasurementIfNecessary();
+
+public:
+ static unsigned int GetOptimalMaxNormalizedYieldsPerSpinIteration()
+ {
+ return s_optimalMaxNormalizedYieldsPerSpinIteration;
+ }
+
+ static void FireMeasurementEvents();
+
+private:
+ static double AtomicLoad(double *valueRef);
+ static void AtomicStore(double *valueRef, double value);
+
+ DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);
+
+ friend class YieldProcessorNormalizationInfo;
+ friend void YieldProcessorNormalizedForPreSkylakeCount(unsigned int);
+};
class YieldProcessorNormalizationInfo
{
@@ -30,12 +75,15 @@ private:
public:
YieldProcessorNormalizationInfo()
- : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
- optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+ : yieldsPerNormalizedYield(YieldProcessorNormalization::s_yieldsPerNormalizedYield),
+ optimalMaxNormalizedYieldsPerSpinIteration(YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration),
optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
{
+ YieldProcessorNormalization::ScheduleMeasurementIfNecessary();
}
+ DISABLE_COPY(YieldProcessorNormalizationInfo);
+
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
@@ -98,9 +146,8 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
- // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
- // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
- const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+ // On platforms with a small SIZE_T, prevent overflow on the multiply below
+ const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
@@ -144,9 +191,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
- // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
- // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
- const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+ // On platforms with a small SIZE_T, prevent overflow on the multiply below
+ const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
preSkylakeCount = MaxCount;
@@ -175,7 +221,35 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
// }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
{
- YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
+ // This function does not forward to the one above because it is used by some code under utilcode, where
+ // YieldProcessorNormalizationInfo cannot be used since normalization does not happen in some of its consumers. So this
+ // version uses the fields in YieldProcessorNormalization directly.
+
+ _ASSERTE(preSkylakeCount != 0);
+
+ if (sizeof(SIZE_T) <= sizeof(unsigned int))
+ {
+ // On platforms with a small SIZE_T, prevent overflow on the multiply below
+ const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
+ if (preSkylakeCount > MaxCount)
+ {
+ preSkylakeCount = MaxCount;
+ }
+ }
+
+ const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+ SIZE_T n =
+ (SIZE_T)preSkylakeCount *
+ YieldProcessorNormalization::s_yieldsPerNormalizedYield /
+ PreSkylakeCountToSkylakeCountDivisor;
+ if (n == 0)
+ {
+ n = 1;
+ }
+ do
+ {
+ System_YieldProcessor();
+ } while (--n != 0);
}
// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
@@ -193,15 +267,12 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
const YieldProcessorNormalizationInfo &normalizationInfo,
unsigned int spinIteration)
{
- // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
- // InitializeYieldProcessorNormalized()
- const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
- NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
- _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
- // This shift value should be adjusted based on the asserted condition below
+ // This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
- static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+ static_assert_no_msg(
+ ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+ static_assert_no_msg(
+ ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
unsigned int n;
if (spinIteration <= MaxShift &&
@@ -219,3 +290,6 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
System_YieldProcessor();
} while (--n != 0);
}
+
+#undef DISABLE_CONSTRUCT_COPY
+#undef DISABLE_COPY
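
For context on how this header is meant to be consumed: a caller typically constructs YieldProcessorNormalizationInfo once outside its wait loop (which also gives ScheduleMeasurementIfNecessary a chance to run) and then backs off through it. A minimal sketch of that pattern, assuming the header is included and a caller-supplied condition() predicate; the loop shape and iteration cap are illustrative and not taken from this change:

    // Hypothetical spin-wait consumer of yieldprocessornormalized.h (illustrative only).
    bool SpinUntil(bool (*condition)())
    {
        YieldProcessorNormalizationInfo normalizationInfo; // may schedule a background re-measurement
        for (unsigned int spinIteration = 0; spinIteration < 16; ++spinIteration)
        {
            if (condition())
            {
                return true;
            }

            // Pauses for roughly min(1 << spinIteration, optimal max) normalized yields,
            // where a "normalized yield" targets ~37 ns regardless of processor generation.
            YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
        }
        return condition();
    }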
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 4242f82792b..020d8d7cc79 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -2,8 +2,16 @@
// The .NET Foundation licenses this file to you under the MIT license.
#include "stdafx.h"
+#include "yieldprocessornormalized.h"
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
+bool YieldProcessorNormalization::s_isMeasurementScheduled;
+
+// Defaults are for when normalization has not yet been done
+unsigned int YieldProcessorNormalization::s_yieldsPerNormalizedYield = 1;
+unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration =
+ (unsigned int)
+ (
+ (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration /
+ YieldProcessorNormalization::TargetNsPerNormalizedYield +
+ 0.5
+ );
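
As a quick check on the initializer above: 272 / 37 ≈ 7.35, adding 0.5 gives 7.85, and the truncating cast to unsigned int produces 7, the same value as the previous hard-coded default of g_optimalMaxNormalizedYieldsPerSpinIteration = 7.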
diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt
index 9c2cb3df0b7..f31e5a3ca12 100644
--- a/src/coreclr/vm/CMakeLists.txt
+++ b/src/coreclr/vm/CMakeLists.txt
@@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON
versionresilienthashcode.cpp
virtualcallstub.cpp
win32threadpool.cpp
- yieldprocessornormalized.cpp
zapsig.cpp
)
@@ -389,6 +388,7 @@ set(VM_SOURCES_WKS
threadsuspend.cpp
typeparse.cpp
weakreferencenative.cpp
+ yieldprocessornormalized.cpp
${VM_SOURCES_GDBJIT}
)
diff --git a/src/coreclr/vm/ClrEtwAll.man b/src/coreclr/vm/ClrEtwAll.man
index 45895f16fce..0eed049c17b 100644
--- a/src/coreclr/vm/ClrEtwAll.man
+++ b/src/coreclr/vm/ClrEtwAll.man
@@ -438,7 +438,13 @@
<opcode name="Profiler" message="$(string.RuntimePublisher.ProfilerOpcodeMessage)" symbol="CLR_PROFILER_OPCODE" value="11"/>
</opcodes>
</task>
- <!--Next available ID is 37-->
+ <task name="YieldProcessorMeasurement" symbol="CLR_YIELD_PROCESSOR_MEASUREMENT_TASK"
+ value="37" eventGUID="{B4AFC324-DECE-4B02-86DC-AAB8F22BC1B1}"
+ message="$(string.RuntimePublisher.YieldProcessorMeasurementTaskMessage)">
+ <opcodes>
+ </opcodes>
+ </task>
+ <!--Next available ID is 38-->
</tasks>
<!--Maps-->
<maps>
@@ -2916,6 +2922,19 @@
</Settings>
</UserData>
</template>
+
+ <template tid="YieldProcessorMeasurement">
+ <data name="ClrInstanceID" inType="win:UInt16"/>
+ <data name="NsPerYield" inType="win:Double"/>
+ <data name="EstablishedNsPerYield" inType="win:Double"/>
+ <UserData>
+ <Settings xmlns="myNs">
+ <ClrInstanceID> %1 </ClrInstanceID>
+ <NsPerYield> %2 </NsPerYield>
+ <EstablishedNsPerYield> %3 </EstablishedNsPerYield>
+ </Settings>
+ </UserData>
+ </template>
</templates>
<events>
@@ -3313,6 +3332,10 @@
keywords ="ThreadingKeyword" opcode="Wait"
task="ThreadPoolWorkerThread"
symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/>
+
+ <event value="58" version="0" level="win:Informational" template="YieldProcessorMeasurement"
+ keywords="ThreadingKeyword" task="YieldProcessorMeasurement" opcode="win:Info"
+ symbol="YieldProcessorMeasurement" message="$(string.RuntimePublisher.YieldProcessorMeasurementEventMessage)"/>
<!-- CLR private ThreadPool events -->
<event value="60" version="0" level="win:Verbose" template="ThreadPoolWorkingThreadCount"
@@ -8049,6 +8072,7 @@
<string id="RuntimePublisher.WorkerThreadRetirementRetireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.WorkerThreadRetirementUnretireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreadCount=%2;%nClrInstanceID=%3" />
+ <string id="RuntimePublisher.YieldProcessorMeasurementEventMessage" value="ClrInstanceID=%1;%nNsPerYield=%2;%nEstablishedNsPerYield=%3" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentSampleEventMessage" value="Throughput=%1;%nClrInstanceID=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentAdjustmentEventMessage" value="AverageThroughput=%1;%nNewWorkerThreadCount=%2;%nReason=%3;%nClrInstanceID=%4" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentStatsEventMessage" value="Duration=%1;%nThroughput=%2;%nThreadWave=%3;%nThroughputWave=%4;%nThroughputErrorEstimate=%5;%nAverageThroughputErrorEstimate=%6;%nThroughputRatio=%7;%nConfidence=%8;%nNewControlSetting=%9;%nNewThreadWaveMagnitude=%10;%nClrInstanceID=%11" />
@@ -8334,6 +8358,7 @@
<string id="RuntimePublisher.JitInstrumentationDataTaskMessage" value="JitInstrumentationData" />
<string id="RuntimePublisher.ExecutionCheckpointTaskMessage" value="ExecutionCheckpoint" />
<string id="RuntimePublisher.ProfilerTaskMessage" value="Profiler" />
+ <string id="RuntimePublisher.YieldProcessorMeasurementTaskMessage" value="YieldProcessorMeasurement" />
<string id="RundownPublisher.EEStartupTaskMessage" value="Runtime" />
<string id="RundownPublisher.MethodTaskMessage" value="Method" />
diff --git a/src/coreclr/vm/ClrEtwAllMeta.lst b/src/coreclr/vm/ClrEtwAllMeta.lst
index 4ac4fe405d9..9c5738ef43d 100644
--- a/src/coreclr/vm/ClrEtwAllMeta.lst
+++ b/src/coreclr/vm/ClrEtwAllMeta.lst
@@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2
nostack:Type:::BulkType
-###################
-# Threadpool events
-###################
+#################################
+# Threading and Threadpool events
+#################################
nomac:WorkerThreadCreation:::WorkerThreadCreate
noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate
nomac:WorkerThreadCreation:::WorkerThreadTerminate
@@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
+nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement
+nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement
##################
# Exception events
diff --git a/src/coreclr/vm/comsynchronizable.cpp b/src/coreclr/vm/comsynchronizable.cpp
index 39f00d06741..15a33c711e7 100644
--- a/src/coreclr/vm/comsynchronizable.cpp
+++ b/src/coreclr/vm/comsynchronizable.cpp
@@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread)
}
FCIMPLEND
-INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
+FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
{
- QCALL_CONTRACT;
-
- INT32 optimalMaxNormalizedYieldsPerSpinIteration;
-
- BEGIN_QCALL;
-
- // RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
- EnsureYieldProcessorNormalizedInitialized();
- optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;
-
- END_QCALL;
+ FCALL_CONTRACT;
- return optimalMaxNormalizedYieldsPerSpinIteration;
+ return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration();
}
+FCIMPLEND
FCIMPL1(void, ThreadNative::SpinWait, int iterations)
{
diff --git a/src/coreclr/vm/comsynchronizable.h b/src/coreclr/vm/comsynchronizable.h
index e9968201b8b..cfab18d9010 100644
--- a/src/coreclr/vm/comsynchronizable.h
+++ b/src/coreclr/vm/comsynchronizable.h
@@ -86,7 +86,7 @@ public:
UINT64 QCALLTYPE GetProcessDefaultStackSize();
static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th);
- static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
+ static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration);
static FCDECL1(void, SpinWait, int iterations);
static BOOL QCALLTYPE YieldThread();
static FCDECL0(Object*, GetCurrentThread);
diff --git a/src/coreclr/vm/ecalllist.h b/src/coreclr/vm/ecalllist.h
index f77dc75c80b..ea3f65d7291 100644
--- a/src/coreclr/vm/ecalllist.h
+++ b/src/coreclr/vm/ecalllist.h
@@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs)
#endif // FEATURE_COMINTEROP
FCFuncElement("Interrupt", ThreadNative::Interrupt)
FCFuncElement("Join", ThreadNative::Join)
- QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
+ FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber)
FCFuncEnd()
diff --git a/src/coreclr/vm/eventtrace.cpp b/src/coreclr/vm/eventtrace.cpp
index ac7be2a9439..aded74deda6 100644
--- a/src/coreclr/vm/eventtrace.cpp
+++ b/src/coreclr/vm/eventtrace.cpp
@@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
{
ETW::TypeSystemLog::OnKeywordsChanged();
}
+
+ if (g_fEEStarted && !g_fEEShutDown)
+ {
+ // Emit the YieldProcessor measured values at the beginning of the trace
+ YieldProcessorNormalization::FireMeasurementEvents();
+ }
}
// Individual callbacks for each EventPipe provider.
diff --git a/src/coreclr/vm/finalizerthread.cpp b/src/coreclr/vm/finalizerthread.cpp
index 1e4dbf913c8..e8370315e66 100644
--- a/src/coreclr/vm/finalizerthread.cpp
+++ b/src/coreclr/vm/finalizerthread.cpp
@@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
{
GetFinalizerThread()->SetBackground(TRUE);
- {
- GCX_PREEMP();
- EnsureYieldProcessorNormalizedInitialized();
- }
-
while (!fQuitFinalizer)
{
// This will apply any policy for swallowing exceptions during normal
diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp
index 2c55f8770b0..c6485b86d59 100644
--- a/src/coreclr/vm/threads.cpp
+++ b/src/coreclr/vm/threads.cpp
@@ -1150,8 +1150,6 @@ void InitThreadManager()
}
CONTRACTL_END;
- InitializeYieldProcessorNormalizedCrst();
-
// All patched helpers should fit into one page.
// If you hit this assert on retail build, there is most likely problem with BBT script.
_ASSERTE_ALL_BUILDS("clr/src/VM/threads.cpp", (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart > (ptrdiff_t)0);
@@ -7194,6 +7192,7 @@ BOOL Thread::HaveExtraWorkForFinalizer()
|| Thread::CleanupNeededForFinalizedThread()
|| (m_DetachCount > 0)
|| SystemDomain::System()->RequireAppDomainCleanup()
+ || YieldProcessorNormalization::IsMeasurementScheduled()
|| ThreadStore::s_pThreadStore->ShouldTriggerGCForDeadThreads();
}
@@ -7240,6 +7239,12 @@ void Thread::DoExtraWorkForFinalizer()
// If there were any TimerInfos waiting to be released, they'll get flushed now
ThreadpoolMgr::FlushQueueOfTimerInfos();
+ if (YieldProcessorNormalization::IsMeasurementScheduled())
+ {
+ GCX_PREEMP();
+ YieldProcessorNormalization::PerformMeasurement();
+ }
+
ThreadStore::s_pThreadStore->TriggerGCForDeadThreadsIfNecessary();
}
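
Taken together with the eventtrace.cpp and finalizerthread.cpp hunks, the measurement work is funneled through the finalizer thread rather than being done inline on a spinning thread. A condensed view of the handshake, paraphrased from this patch rather than quoted literally:

    // Any spin-wait user constructing YieldProcessorNormalizationInfo:
    //   YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
    //     - returns early if normalization previously failed, if a measurement ran within
    //       the last MeasurementPeriodMs (4 s), or if the EE has not started
    //     - otherwise sets s_isMeasurementScheduled and calls FinalizerThread::EnableFinalization()
    //
    // Finalizer thread:
    //   Thread::HaveExtraWorkForFinalizer()   // now also reports IsMeasurementScheduled()
    //   Thread::DoExtraWorkForFinalizer()
    //     - switches to preemptive mode (GCX_PREEMP) and calls
    //       YieldProcessorNormalization::PerformMeasurement(), which re-measures ns-per-yield
    //       and republishes s_yieldsPerNormalizedYield and
    //       s_optimalMaxNormalizedYieldsPerSpinIteration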
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 91547923310..2c51e73b678 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -2,17 +2,33 @@
// The .NET Foundation licenses this file to you under the MIT license.
#include "common.h"
+#include "yieldprocessornormalized.h"
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+#ifndef CROSSGEN_COMPILE
-void InitializeYieldProcessorNormalizedCrst()
+#include "finalizerthread.h"
+
+enum class NormalizationState : UINT8
{
- WRAPPER_NO_CONTRACT;
- s_initializeYieldProcessorNormalizedCrst.Init(CrstLeafLock);
-}
+ Uninitialized,
+ Initialized,
+ Failed
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-static void InitializeYieldProcessorNormalized()
+static unsigned int DetermineMeasureDurationUs()
{
CONTRACTL
{
@@ -22,92 +38,271 @@ static void InitializeYieldProcessorNormalized()
}
CONTRACTL_END;
- CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
+ _ASSERTE(s_normalizationState != NormalizationState::Failed);
- if (s_isYieldProcessorNormalizedInitialized)
+ // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+ // if the overhead seems high relative to the measure duration.
+ unsigned int measureDurationUs = 1;
+ LARGE_INTEGER li;
+ QueryPerformanceCounter(&li);
+ UINT64 startTicks = li.QuadPart;
+ QueryPerformanceCounter(&li);
+ UINT64 elapsedTicks = li.QuadPart - startTicks;
+ if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
{
- return;
+ measureDurationUs *= 4;
+ }
+ return measureDurationUs;
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_PREEMPTIVE;
}
+ CONTRACTL_END;
- // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
- // Intel post-Skylake processor: measured typically 125-150 cycles per yield
- const int MeasureDurationMs = 10;
- const int NsPerSecond = 1000 * 1000 * 1000;
+ _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+ int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
+ UINT64 ticksPerS = s_performanceCounterTicksPerS;
+ UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
LARGE_INTEGER li;
- if (!QueryPerformanceFrequency(&li) || (ULONGLONG)li.QuadPart < 1000 / MeasureDurationMs)
+ QueryPerformanceCounter(&li);
+ UINT64 startTicks = li.QuadPart;
+
+ for (int i = 0; i < yieldCount; ++i)
{
- // High precision clock not available or clock resolution is too low, resort to defaults
- s_isYieldProcessorNormalizedInitialized = true;
- return;
+ System_YieldProcessor();
}
- ULONGLONG ticksPerSecond = li.QuadPart;
- // Measure the nanosecond delay per yield
- ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
- unsigned int yieldCount = 0;
QueryPerformanceCounter(&li);
- ULONGLONG startTicks = li.QuadPart;
- ULONGLONG elapsedTicks;
- do
- {
- // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
- // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
- // low microsecond range.
- for (int i = 0; i < 1000; ++i)
+ UINT64 elapsedTicks = li.QuadPart - startTicks;
+ while (elapsedTicks < measureDurationTicks)
+ {
+ int nextYieldCount =
+ Max(4,
+ elapsedTicks == 0
+ ? yieldCount / 4
+ : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+ for (int i = 0; i < nextYieldCount; ++i)
{
System_YieldProcessor();
}
- yieldCount += 1000;
QueryPerformanceCounter(&li);
- ULONGLONG nowTicks = li.QuadPart;
- elapsedTicks = nowTicks - startTicks;
- } while (elapsedTicks < measureDurationTicks);
- double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
- if (nsPerYield < 1)
+ elapsedTicks = li.QuadPart - startTicks;
+ yieldCount += nextYieldCount;
+ }
+
+ // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+ const double MinNsPerYield = 0.1;
+
+ // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+ // really take this long. Limit the maximum to keep the recorded values reasonable.
+ const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+ return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+ CONTRACTL
+ {
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_PREEMPTIVE;
+ }
+ CONTRACTL_END;
+
+ _ASSERTE(s_isMeasurementScheduled);
+
+ double latestNsPerYield;
+ if (s_normalizationState == NormalizationState::Initialized)
{
- nsPerYield = 1;
+ if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+ {
+ return;
+ }
+
+ int nextMeasurementIndex = s_nextMeasurementIndex;
+ latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+ AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
+ if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+ {
+ nextMeasurementIndex = 0;
+ }
+ s_nextMeasurementIndex = nextMeasurementIndex;
}
+ else if (s_normalizationState == NormalizationState::Uninitialized)
+ {
+ LARGE_INTEGER li;
+ if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+ {
+ // High precision clock not available or clock resolution is too low, resort to defaults
+ s_normalizationState = NormalizationState::Failed;
+ return;
+ }
+ s_performanceCounterTicksPerS = li.QuadPart;
+
+ unsigned int measureDurationUs = DetermineMeasureDurationUs();
+ for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+ {
+ latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+ AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+ if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
+ {
+ AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+ }
- // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
- // value is naturally limited to MinNsPerNormalizedYield.
- int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
- if (yieldsPerNormalizedYield < 1)
+ if (i < NsPerYieldMeasurementCount - 1)
+ {
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+ }
+ }
+ }
+ else
{
- yieldsPerNormalizedYield = 1;
+ _ASSERTE(s_normalizationState == NormalizationState::Failed);
+ return;
}
- _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+
+ double establishedNsPerYield = s_nsPerYieldMeasurements[0];
+ for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+ {
+ double nsPerYield = s_nsPerYieldMeasurements[i];
+ if (nsPerYield < establishedNsPerYield)
+ {
+ establishedNsPerYield = nsPerYield;
+ }
+ }
+ if (establishedNsPerYield != s_establishedNsPerYield)
+ {
+ AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+ }
+
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+ // Calculate the number of yields required to span the duration of a normalized yield
+ unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+ _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+ s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
- int optimalMaxNormalizedYieldsPerSpinIteration =
- (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
- if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+ s_optimalMaxNormalizedYieldsPerSpinIteration =
+ Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+ _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+ GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+ s_previousNormalizationTimeMs = GetTickCount();
+ s_normalizationState = NormalizationState::Initialized;
+ s_isMeasurementScheduled = false;
+}
+
+#endif // !CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+ CONTRACTL
{
- optimalMaxNormalizedYieldsPerSpinIteration = 1;
+ NOTHROW;
+ GC_NOTRIGGER;
+ MODE_ANY;
}
+ CONTRACTL_END;
- g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
- g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
- s_isYieldProcessorNormalizedInitialized = true;
+#ifndef CROSSGEN_COMPILE
+ NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+ if (normalizationState == NormalizationState::Initialized)
+ {
+ if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+ {
+ return;
+ }
+ }
+ else if (normalizationState == NormalizationState::Uninitialized)
+ {
+ }
+ else
+ {
+ _ASSERTE(normalizationState == NormalizationState::Failed);
+ return;
+ }
- GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+ // The !g_fEEStarted check is needed because FinalizerThread::EnableFinalization() below requires a started EE
+ if (s_isMeasurementScheduled || !g_fEEStarted)
+ {
+ return;
+ }
+
+ s_isMeasurementScheduled = true;
+ FinalizerThread::EnableFinalization();
+#endif // !CROSSGEN_COMPILE
}
-void EnsureYieldProcessorNormalizedInitialized()
+#ifndef CROSSGEN_COMPILE
+
+void YieldProcessorNormalization::FireMeasurementEvents()
{
CONTRACTL
{
NOTHROW;
GC_NOTRIGGER;
- MODE_PREEMPTIVE;
+ MODE_ANY;
}
CONTRACTL_END;
- if (!s_isYieldProcessorNormalizedInitialized)
+ if (!EventEnabledYieldProcessorMeasurement())
{
- InitializeYieldProcessorNormalized();
+ return;
}
+
+ // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+ // recorded information, so try to enumerate the array with some care.
+ double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
+ int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
+ for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+ {
+ double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+ if (nsPerYield != 0) // the array may not be fully initialized yet
+ {
+ FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+ }
+
+ if (++nextIndex >= NsPerYieldMeasurementCount)
+ {
+ nextIndex = 0;
+ }
+ }
+}
+
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
+{
+ WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+ return VolatileLoadWithoutBarrier(valueRef);
+#else
+ return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
}
+
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+ WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+ *valueRef = value;
+#else
+ InterlockedExchangeT(valueRef, value);
+#endif
+}
+
+#endif // !CROSSGEN_COMPILE
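
At its core, PerformMeasurement() above estimates nanoseconds per yield by spinning on the pause instruction and dividing elapsed time by the number of yields executed, with QueryPerformanceCounter supplying the clock. A standalone analogue of just that idea, using std::chrono and _mm_pause instead; this is an illustration under those substitutions, not the runtime's code, and it is x86-specific because of _mm_pause:

    #include <chrono>
    #include <cstdio>
    #include <immintrin.h> // _mm_pause

    // Rough standalone estimate of nanoseconds per pause/yield instruction.
    static double MeasureNsPerPause(long long minYields = 1000000)
    {
        using namespace std::chrono;
        auto start = steady_clock::now();
        long long yields = 0;
        do
        {
            // Batch the yields so the timer overhead is amortized over many pauses.
            for (int i = 0; i < 1000; ++i)
            {
                _mm_pause();
            }
            yields += 1000;
        } while (yields < minYields);
        auto elapsedNs = duration_cast<nanoseconds>(steady_clock::now() - start).count();
        return (double)elapsedNs / (double)yields;
    }

    int main()
    {
        // Very roughly: a few ns per pause on pre-Skylake x86 and ~30-45 ns on Skylake and later,
        // which is why the runtime normalizes spin counts instead of assuming a fixed per-yield cost.
        std::printf("ns per pause: %.2f\n", MeasureNsPerPause());
        return 0;
    }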
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
index e1c0766b3f0..8e8198de392 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/LowLevelSpinWaiter.cs
@@ -71,10 +71,6 @@ namespace System.Threading
// the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
// allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
// usually better for that.
- //
- // Thread.OptimalMaxSpinWaitsPerSpinIteration:
- // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
- //
int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
if (spinIndex <= 30 && (1 << spinIndex) < n)
{
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
index b45cc7d5d38..66b73f8be02 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/SpinWait.cs
@@ -225,10 +225,6 @@ namespace System.Threading
// the equivalent of YieldProcessor(), as at that point SwitchToThread/Sleep(0) are more likely to be able to
// allow other useful work to run. Long YieldProcessor() loops can help to reduce contention, but Sleep(1) is
// usually better for that.
- //
- // Thread.OptimalMaxSpinWaitsPerSpinIteration:
- // - See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
- //
int n = Thread.OptimalMaxSpinWaitsPerSpinIteration;
if (_count <= 30 && (1 << _count) < n)
{