// Copyright (c) 2016, libnumaapi authors // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. // // Author: Sergey Sharybin (sergey.vfx@gmail.com) #include "build_config.h" #if OS_WIN #include "numaapi.h" #ifndef NOGDI # define NOGDI #endif #ifndef NOMINMAX # define NOMINMAX #endif #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif #ifndef NOCOMM # define NOCOMM #endif #include #include #include #if ARCH_CPU_64_BITS # include #endif //////////////////////////////////////////////////////////////////////////////// // Initialization. // Kernel library, from where the symbols come. static HMODULE kernel_lib; // Types of all symbols which are read from the library. // NUMA function types. typedef BOOL t_GetNumaHighestNodeNumber(PULONG highest_node_number); typedef BOOL t_GetNumaNodeProcessorMask(UCHAR node, ULONGLONG* processor_mask); typedef BOOL t_GetNumaNodeProcessorMaskEx(USHORT node, GROUP_AFFINITY* processor_mask); typedef BOOL t_GetNumaProcessorNode(UCHAR processor, UCHAR* node_number); typedef void* t_VirtualAllocExNuma(HANDLE process_handle, LPVOID address, SIZE_T size, DWORD allocation_type, DWORD protect, DWORD preferred); typedef BOOL t_VirtualFree(void* address, SIZE_T size, DWORD free_type); // Threading function types. typedef BOOL t_SetProcessAffinityMask(HANDLE process_handle, DWORD_PTR process_affinity_mask); typedef BOOL t_SetThreadGroupAffinity(HANDLE thread_handle, const GROUP_AFFINITY* group_affinity, GROUP_AFFINITY* PreviousGroupAffinity); typedef BOOL t_GetThreadGroupAffinity(HANDLE thread_handle, GROUP_AFFINITY* group_affinity); typedef DWORD t_GetCurrentProcessorNumber(void); typedef void t_GetCurrentProcessorNumberEx(PROCESSOR_NUMBER* proc_number); typedef DWORD t_GetActiveProcessorCount(WORD group_number); // NUMA symbols. static t_GetNumaHighestNodeNumber* _GetNumaHighestNodeNumber; static t_GetNumaNodeProcessorMask* _GetNumaNodeProcessorMask; static t_GetNumaNodeProcessorMaskEx* _GetNumaNodeProcessorMaskEx; static t_GetNumaProcessorNode* _GetNumaProcessorNode; static t_VirtualAllocExNuma* _VirtualAllocExNuma; static t_VirtualFree* _VirtualFree; // Threading symbols. static t_SetProcessAffinityMask* _SetProcessAffinityMask; static t_SetThreadGroupAffinity* _SetThreadGroupAffinity; static t_GetThreadGroupAffinity* _GetThreadGroupAffinity; static t_GetCurrentProcessorNumber* _GetCurrentProcessorNumber; static t_GetCurrentProcessorNumberEx* _GetCurrentProcessorNumberEx; static t_GetActiveProcessorCount* _GetActiveProcessorCount; static void numaExit(void) { // TODO(sergey): Consider closing library here. } static NUMAAPI_Result loadNumaSymbols(void) { // Prevent multiple initializations. static bool initialized = false; static NUMAAPI_Result result = NUMAAPI_NOT_AVAILABLE; if (initialized) { return result; } initialized = true; // Register de-initialization. const int error = atexit(numaExit); if (error) { result = NUMAAPI_ERROR_ATEXIT; return result; } // Load library. kernel_lib = LoadLibraryA("Kernel32.dll"); // Load symbols. #define _LIBRARY_FIND(lib, name) \ do { \ _##name = (t_##name *)GetProcAddress(lib, #name); \ } while (0) #define KERNEL_LIBRARY_FIND(name) _LIBRARY_FIND(kernel_lib, name) // NUMA. KERNEL_LIBRARY_FIND(GetNumaHighestNodeNumber); KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMask); KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMaskEx); KERNEL_LIBRARY_FIND(GetNumaProcessorNode); KERNEL_LIBRARY_FIND(VirtualAllocExNuma); KERNEL_LIBRARY_FIND(VirtualFree); // Threading. KERNEL_LIBRARY_FIND(SetProcessAffinityMask); KERNEL_LIBRARY_FIND(SetThreadGroupAffinity); KERNEL_LIBRARY_FIND(GetThreadGroupAffinity); KERNEL_LIBRARY_FIND(GetCurrentProcessorNumber); KERNEL_LIBRARY_FIND(GetCurrentProcessorNumberEx); KERNEL_LIBRARY_FIND(GetActiveProcessorCount); #undef KERNEL_LIBRARY_FIND #undef _LIBRARY_FIND result = NUMAAPI_SUCCESS; return result; } NUMAAPI_Result numaAPI_Initialize(void) { #if !ARCH_CPU_64_BITS // No NUMA on 32 bit platforms. return NUMAAPI_NOT_AVAILABLE; #else if (!IsWindows7OrGreater()) { // Require Windows 7 or higher. NUMAAPI_NOT_AVAILABLE; } loadNumaSymbols(); return NUMAAPI_SUCCESS; #endif } //////////////////////////////////////////////////////////////////////////////// // Internal helpers. static int countNumSetBits(ULONGLONG mask) { // TODO(sergey): There might be faster way calculating number of set bits. // NOTE: mask must be unsigned, there is undefined behavior for signed ints. int num_bits = 0; while (mask != 0) { num_bits += (mask & 1); mask = (mask >> 1); } return num_bits; } //////////////////////////////////////////////////////////////////////////////// // Topology query. int numaAPI_GetNumNodes(void) { ULONG highest_node_number; if (!_GetNumaHighestNodeNumber(&highest_node_number)) { return 0; } // TODO(sergey): Resolve the type narrowing. // NOTE: This is not necessarily a total amount of nodes in the system. return (int)highest_node_number + 1; } bool numaAPI_IsNodeAvailable(int node) { // Trick to detect whether the node is usable or not: check whether // there are any processors associated with it. // // This is needed because numaApiGetNumNodes() is not guaranteed to // give total amount of nodes and some nodes might be unavailable. ULONGLONG processor_mask; if (!_GetNumaNodeProcessorMask(node, &processor_mask)) { return false; } if (processor_mask == 0) { return false; } return true; } int numaAPI_GetNumNodeProcessors(int node) { ULONGLONG processor_mask; if (!_GetNumaNodeProcessorMask(node, &processor_mask)) { return 0; } return countNumSetBits(processor_mask); } //////////////////////////////////////////////////////////////////////////////// // Topology helpers. int numaAPI_GetNumCurrentNodesProcessors(void) { HANDLE thread_handle = GetCurrentThread(); GROUP_AFFINITY group_affinity; // TODO(sergey): Needs implementation. if (!_GetThreadGroupAffinity(thread_handle, &group_affinity)) { return 0; } // First, count number of possible bits in the affinity mask. const int num_processors = countNumSetBits(group_affinity.Mask); // Then check that it's not exceeding number of processors in tjhe group. const int num_group_processors = _GetActiveProcessorCount(group_affinity.Group); if (num_group_processors < num_processors) { return num_group_processors; } return num_processors; } //////////////////////////////////////////////////////////////////////////////// // Affinities. bool numaAPI_RunProcessOnNode(int node) { // TODO(sergey): Make sure requested node is within active CPU group. // Change affinity of the proces to make it to run on a given node. HANDLE process_handle = GetCurrentProcess(); ULONGLONG processor_mask; if (_GetNumaNodeProcessorMask(node, &processor_mask) == 0) { return false; } if (_SetProcessAffinityMask(process_handle, processor_mask) == 0) { return false; } return true; } bool numaAPI_RunThreadOnNode(int node) { HANDLE thread_handle = GetCurrentThread(); GROUP_AFFINITY group_affinity = { 0 }; if (_GetNumaNodeProcessorMaskEx(node, &group_affinity) == 0) { return false; } if (_SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) { return false; } return true; } //////////////////////////////////////////////////////////////////////////////// // Memory management. void* numaAPI_AllocateOnNode(size_t size, int node) { return _VirtualAllocExNuma(GetCurrentProcess(), NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, node); } void* numaAPI_AllocateLocal(size_t size) { UCHAR current_processor = (UCHAR)_GetCurrentProcessorNumber(); UCHAR node; if (!_GetNumaProcessorNode(current_processor, &node)) { return NULL; } return numaAPI_AllocateOnNode(size, node); } void numaAPI_Free(void* start, size_t size) { if (!_VirtualFree(start, size, MEM_RELEASE)) { // TODO(sergey): Throw an error! } } #endif // OS_WIN