diff options
author | Thales Sabino <thales@codeplay.com> | 2020-05-27 18:26:52 +0300 |
---|---|---|
committer | Thales Sabino <thales@codeplay.com> | 2020-06-05 12:15:47 +0300 |
commit | a46f83364e71bb2ae7e9b54d3029858cd8c8f815 (patch) | |
tree | 236e9ba9a8ff2258c818ecd8f4f6b90e063bb2ce /TracyOpenCL.hpp | |
parent | 6793b34fb53b77fbce8d1b8a6f9f51af51fa12af (diff) |
Add OpenCL trace support
- Adds the file TracyOpenCL.hpp which contains the API to annotate OpenCL applications
- It works in a similar fashion to the Vulkan annotations
- Adds an example OpenCL application in examples/OpenCLVectorAdd
- Adds "OpenCL Context" to the UI
- Manual entry for annotating OpenCL zones
Diffstat (limited to 'TracyOpenCL.hpp')
-rw-r--r-- | TracyOpenCL.hpp | 333 |
1 files changed, 333 insertions, 0 deletions
diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp new file mode 100644 index 00000000..77c7de98 --- /dev/null +++ b/TracyOpenCL.hpp @@ -0,0 +1,333 @@ +#ifndef __TRACYOPENCL_HPP__ +#define __TRACYOPENCL_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyCLContext(x, y) nullptr +#define TracyCLDestroy(x) +#define TracyCLNamedZone(c, x, y, z, w) +#define TracyCLNamedZoneC(c, x, y, z, w, a) +#define TracyCLZone(c, x, y) +#define TracyCLZoneC(c, x, y, z) +#define TracyCLCollect(c) + +#define TracyCLNamedZoneS(c, x, y, z, w, a) +#define TracyCLNamedZoneCS(c, x, y, z, w, v, a) +#define TracyCLZoneS(c, x, y, z) +#define TracyCLZoneCS(c, x, y, z, w) + +namespace tracy +{ + class OpenCLCtxScope {}; +} + +using TracyCLCtx = void*; + +#else + +#include <CL/cl.h> + +#include <atomic> +#include <cassert> + +#include "Tracy.hpp" +#include "client/TracyCallstack.hpp" +#include "client/TracyProfiler.hpp" +#include "common/TracyAlloc.hpp" + +namespace tracy { + + enum class EventPhase : uint8_t + { + Begin, + End + }; + + struct EventInfo + { + cl_event event; + EventPhase phase; + }; + + class OpenCLCtx + { + public: + enum { QueryCount = 64 * 1024 }; + + OpenCLCtx(cl_context context, cl_device_id device) + : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + , m_head(0) + , m_tail(0) + { + assert(m_contextId != 255); + + m_hostStartTime = Profiler::GetTime(); + m_deviceStartTime = GetDeviceTimestamp(context, device); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime); + MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1.0f); + MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL); + MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId); + MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + + if (m_tail == m_head) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_head = m_tail = 0; + } +#endif + + while (m_tail != m_head) + { + EventInfo eventInfo = m_query[m_tail]; + cl_event event = eventInfo.event; + cl_int eventStatus; + cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + if (eventStatus != CL_COMPLETE) return; + + cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin) + ? CL_PROFILING_COMMAND_START + : CL_PROFILING_COMMAND_END; + + cl_ulong eventTimeStamp = 0; + err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr); + assert(err == CL_SUCCESS); + assert(eventTimeStamp != 0); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp)); + MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + + if (eventInfo.phase == EventPhase::End) + { + // Done with the event, so release it + assert(clReleaseEvent(event) == CL_SUCCESS); + } + + m_tail = (m_tail + 1) % QueryCount; + } + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + + tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo) + { + const auto id = m_head; + m_head = (m_head + 1) % QueryCount; + assert(m_head != m_tail); + m_query[id] = eventInfo; + return id; + } + + tracy_force_inline EventInfo& GetQuery(unsigned int id) + { + assert(id < QueryCount); + return m_query[id]; + } + + private: + tracy_force_inline int64_t GetHostStartTime() const + { + return m_hostStartTime; + } + + tracy_force_inline int64_t GetDeviceStartTime() const + { + return m_deviceStartTime; + } + + tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const + { + return m_hostStartTime + (deviceTimestamp - m_deviceStartTime); + } + + tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const + { + cl_ulong deviceTimestamp = 0; + cl_int err = CL_SUCCESS; + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + assert(err == CL_SUCCESS); + uint32_t dummyValue = 42; + cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err); + assert(err == CL_SUCCESS); + cl_event writeBufferEvent; + err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent); + assert(err == CL_SUCCESS); + err = clWaitForEvents(1, &writeBufferEvent); + assert(err == CL_SUCCESS); + cl_int eventStatus; + err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + assert(err == CL_SUCCESS); + assert(eventStatus == CL_COMPLETE); + err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr); + assert(err == CL_SUCCESS); + err = clReleaseEvent(writeBufferEvent); + assert(err == CL_SUCCESS); + err = clReleaseMemObject(dummyBuffer); + assert(err == CL_SUCCESS); + err = clReleaseCommandQueue(queue); + assert(err == CL_SUCCESS); + + return (int64_t)deviceTimestamp; + } + + unsigned int m_contextId; + + EventInfo m_query[QueryCount]; + unsigned int m_head; + unsigned int m_tail; + + int64_t m_hostStartTime; + int64_t m_deviceStartTime; + }; + + class OpenCLCtxScope { + public: + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + + GetProfiler().SendCallstack(depth); + } + + tracy_force_inline void SetEvent(cl_event event) + { + m_event = event; + assert(clRetainEvent(m_event) == CL_SUCCESS); + m_ctx->GetQuery(m_beginQueryId).event = m_event; + } + + tracy_force_inline ~OpenCLCtxScope() + { + const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + OpenCLCtx* m_ctx; + cl_event m_event; + unsigned int m_beginQueryId; + }; + + static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) + { + InitRPMallocThread(); + auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); + new (ctx) OpenCLCtx(context, device); + return ctx; + } + + static inline void DestroyCLContext(OpenCLCtx* ctx) + { + ctx->~OpenCLCtx(); + tracy_free(ctx); + } + +} // namespace tracy + +using TracyCLCtx = tracy::OpenCLCtx*; + +#define TracyCLContext(context, device) tracy::CreateCLContext(context, device); +#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true) +#else +# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active); +# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true ) +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active); +# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true) +#else +#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active) +#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active) +#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name) +#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color) +#endif + +#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event) +#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event) + +#define TracyCLCollect(ctx) ctx->Collect() + +#endif + +#endif |