Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/wolfpld/tracy.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThales Sabino <thales@codeplay.com>2020-05-27 18:26:52 +0300
committerThales Sabino <thales@codeplay.com>2020-06-05 12:15:47 +0300
commita46f83364e71bb2ae7e9b54d3029858cd8c8f815 (patch)
tree236e9ba9a8ff2258c818ecd8f4f6b90e063bb2ce /TracyOpenCL.hpp
parent6793b34fb53b77fbce8d1b8a6f9f51af51fa12af (diff)
Add OpenCL trace support
- Adds the file TracyOpenCL.hpp which contains the API to annotate OpenCL applications - It works in a similar fashion to the Vulkan annotations - Adds an example OpenCL application in examples/OpenCLVectorAdd - Adds "OpenCL Context" to the UI - Manual entry for annotating OpenCL zones
Diffstat (limited to 'TracyOpenCL.hpp')
-rw-r--r--TracyOpenCL.hpp333
1 files changed, 333 insertions, 0 deletions
diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp
new file mode 100644
index 00000000..77c7de98
--- /dev/null
+++ b/TracyOpenCL.hpp
@@ -0,0 +1,333 @@
+#ifndef __TRACYOPENCL_HPP__
+#define __TRACYOPENCL_HPP__
+
+#if !defined TRACY_ENABLE
+
+#define TracyCLContext(x, y) nullptr
+#define TracyCLDestroy(x)
+#define TracyCLNamedZone(c, x, y, z, w)
+#define TracyCLNamedZoneC(c, x, y, z, w, a)
+#define TracyCLZone(c, x, y)
+#define TracyCLZoneC(c, x, y, z)
+#define TracyCLCollect(c)
+
+#define TracyCLNamedZoneS(c, x, y, z, w, a)
+#define TracyCLNamedZoneCS(c, x, y, z, w, v, a)
+#define TracyCLZoneS(c, x, y, z)
+#define TracyCLZoneCS(c, x, y, z, w)
+
+namespace tracy
+{
+ class OpenCLCtxScope {};
+}
+
+using TracyCLCtx = void*;
+
+#else
+
+#include <CL/cl.h>
+
+#include <atomic>
+#include <cassert>
+
+#include "Tracy.hpp"
+#include "client/TracyCallstack.hpp"
+#include "client/TracyProfiler.hpp"
+#include "common/TracyAlloc.hpp"
+
+namespace tracy {
+
+ enum class EventPhase : uint8_t
+ {
+ Begin,
+ End
+ };
+
+ struct EventInfo
+ {
+ cl_event event;
+ EventPhase phase;
+ };
+
+ class OpenCLCtx
+ {
+ public:
+ enum { QueryCount = 64 * 1024 };
+
+ OpenCLCtx(cl_context context, cl_device_id device)
+ : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
+ , m_head(0)
+ , m_tail(0)
+ {
+ assert(m_contextId != 255);
+
+ m_hostStartTime = Profiler::GetTime();
+ m_deviceStartTime = GetDeviceTimestamp(context, device);
+
+ auto item = Profiler::QueueSerial();
+ MemWrite(&item->hdr.type, QueueType::GpuNewContext);
+ MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime);
+ MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime);
+ memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
+ MemWrite(&item->gpuNewContext.period, 1.0f);
+ MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL);
+ MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId);
+ MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0);
+#ifdef TRACY_ON_DEMAND
+ GetProfiler().DeferItem(*item);
+#endif
+ Profiler::QueueSerialFinish();
+ }
+
+ void Collect()
+ {
+ ZoneScopedC(Color::Red4);
+
+ if (m_tail == m_head) return;
+
+#ifdef TRACY_ON_DEMAND
+ if (!GetProfiler().IsConnected())
+ {
+ m_head = m_tail = 0;
+ }
+#endif
+
+ while (m_tail != m_head)
+ {
+ EventInfo eventInfo = m_query[m_tail];
+ cl_event event = eventInfo.event;
+ cl_int eventStatus;
+ cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+ assert(err == CL_SUCCESS);
+ if (eventStatus != CL_COMPLETE) return;
+
+ cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
+ ? CL_PROFILING_COMMAND_START
+ : CL_PROFILING_COMMAND_END;
+
+ cl_ulong eventTimeStamp = 0;
+ err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
+ assert(err == CL_SUCCESS);
+ assert(eventTimeStamp != 0);
+
+ auto item = Profiler::QueueSerial();
+ MemWrite(&item->hdr.type, QueueType::GpuTime);
+ MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp));
+ MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
+ MemWrite(&item->gpuTime.context, m_contextId);
+ Profiler::QueueSerialFinish();
+
+ if (eventInfo.phase == EventPhase::End)
+ {
+ // Done with the event, so release it
+ assert(clReleaseEvent(event) == CL_SUCCESS);
+ }
+
+ m_tail = (m_tail + 1) % QueryCount;
+ }
+ }
+
+ tracy_force_inline uint8_t GetId() const
+ {
+ return m_contextId;
+ }
+
+ tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo)
+ {
+ const auto id = m_head;
+ m_head = (m_head + 1) % QueryCount;
+ assert(m_head != m_tail);
+ m_query[id] = eventInfo;
+ return id;
+ }
+
+ tracy_force_inline EventInfo& GetQuery(unsigned int id)
+ {
+ assert(id < QueryCount);
+ return m_query[id];
+ }
+
+ private:
+ tracy_force_inline int64_t GetHostStartTime() const
+ {
+ return m_hostStartTime;
+ }
+
+ tracy_force_inline int64_t GetDeviceStartTime() const
+ {
+ return m_deviceStartTime;
+ }
+
+ tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const
+ {
+ return m_hostStartTime + (deviceTimestamp - m_deviceStartTime);
+ }
+
+ tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const
+ {
+ cl_ulong deviceTimestamp = 0;
+ cl_int err = CL_SUCCESS;
+ cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+ assert(err == CL_SUCCESS);
+ uint32_t dummyValue = 42;
+ cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
+ assert(err == CL_SUCCESS);
+ cl_event writeBufferEvent;
+ err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
+ assert(err == CL_SUCCESS);
+ err = clWaitForEvents(1, &writeBufferEvent);
+ assert(err == CL_SUCCESS);
+ cl_int eventStatus;
+ err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+ assert(err == CL_SUCCESS);
+ assert(eventStatus == CL_COMPLETE);
+ err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr);
+ assert(err == CL_SUCCESS);
+ err = clReleaseEvent(writeBufferEvent);
+ assert(err == CL_SUCCESS);
+ err = clReleaseMemObject(dummyBuffer);
+ assert(err == CL_SUCCESS);
+ err = clReleaseCommandQueue(queue);
+ assert(err == CL_SUCCESS);
+
+ return (int64_t)deviceTimestamp;
+ }
+
+ unsigned int m_contextId;
+
+ EventInfo m_query[QueryCount];
+ unsigned int m_head;
+ unsigned int m_tail;
+
+ int64_t m_hostStartTime;
+ int64_t m_deviceStartTime;
+ };
+
+ class OpenCLCtxScope {
+ public:
+ tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active)
+#ifdef TRACY_ON_DEMAND
+ : m_active(is_active&& GetProfiler().IsConnected())
+#else
+ : m_active(is_active)
+#endif
+ , m_ctx(ctx)
+ , m_event(nullptr)
+ {
+ if (!m_active) return;
+
+ m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+ auto item = Profiler::QueueSerial();
+ MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
+ MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+ MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+ MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+ MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+ MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+ Profiler::QueueSerialFinish();
+ }
+
+ tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active)
+#ifdef TRACY_ON_DEMAND
+ : m_active(is_active&& GetProfiler().IsConnected())
+#else
+ : m_active(is_active)
+#endif
+ , m_ctx(ctx)
+ , m_event(nullptr)
+ {
+ if (!m_active) return;
+
+ m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+ auto item = Profiler::QueueSerial();
+ MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
+ MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+ MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+ MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+ MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+ MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+ Profiler::QueueSerialFinish();
+
+ GetProfiler().SendCallstack(depth);
+ }
+
+ tracy_force_inline void SetEvent(cl_event event)
+ {
+ m_event = event;
+ assert(clRetainEvent(m_event) == CL_SUCCESS);
+ m_ctx->GetQuery(m_beginQueryId).event = m_event;
+ }
+
+ tracy_force_inline ~OpenCLCtxScope()
+ {
+ const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End });
+
+ auto item = Profiler::QueueSerial();
+ MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
+ MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
+ MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
+ MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId);
+ MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
+ Profiler::QueueSerialFinish();
+ }
+
+ const bool m_active;
+ OpenCLCtx* m_ctx;
+ cl_event m_event;
+ unsigned int m_beginQueryId;
+ };
+
+ static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
+ {
+ InitRPMallocThread();
+ auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
+ new (ctx) OpenCLCtx(context, device);
+ return ctx;
+ }
+
+ static inline void DestroyCLContext(OpenCLCtx* ctx)
+ {
+ ctx->~OpenCLCtx();
+ tracy_free(ctx);
+ }
+
+} // namespace tracy
+
+using TracyCLCtx = tracy::OpenCLCtx*;
+
+#define TracyCLContext(context, device) tracy::CreateCLContext(context, device);
+#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx);
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true)
+# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true)
+#else
+# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true)
+# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true )
+#endif
+
+#ifdef TRACY_HAS_CALLSTACK
+# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true)
+# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true)
+#else
+#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active)
+#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active)
+#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name)
+#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color)
+#endif
+
+#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event)
+#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event)
+
+#define TracyCLCollect(ctx) ctx->Collect()
+
+#endif
+
+#endif