/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2016, Blender Foundation.
 */

/** \file
 * \ingroup draw
 */

/* Private functions / structs of the draw manager */

#ifndef __DRAW_MANAGER_H__
#define __DRAW_MANAGER_H__

#include "DRW_engine.h"
#include "DRW_render.h"

#include "BLI_assert.h"
#include "BLI_linklist.h"
#include "BLI_memblock.h"
#include "BLI_task.h"
#include "BLI_threads.h"

#include "GPU_batch.h"
#include "GPU_context.h"
#include "GPU_framebuffer.h"
#include "GPU_shader.h"
#include "GPU_uniformbuffer.h"
#include "GPU_viewport.h"

#include "draw_instance_data.h"

/* Use draw manager to call GPU_select, see: DRW_draw_select_loop */
#define USE_GPU_SELECT

/* Use drawcall batching using instanced rendering. */
#define USE_BATCHING 1

// #define DRW_DEBUG_CULLING
#define DRW_DEBUG_USE_UNIFORM_NAME 0
#define DRW_UNIFORM_BUFFER_NAME 64

/* ------------ Profiling --------------- */

#define USE_PROFILE

#ifdef USE_PROFILE
#  include "PIL_time.h"

#  define PROFILE_TIMER_FALLOFF 0.04

#  define PROFILE_START(time_start) \
    double time_start = PIL_check_seconds_timer(); \
    ((void)0)

#  define PROFILE_END_ACCUM(time_accum, time_start) \
    { \
      time_accum += (PIL_check_seconds_timer() - time_start) * 1e3; \
    } \
    ((void)0)

/* exp average */
#  define PROFILE_END_UPDATE(time_update, time_start) \
    { \
      double _time_delta = (PIL_check_seconds_timer() - time_start) * 1e3; \
      time_update = (time_update * (1.0 - PROFILE_TIMER_FALLOFF)) + \
                    (_time_delta * PROFILE_TIMER_FALLOFF); \
    } \
    ((void)0)

#else /* USE_PROFILE */

#  define PROFILE_START(time_start) ((void)0)
#  define PROFILE_END_ACCUM(time_accum, time_start) ((void)0)
#  define PROFILE_END_UPDATE(time_update, time_start) ((void)0)

#endif /* USE_PROFILE */

/* ------------ Data Structure --------------- */
/**
 * Data structure containing all drawcalls organized by passes and materials.
 * DRWPass > DRWShadingGroup > DRWCall > DRWCallState
 *                           > DRWUniform
 */

typedef struct DRWCullingState {
  uint32_t mask;
  /* Culling: Using Bounding Sphere for now for faster culling.
   * Not ideal for planes. Could be extended. */
  BoundSphere bsphere;
  /* Grrr only used by EEVEE. */
  void *user_data;
} DRWCullingState;

/* Minimum max UBO size is 64KiB. We take the largest
 * UBO struct and alloc the max number.
 * ((1 << 16) / sizeof(DRWObjectMatrix)) = 512
 * Keep in sync with common_view_lib.glsl */
#define DRW_RESOURCE_CHUNK_LEN 512

/**
 * Identifier used to sort similar drawcalls together.
 * Also used to reference elements inside memory blocks.
 *
 * From MSB to LSB
 * 1 bit for negative scale.
 * 22 bits for chunk id.
 * 9 bits for resource id inside the chunk. (can go up to 511)
 * |-|----------------------|---------|
 *
 * Use manual bit-shift and mask instead of bit-fields to avoid
 * compiler dependent behavior that would mess the ordering of
 * the members thus changing the sorting order.
 */
typedef uint32_t DRWResourceHandle;

BLI_INLINE uint32_t DRW_handle_negative_scale_get(const DRWResourceHandle *handle)
{
  return (*handle & 0x80000000) != 0;
}

BLI_INLINE uint32_t DRW_handle_chunk_get(const DRWResourceHandle *handle)
{
  return (*handle & 0x7FFFFFFF) >> 9;
}

BLI_INLINE uint32_t DRW_handle_id_get(const DRWResourceHandle *handle)
{
  return (*handle & 0x000001FF);
}

BLI_INLINE void DRW_handle_increment(DRWResourceHandle *handle)
{
  *handle += 1;
}

BLI_INLINE void DRW_handle_negative_scale_enable(DRWResourceHandle *handle)
{
  *handle |= 0x80000000;
}

BLI_INLINE void *DRW_memblock_elem_from_handle(struct BLI_memblock *memblock,
                                               const DRWResourceHandle *handle)
{
  int elem = DRW_handle_id_get(handle);
  int chunk = DRW_handle_chunk_get(handle);
  return BLI_memblock_elem_get(memblock, chunk, elem);
}

typedef struct DRWObjectMatrix {
  float model[4][4];
  float modelinverse[4][4];
} DRWObjectMatrix;

typedef struct DRWObjectInfos {
  float orcotexfac[2][4];
  float ob_color[4];
  float ob_index;
  float pad; /* UNUSED*/
  float ob_random;
  float ob_flag; /* sign is negative scaling,  */
} DRWObjectInfos;

BLI_STATIC_ASSERT_ALIGN(DRWObjectMatrix, 16)
BLI_STATIC_ASSERT_ALIGN(DRWObjectInfos, 16)

typedef enum {
  /* Draw Commands */
  DRW_CMD_DRAW = 0, /* Only sortable type. Must be 0. */
  DRW_CMD_DRAW_RANGE = 1,
  DRW_CMD_DRAW_INSTANCE = 2,
  DRW_CMD_DRAW_INSTANCE_RANGE = 3,
  DRW_CMD_DRAW_PROCEDURAL = 4,
  /* Other Commands */
  DRW_CMD_CLEAR = 12,
  DRW_CMD_DRWSTATE = 13,
  DRW_CMD_STENCIL = 14,
  DRW_CMD_SELECTID = 15,
  /* Needs to fit in 4bits */
} eDRWCommandType;

#define DRW_MAX_DRAW_CMD_TYPE DRW_CMD_DRAW_PROCEDURAL

typedef struct DRWCommandDraw {
  GPUBatch *batch;
  DRWResourceHandle handle;
} DRWCommandDraw;

/* Assume DRWResourceHandle to be 0. */
typedef struct DRWCommandDrawRange {
  GPUBatch *batch;
  DRWResourceHandle handle;
  uint vert_first;
  uint vert_count;
} DRWCommandDrawRange;

typedef struct DRWCommandDrawInstance {
  GPUBatch *batch;
  DRWResourceHandle handle;
  uint inst_count;
  uint use_attrs; /* bool */
} DRWCommandDrawInstance;

typedef struct DRWCommandDrawInstanceRange {
  GPUBatch *batch;
  DRWResourceHandle handle;
  uint inst_first;
  uint inst_count;
} DRWCommandDrawInstanceRange;

typedef struct DRWCommandDrawProcedural {
  GPUBatch *batch;
  DRWResourceHandle handle;
  uint vert_count;
} DRWCommandDrawProcedural;

typedef struct DRWCommandSetMutableState {
  /** State changes (or'd or and'd with the pass's state) */
  DRWState enable;
  DRWState disable;
} DRWCommandSetMutableState;

typedef struct DRWCommandSetStencil {
  uint write_mask;
  uint comp_mask;
  uint ref;
} DRWCommandSetStencil;

typedef struct DRWCommandSetSelectID {
  GPUVertBuf *select_buf;
  uint select_id;
} DRWCommandSetSelectID;

typedef struct DRWCommandClear {
  eGPUFrameBufferBits clear_channels;
  uchar r, g, b, a; /* [0..1] for each channels. Normalized. */
  float depth;      /* [0..1] for depth. Normalized. */
  uchar stencil;    /* Stencil value [0..255] */
} DRWCommandClear;

typedef union DRWCommand {
  DRWCommandDraw draw;
  DRWCommandDrawRange range;
  DRWCommandDrawInstance instance;
  DRWCommandDrawInstanceRange instance_range;
  DRWCommandDrawProcedural procedural;
  DRWCommandSetMutableState state;
  DRWCommandSetStencil stencil;
  DRWCommandSetSelectID select_id;
  DRWCommandClear clear;
} DRWCommand;

/* Used for agregating calls into GPUVertBufs. */
struct DRWCallBuffer {
  GPUVertBuf *buf;
  GPUVertBuf *buf_select;
  int count;
};

/* Used by DRWUniform.type */
typedef enum {
  DRW_UNIFORM_INT = 0,
  DRW_UNIFORM_INT_COPY,
  DRW_UNIFORM_FLOAT,
  DRW_UNIFORM_FLOAT_COPY,
  DRW_UNIFORM_TEXTURE,
  DRW_UNIFORM_TEXTURE_REF,
  DRW_UNIFORM_BLOCK,
  DRW_UNIFORM_BLOCK_REF,
  DRW_UNIFORM_TFEEDBACK_TARGET,
  /** Per drawcall uniforms/UBO */
  DRW_UNIFORM_BLOCK_OBMATS,
  DRW_UNIFORM_BLOCK_OBINFOS,
  DRW_UNIFORM_RESOURCE_CHUNK,
  DRW_UNIFORM_RESOURCE_ID,
  /** Legacy / Fallback */
  DRW_UNIFORM_BASE_INSTANCE,
  DRW_UNIFORM_MODEL_MATRIX,
  DRW_UNIFORM_MODEL_MATRIX_INVERSE,
  /* WARNING: set DRWUniform->type
   * bit length accordingly. */
} DRWUniformType;

struct DRWUniform {
  union {
    /* For reference or array/vector types. */
    const void *pvalue;
    /* DRW_UNIFORM_TEXTURE */
    struct {
      union {
        GPUTexture *texture;
        GPUTexture **texture_ref;
      };
      eGPUSamplerState sampler_state;
    };
    /* DRW_UNIFORM_BLOCK */
    union {
      GPUUniformBuffer *block;
      GPUUniformBuffer **block_ref;
    };
    /* DRW_UNIFORM_FLOAT_COPY */
    float fvalue[4];
    /* DRW_UNIFORM_INT_COPY */
    int ivalue[4];
  };
  int location;      /* Uniform location or binding point for textures and ubos. */
  uint8_t type;      /* DRWUniformType */
  uint8_t length;    /* Length of vector types. */
  uint8_t arraysize; /* Array size of scalar/vector types. */
};

struct DRWShadingGroup {
  DRWShadingGroup *next;

  GPUShader *shader;                /* Shader to bind */
  struct DRWUniformChunk *uniforms; /* Uniforms pointers */

  struct {
    /* Chunks of draw calls. */
    struct DRWCommandChunk *first, *last;
  } cmd;

  union {
    /* This struct is used during cache populate. */
    struct {
      int objectinfo;                /* Equal to 1 if the shader needs obinfos. */
      DRWResourceHandle pass_handle; /* Memblock key to parent pass. */
    };
    /* This struct is used after cache populate if using the Z sorting.
     * It will not conflict with the above struct. */
    struct {
      float distance;      /* Distance from camera. */
      uint original_index; /* Original position inside the shgroup list. */
    } z_sorting;
  };
};

#define MAX_PASS_NAME 32

struct DRWPass {
  /* Linked list */
  struct {
    DRWShadingGroup *first;
    DRWShadingGroup *last;
  } shgroups;

  /* Draw the shgroups of this pass instead.
   * This avoid duplicating drawcalls/shgroups
   * for similar passes. */
  DRWPass *original;
  /* Link list of additional passes to render. */
  DRWPass *next;

  DRWResourceHandle handle;
  DRWState state;
  char name[MAX_PASS_NAME];
};

/* keep in sync with viewBlock */
typedef struct DRWViewUboStorage {
  /* View matrices */
  float persmat[4][4];
  float persinv[4][4];
  float viewmat[4][4];
  float viewinv[4][4];
  float winmat[4][4];
  float wininv[4][4];

  float clipplanes[6][4];
  /* Should not be here. Not view dependent (only main view). */
  float viewcamtexcofac[4];
} DRWViewUboStorage;

BLI_STATIC_ASSERT_ALIGN(DRWViewUboStorage, 16)

#define MAX_CULLED_VIEWS 32

struct DRWView {
  /** Parent view if this is a sub view. NULL otherwise. */
  struct DRWView *parent;

  DRWViewUboStorage storage;
  /** Number of active clipplanes. */
  int clip_planes_len;
  /** Does culling result needs to be updated. */
  bool is_dirty;
  /** Does facing needs to be reversed? */
  bool is_inverted;
  /** Culling */
  uint32_t culling_mask;
  BoundBox frustum_corners;
  BoundSphere frustum_bsphere;
  float frustum_planes[6][4];
  /** Custom visibility function. */
  DRWCallVisibilityFn *visibility_fn;
  void *user_data;
};

/* ------------ Data Chunks --------------- */
/**
 * In order to keep a cache friendly data structure,
 * we alloc most of our little data into chunks of multiple item.
 * Iteration, allocation and memory usage are better.
 * We loose a bit of memory by allocating more than what we need
 * but it's counterbalanced by not needing the linked-list pointers
 * for each item.
 **/

typedef struct DRWUniformChunk {
  struct DRWUniformChunk *next; /* single-linked list */
  uint32_t uniform_len;
  uint32_t uniform_used;
  DRWUniform uniforms[10];
} DRWUniformChunk;

typedef struct DRWCommandChunk {
  struct DRWCommandChunk *next;
  uint32_t command_len;
  uint32_t command_used;
  /* 4bits for each command. */
  uint64_t command_type[6];
  /* -- 64 bytes aligned -- */
  DRWCommand commands[96];
  /* -- 64 bytes aligned -- */
} DRWCommandChunk;

typedef struct DRWCommandSmallChunk {
  struct DRWCommandChunk *next;
  uint32_t command_len;
  uint32_t command_used;
  /* 4bits for each command. */
  /* TODO reduce size of command_type. */
  uint64_t command_type[6];
  DRWCommand commands[6];
} DRWCommandSmallChunk;

/* Only true for 64-bit platforms. */
#ifdef __LP64__
BLI_STATIC_ASSERT_ALIGN(DRWCommandChunk, 16);
#endif

/* ------------- DRAW DEBUG ------------ */

typedef struct DRWDebugLine {
  struct DRWDebugLine *next; /* linked list */
  float pos[2][3];
  float color[4];
} DRWDebugLine;

typedef struct DRWDebugSphere {
  struct DRWDebugSphere *next; /* linked list */
  float mat[4][4];
  float color[4];
} DRWDebugSphere;

/* ------------- DRAW MANAGER ------------ */

#define DST_MAX_SLOTS 64  /* Cannot be changed without modifying RST.bound_tex_slots */
#define MAX_CLIP_PLANES 6 /* GL_MAX_CLIP_PLANES is at least 6 */
#define STENCIL_UNDEFINED 256
#define DRW_DRAWLIST_LEN 256
typedef struct DRWManager {
  /* TODO clean up this struct a bit */
  /* Cache generation */
  ViewportMemoryPool *vmempool;
  DRWInstanceDataList *idatalist;
  /* State of the object being evaluated if already allocated. */
  DRWResourceHandle ob_handle;
  /** True if current DST.ob_state has its matching DRWObjectInfos init. */
  bool ob_state_obinfo_init;
  /** Handle of current object resource in object resource arrays (DRWObjectMatrices/Infos). */
  DRWResourceHandle resource_handle;
  /** Handle of next DRWPass to be allocated. */
  DRWResourceHandle pass_handle;

  /** Dupli state. NULL if not dupli. */
  struct DupliObject *dupli_source;
  struct Object *dupli_parent;
  struct Object *dupli_origin;
  /** Ghash containing original objects. */
  struct GHash *dupli_ghash;
  /** TODO(fclem) try to remove usage of this. */
  DRWInstanceData *object_instance_data[MAX_INSTANCE_DATA_SIZE];
  /* Array of dupli_data (one for each enabled engine) to handle duplis. */
  void **dupli_datas;

  /* Rendering state */
  GPUShader *shader;
  GPUBatch *batch;

  /* Managed by `DRW_state_set`, `DRW_state_reset` */
  DRWState state;
  DRWState state_lock;

  /* Per viewport */
  GPUViewport *viewport;
  struct GPUFrameBuffer *default_framebuffer;
  float size[2];
  float inv_size[2];
  float screenvecs[2][3];
  float pixsize;

  struct {
    uint is_select : 1;
    uint is_depth : 1;
    uint is_image_render : 1;
    uint is_scene_render : 1;
    uint do_color_management : 1;
    uint draw_background : 1;
    uint draw_text : 1;
  } options;

  /* Current rendering context */
  DRWContextState draw_ctx;

  /* Convenience pointer to text_store owned by the viewport */
  struct DRWTextStore **text_store_p;

  ListBase enabled_engines; /* RenderEngineType */
  void **vedata_array;      /* ViewportEngineData */
  int enabled_engine_count; /* Length of enabled_engines list. */

  bool buffer_finish_called; /* Avoid bad usage of DRW_render_instance_buffer_finish */

  DRWView *view_default;
  DRWView *view_active;
  DRWView *view_previous;
  uint primary_view_ct;
  /** TODO(fclem) Remove this. Only here to support
   * shaders without common_view_lib.glsl */
  DRWViewUboStorage view_storage_cpy;

#ifdef USE_GPU_SELECT
  uint select_id;
#endif

  struct TaskGraph *task_graph;

  /* ---------- Nothing after this point is cleared after use ----------- */

  /* gl_context serves as the offset for clearing only
   * the top portion of the struct so DO NOT MOVE IT! */
  /** Unique ghost context used by the draw manager. */
  void *gl_context;
  GPUContext *gpu_context;
  /** Mutex to lock the drw manager and avoid concurrent context usage. */
  TicketMutex *gl_context_mutex;

  GPUDrawList *draw_list;

  struct {
    /* TODO(fclem) optimize: use chunks. */
    DRWDebugLine *lines;
    DRWDebugSphere *spheres;
  } debug;
} DRWManager;

extern DRWManager DST; /* TODO: get rid of this and allow multi-threaded rendering. */

/* --------------- FUNCTIONS ------------- */

void drw_texture_set_parameters(GPUTexture *tex, DRWTextureFlag flags);

void *drw_viewport_engine_data_ensure(void *engine_type);

void drw_state_set(DRWState state);

void drw_debug_draw(void);
void drw_debug_init(void);

eDRWCommandType command_type_get(uint64_t *command_type_bits, int index);

void drw_batch_cache_validate(Object *ob);
void drw_batch_cache_generate_requested(struct Object *ob);

void drw_resource_buffer_finish(ViewportMemoryPool *vmempool);

/* Procedural Drawing */
GPUBatch *drw_cache_procedural_points_get(void);
GPUBatch *drw_cache_procedural_lines_get(void);
GPUBatch *drw_cache_procedural_triangles_get(void);

#endif /* __DRAW_MANAGER_H__ */