/* SPDX-License-Identifier: GPL-2.0-or-later */

/* Use a define instead of `#pragma once` because of `bmesh_iterators_inline.h` */
#ifndef __BLI_TASK_H__
#define __BLI_TASK_H__

#include <string.h> /* for memset() */

struct ListBase;

/** \file
 * \ingroup bli
 */

#include "BLI_threads.h"
#include "BLI_utildefines.h"

#ifdef __cplusplus
extern "C" {
#endif

struct BLI_mempool;

/* -------------------------------------------------------------------- */
/** \name Task Scheduler
 *
 * Central scheduler that holds running threads ready to execute tasks.
 * A single queue holds the task from all pools.
 *
 * Initialize/exit must be called before/after any task pools are created/freed, and must
 * be called from the main threads. All other scheduler and pool functions are thread-safe.
 * \{ */

void BLI_task_scheduler_init(void);
void BLI_task_scheduler_exit(void);
int BLI_task_scheduler_num_threads(void);

/** \} */

/* -------------------------------------------------------------------- */
/** \name Task Pool
 *
 * Pool of tasks that will be executed by the central task scheduler. For each
 * pool, we can wait for all tasks to be done, or cancel them before they are
 * done.
 *
 * Running tasks may spawn new tasks.
 *
 * Pools may be nested, i.e. a thread running a task can create another task
 * pool with smaller tasks. When other threads are busy they will continue
 * working on their own tasks, if not they will join in, no new threads will
 * be launched.
 * \{ */

typedef enum eTaskPriority {
  TASK_PRIORITY_LOW,
  TASK_PRIORITY_HIGH,
} eTaskPriority;

typedef struct TaskPool TaskPool;
typedef void (*TaskRunFunction)(TaskPool *__restrict pool, void *taskdata);
typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata);

/**
 * Regular task pool that immediately starts executing tasks as soon as they
 * are pushed, either on the current or another thread.
 */
TaskPool *BLI_task_pool_create(void *userdata, eTaskPriority priority);

/**
 * Background: always run tasks in a background thread, never immediately
 * execute them. For running background jobs.
 */
TaskPool *BLI_task_pool_create_background(void *userdata, eTaskPriority priority);

/**
 * Background Serial: run tasks one after the other in the background,
 * without parallelization between the tasks.
 */
TaskPool *BLI_task_pool_create_background_serial(void *userdata, eTaskPriority priority);

/**
 * Suspended: don't execute tasks until work_and_wait is called. This is slower
 * as threads can't immediately start working. But it can be used if the data
 * structures the threads operate on are not fully initialized until all tasks are created.
 */
TaskPool *BLI_task_pool_create_suspended(void *userdata, eTaskPriority priority);

/**
 * No threads: immediately executes tasks on the same thread. For debugging.
 */
TaskPool *BLI_task_pool_create_no_threads(void *userdata);

void BLI_task_pool_free(TaskPool *pool);

void BLI_task_pool_push(TaskPool *pool,
                        TaskRunFunction run,
                        void *taskdata,
                        bool free_taskdata,
                        TaskFreeFunction freedata);

/**
 * Work and wait until all tasks are done.
 */
void BLI_task_pool_work_and_wait(TaskPool *pool);
/**
 * Cancel all tasks, keep worker threads running.
 */
void BLI_task_pool_cancel(TaskPool *pool);

/**
 * For worker threads, test if current task pool canceled. this function may
 * only be called from worker threads and pool must be the task pool that the
 * thread is currently executing a task from.
 */
bool BLI_task_pool_current_canceled(TaskPool *pool);

/**
 * Optional `userdata` pointer to pass along to run function.
 */
void *BLI_task_pool_user_data(TaskPool *pool);

/**
 * Optional mutex to use from run function.
 */
ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool);

/** \} */

/* -------------------------------------------------------------------- */
/** \name Parallel for Routines
 * \{ */

/**
 * Per-thread specific data passed to the callback.
 */
typedef struct TaskParallelTLS {
  /**
   * Copy of user-specifier chunk, which is copied from original chunk to all worker threads.
   * This is similar to OpenMP's `firstprivate`.
   */
  void *userdata_chunk;
} TaskParallelTLS;

typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
                                      int iter,
                                      const TaskParallelTLS *__restrict tls);

typedef void (*TaskParallelInitFunc)(const void *__restrict userdata, void *__restrict chunk);

typedef void (*TaskParallelReduceFunc)(const void *__restrict userdata,
                                       void *__restrict chunk_join,
                                       void *__restrict chunk);

typedef void (*TaskParallelFreeFunc)(const void *__restrict userdata, void *__restrict chunk);

typedef struct TaskParallelSettings {
  /* Whether caller allows to do threading of the particular range.
   * Usually set by some equation, which forces threading off when threading
   * overhead becomes higher than speed benefit.
   * BLI_task_parallel_range() by itself will always use threading when range
   * is higher than a chunk size. As in, threading will always be performed.
   */
  bool use_threading;
  /* Each instance of looping chunks will get a copy of this data
   * (similar to OpenMP's firstprivate).
   */
  void *userdata_chunk;       /* Pointer to actual data. */
  size_t userdata_chunk_size; /* Size of that data. */
  /* Function called from calling thread once whole range have been
   * processed.
   */
  /* Function called to initialize user data chunk,
   * typically to allocate data, freed by `func_free`.
   */
  TaskParallelInitFunc func_init;
  /* Function called to join user data chunk into another, to reduce
   * the result to the original userdata_chunk memory.
   * The reduce functions should have no side effects, so that they
   * can be run on any thread. */
  TaskParallelReduceFunc func_reduce;
  /* Function called to free data created by TaskParallelRangeFunc. */
  TaskParallelFreeFunc func_free;
  /* Minimum allowed number of range iterators to be handled by a single
   * thread. This allows to achieve following:
   * - Reduce amount of threading overhead.
   * - Partially occupy thread pool with ranges which are computationally
   *   expensive, but which are smaller than amount of available threads.
   *   For example, it's possible to multi-thread [0 .. 64] range into 4
   *   thread which will be doing 16 iterators each.
   * This is a preferred way to tell scheduler when to start threading than
   * having a global use_threading switch based on just range size.
   */
  int min_iter_per_thread;
} TaskParallelSettings;

BLI_INLINE void BLI_parallel_range_settings_defaults(TaskParallelSettings *settings);

void BLI_task_parallel_range(int start,
                             int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const TaskParallelSettings *settings);

/**
 * This data is shared between all tasks, its access needs thread lock or similar protection.
 */
typedef struct TaskParallelIteratorStateShared {
  /* Maximum amount of items to acquire at once. */
  int chunk_size;
  /* Next item to be acquired. */
  void *next_item;
  /* Index of the next item to be acquired. */
  int next_index;
  /* Indicates that end of iteration has been reached. */
  bool is_finished;
  /* Helper lock to protect access to this data in iterator getter callback,
   * can be ignored (if the callback implements its own protection system, using atomics e.g.).
   * Will be NULL when iterator is actually processed in a single thread. */
  SpinLock *spin_lock;
} TaskParallelIteratorStateShared;

typedef void (*TaskParallelIteratorIterFunc)(void *__restrict userdata,
                                             const TaskParallelTLS *__restrict tls,
                                             void **r_next_item,
                                             int *r_next_index,
                                             bool *r_do_abort);

typedef void (*TaskParallelIteratorFunc)(void *__restrict userdata,
                                         void *item,
                                         int index,
                                         const TaskParallelTLS *__restrict tls);

/**
 * This function allows to parallelize for loops using a generic iterator.
 *
 * \param userdata: Common userdata passed to all instances of \a func.
 * \param iter_func: Callback function used to generate chunks of items.
 * \param init_item: The initial item, if necessary (may be NULL if unused).
 * \param init_index: The initial index.
 * \param items_num: The total amount of items to iterate over
 *                   (if unknown, set it to a negative number).
 * \param func: Callback function.
 * \param settings: See public API doc of TaskParallelSettings for description of all settings.
 *
 * \note Static scheduling is only available when \a items_num is >= 0.
 */
void BLI_task_parallel_iterator(void *userdata,
                                TaskParallelIteratorIterFunc iter_func,
                                void *init_item,
                                int init_index,
                                int items_num,
                                TaskParallelIteratorFunc func,
                                const TaskParallelSettings *settings);

/**
 * This function allows to parallelize for loops over ListBase items.
 *
 * \param listbase: The double linked list to loop over.
 * \param userdata: Common userdata passed to all instances of \a func.
 * \param func: Callback function.
 * \param settings: See public API doc of ParallelRangeSettings for description of all settings.
 *
 * \note There is no static scheduling here,
 * since it would need another full loop over items to count them.
 */
void BLI_task_parallel_listbase(struct ListBase *listbase,
                                void *userdata,
                                TaskParallelIteratorFunc func,
                                const TaskParallelSettings *settings);

typedef struct MempoolIterData MempoolIterData;

typedef void (*TaskParallelMempoolFunc)(void *userdata,
                                        MempoolIterData *iter,
                                        const TaskParallelTLS *__restrict tls);
/**
 * This function allows to parallelize for loops over Mempool items.
 *
 * \param mempool: The iterable BLI_mempool to loop over.
 * \param userdata: Common userdata passed to all instances of \a func.
 * \param func: Callback function.
 * \param settings: See public API doc of TaskParallelSettings for description of all settings.
 *
 * \note There is no static scheduling here.
 */
void BLI_task_parallel_mempool(struct BLI_mempool *mempool,
                               void *userdata,
                               TaskParallelMempoolFunc func,
                               const TaskParallelSettings *settings);

/** TODO(sergey): Think of a better place for this. */
BLI_INLINE void BLI_parallel_range_settings_defaults(TaskParallelSettings *settings)
{
  memset(settings, 0, sizeof(*settings));
  settings->use_threading = true;
  /* Use default heuristic to define actual chunk size. */
  settings->min_iter_per_thread = 0;
}

BLI_INLINE void BLI_parallel_mempool_settings_defaults(TaskParallelSettings *settings)
{
  memset(settings, 0, sizeof(*settings));
  settings->use_threading = true;
}

/**
 * Don't use this, store any thread specific data in `tls->userdata_chunk` instead.
 * Only here for code to be removed.
 */
int BLI_task_parallel_thread_id(const TaskParallelTLS *tls);

/** \} */

/* -------------------------------------------------------------------- */
/** \name Task Graph Scheduling
 *
 * Task Graphs can be used to create a forest of directional trees and schedule work to any tree.
 * The nodes in the graph can be run in separate threads.
 *
 * \code{.unparsed}
 *     +---- [root] ----+
 *     |                |
 *     v                v
 * [node_1]    +---- [node_2] ----+
 *             |                  |
 *             v                  v
 *          [node_3]           [node_4]
 * \endcode
 *
 * \code{.c}
 * TaskGraph *task_graph = BLI_task_graph_create();
 * TaskNode *root = BLI_task_graph_node_create(task_graph, root_exec, NULL, NULL);
 * TaskNode *node_1 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
 * TaskNode *node_2 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
 * TaskNode *node_3 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
 * TaskNode *node_4 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
 *
 * BLI_task_graph_edge_create(root, node_1);
 * BLI_task_graph_edge_create(root, node_2);
 * BLI_task_graph_edge_create(node_2, node_3);
 * BLI_task_graph_edge_create(node_2, node_4);
 * \endcode
 *
 * Any node can be triggered to start a chain of tasks. Normally you would trigger a root node but
 * it is supported to start the chain of tasks anywhere in the forest or tree. When a node
 * completes, the execution flow is forwarded via the created edges.
 * When a child node has multiple parents the child node will be triggered once for each parent.
 *
 *    `BLI_task_graph_node_push_work(root);`
 *
 * In this example After `root` is finished, `node_1` and `node_2` will be started.
 * Only after `node_2` is finished `node_3` and `node_4` will be started.
 *
 * After scheduling work we need to wait until all the tasks have been finished.
 *
 *    `BLI_task_graph_work_and_wait();`
 *
 * When finished you can clean up all the resources by freeing the task_graph. Nodes are owned by
 * the graph and are freed task_data will only be freed if a free_func was given.
 *
 *    `BLI_task_graph_free(task_graph);`
 *
 * Work can enter a tree on any node. Normally this would be the root_node.
 * A `task_graph` can be reused, but the caller needs to make sure the task_data is reset.
 *
 * Task-Data
 * ---------
 *
 * Typically you want give a task data to work on.
 * Task data can be shared with other nodes, but be careful not to free the data multiple times.
 * Task data is freed when calling #BLI_task_graph_free.
 *
 * \code{.c}
 * MyData *task_data = MEM_callocN(sizeof(MyData), __func__);
 * TaskNode *root = BLI_task_graph_node_create(task_graph, root_exec, task_data, MEM_freeN);
 * TaskNode *node_1 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
 * TaskNode *node_2 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
 * TaskNode *node_3 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
 * TaskNode *node_4 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
 * \endcode
 * \{ */

struct TaskGraph;
struct TaskNode;

typedef void (*TaskGraphNodeRunFunction)(void *__restrict task_data);
typedef void (*TaskGraphNodeFreeFunction)(void *task_data);

struct TaskGraph *BLI_task_graph_create(void);
void BLI_task_graph_work_and_wait(struct TaskGraph *task_graph);
void BLI_task_graph_free(struct TaskGraph *task_graph);
struct TaskNode *BLI_task_graph_node_create(struct TaskGraph *task_graph,
                                            TaskGraphNodeRunFunction run,
                                            void *user_data,
                                            TaskGraphNodeFreeFunction free_func);
bool BLI_task_graph_node_push_work(struct TaskNode *task_node);
void BLI_task_graph_edge_create(struct TaskNode *from_node, struct TaskNode *to_node);

/** \} */

/* -------------------------------------------------------------------- */
/** \name Task Isolation
 *
 * Task isolation helps avoid unexpected task scheduling decisions that can lead to bugs if wrong
 * assumptions were made. Typically that happens when doing "nested threading", i.e. one thread
 * schedules a bunch of main-tasks and those spawn new sub-tasks.
 *
 * What can happen is that when a main-task waits for its sub-tasks to complete on other threads,
 * another main-task is scheduled within the already running main-task. Generally, this is good,
 * because it leads to better performance. However, sometimes code (often unintentionally) makes
 * the assumption that at most one main-task runs on a thread at a time.
 *
 * The bugs often show themselves in two ways:
 * - Deadlock, when a main-task holds a mutex while waiting for its sub-tasks to complete.
 * - Data corruption, when a main-task makes wrong assumptions about a thread-local variable.
 *
 * Task isolation can avoid these bugs by making sure that a main-task does not start executing
 * another main-task while waiting for its sub-tasks. More precisely, a function that runs in an
 * isolated region is only allowed to run sub-tasks that were spawned in the same isolated region.
 *
 * Unfortunately, incorrect use of task isolation can lead to deadlocks itself. This can happen
 * when threading primitives are used that separate spawning tasks from executing them. The problem
 * occurs when a task is spawned in one isolated region while the tasks are waited for in another
 * isolated region. In this setup, the thread that is waiting for the spawned tasks to complete
 * cannot run the tasks itself. On a single thread, that causes a deadlock already. When there are
 * multiple threads, another thread will typically run the task and avoid the deadlock. However, if
 * this situation happens on all threads at the same time, all threads will deadlock. This happened
 * in T88598.
 * \{ */

void BLI_task_isolate(void (*func)(void *userdata), void *userdata);

/** \} */

#ifdef __cplusplus
}
#endif

#endif