/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#ifndef __BLI_TASK_H__
#define __BLI_TASK_H__

#include <string.h> /* for memset() */

struct Link;
struct ListBase;

/** \file
 * \ingroup bli
 */

#ifdef __cplusplus
extern "C" {
#endif

#include "BLI_threads.h"
#include "BLI_utildefines.h"

struct BLI_mempool;

/* Task Scheduler
 *
 * Central scheduler that holds running threads ready to execute tasks.
 * A single queue holds the tasks from all pools.
 *
 * Init/exit must be called before/after any task pools are created/freed,
 * and must be called from the main thread. All other scheduler and pool
 * functions are thread-safe. */

typedef struct TaskScheduler TaskScheduler;

enum {
  TASK_SCHEDULER_AUTO_THREADS = 0,
  TASK_SCHEDULER_SINGLE_THREAD = 1,
};

TaskScheduler *BLI_task_scheduler_create(int num_threads);
void BLI_task_scheduler_free(TaskScheduler *scheduler);

int BLI_task_scheduler_num_threads(TaskScheduler *scheduler);

/* Task Pool
 *
 * Pool of tasks that will be executed by the central TaskScheduler. For each
 * pool, we can wait for all tasks to be done, or cancel them before they are
 * done.
 *
 * Running tasks may spawn new tasks.
 *
 * Pools may be nested, i.e. a thread running a task can create another task
 * pool with smaller tasks. When other threads are busy they will continue
 * working on their own tasks; if not, they will join in. No new threads will
 * be launched. */
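/* A minimal usage sketch of the pool API declared below, for illustration
 * only: the square_task and example_squares functions are assumptions, not
 * part of this header; any scheduler created with BLI_task_scheduler_create()
 * can be passed in.
 *
 * \code
 * static void square_task(TaskPool *__restrict pool, void *taskdata, int threadid)
 * {
 *   // Each task squares one integer in place; pool and threadid are unused.
 *   int *value = (int *)taskdata;
 *   *value = (*value) * (*value);
 *   (void)pool;
 *   (void)threadid;
 * }
 *
 * static void example_squares(TaskScheduler *scheduler)
 * {
 *   int values[4] = {1, 2, 3, 4};
 *   TaskPool *pool = BLI_task_pool_create(scheduler, NULL);
 *   for (int i = 0; i < 4; i++) {
 *     // taskdata stays owned by the caller, so free_taskdata is false.
 *     BLI_task_pool_push(pool, square_task, &values[i], false, TASK_PRIORITY_HIGH);
 *   }
 *   BLI_task_pool_work_and_wait(pool); // Blocks until every pushed task ran.
 *   BLI_task_pool_free(pool);
 * }
 * \endcode
 */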
typedef enum TaskPriority {
  TASK_PRIORITY_LOW,
  TASK_PRIORITY_HIGH,
} TaskPriority;

typedef struct TaskPool TaskPool;
typedef void (*TaskRunFunction)(TaskPool *__restrict pool, void *taskdata, int threadid);
typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata, int threadid);

TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata);
TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata);
TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata);
void BLI_task_pool_free(TaskPool *pool);

void BLI_task_pool_push_ex(TaskPool *pool,
                           TaskRunFunction run,
                           void *taskdata,
                           bool free_taskdata,
                           TaskFreeFunction freedata,
                           TaskPriority priority);
void BLI_task_pool_push(
    TaskPool *pool, TaskRunFunction run, void *taskdata, bool free_taskdata, TaskPriority priority);
void BLI_task_pool_push_from_thread(TaskPool *pool,
                                    TaskRunFunction run,
                                    void *taskdata,
                                    bool free_taskdata,
                                    TaskPriority priority,
                                    int thread_id);

/* work and wait until all tasks are done */
void BLI_task_pool_work_and_wait(TaskPool *pool);
/* work and wait until all tasks are done, then reset to the initial suspended state */
void BLI_task_pool_work_wait_and_reset(TaskPool *pool);
/* cancel all tasks, keep worker threads running */
void BLI_task_pool_cancel(TaskPool *pool);

/* for worker threads, test if canceled */
bool BLI_task_pool_canceled(TaskPool *pool);

/* optional userdata pointer to pass along to run function */
void *BLI_task_pool_userdata(TaskPool *pool);

/* optional mutex to use from run function */
ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool);

/* Delayed push: use this to reduce threading overhead by accumulating all
 * new tasks into a local queue first, then pushing them to the scheduler
 * from within a single mutex lock. */
void BLI_task_pool_delayed_push_begin(TaskPool *pool, int thread_id);
void BLI_task_pool_delayed_push_end(TaskPool *pool, int thread_id);

/* Parallel for routines */

typedef enum eTaskSchedulingMode {
  /* Task scheduler will divide overall work into equal chunks, scheduling
   * even chunks to all worker threads.
   * Lowest run-time overhead, ideal for cases when each task requires an
   * equal amount of compute power. */
  TASK_SCHEDULING_STATIC,
  /* Task scheduler will schedule small amounts of work to each worker
   * thread. Has more run-time overhead, but deals much better with cases
   * when different parts of the work require very different amounts of
   * compute power. */
  TASK_SCHEDULING_DYNAMIC,
} eTaskSchedulingMode;

/* Per-thread specific data passed to the callback. */
typedef struct ParallelRangeTLS {
  /* Identifier of the thread that this data belongs to. */
  int thread_id;
  /* Copy of the user-specified chunk, which is copied from the original
   * chunk to all worker threads. This is similar to OpenMP's firstprivate. */
  void *userdata_chunk;
} ParallelRangeTLS;

typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
                                      const int iter,
                                      const ParallelRangeTLS *__restrict tls);
typedef void (*TaskParallelRangeFuncFinalize)(void *__restrict userdata,
                                              void *__restrict userdata_chunk);

typedef struct ParallelRangeSettings {
  /* Whether the caller allows this particular range to be threaded.
   * Usually set by some heuristic which forces threading off when the
   * threading overhead would exceed the speed benefit.
   * When enabled, BLI_task_parallel_range() itself will always use threading
   * once the range exceeds the chunk size. */
  bool use_threading;
  /* Scheduling mode to use for this parallel range invocation. */
  eTaskSchedulingMode scheduling_mode;
  /* Each instance of looping chunks will get a copy of this data
   * (similar to OpenMP's firstprivate). */
  void *userdata_chunk;       /* Pointer to actual data. */
  size_t userdata_chunk_size; /* Size of that data. */
  /* Function called from the calling thread once the whole range has been
   * processed. */
  TaskParallelRangeFuncFinalize func_finalize;
  /* Minimum allowed number of range iterations to be handled by a single
   * thread. This makes it possible to:
   * - Reduce the amount of threading overhead.
   * - Partially occupy the thread pool with ranges which are computationally
   *   expensive, but smaller than the number of available threads. For
   *   example, a [0 .. 64] range can be multi-threaded into 4 threads which
   *   will handle 16 iterations each.
   * This is the preferred way to tell the scheduler when to start threading,
   * rather than a global use_threading switch based on just the range
   * size. */
  int min_iter_per_thread;
} ParallelRangeSettings;

BLI_INLINE void BLI_parallel_range_settings_defaults(ParallelRangeSettings *settings);

void BLI_task_parallel_range(const int start,
                             const int stop,
                             void *userdata,
                             TaskParallelRangeFunc func,
                             const ParallelRangeSettings *settings);
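/* A minimal usage sketch of the parallel range API above, for illustration
 * only: the double_elem and example_doubling functions are assumptions, not
 * part of this header, and the min_iter_per_thread value of 64 is an
 * arbitrary example choice.
 *
 * \code
 * static void double_elem(void *__restrict userdata,
 *                         const int iter,
 *                         const ParallelRangeTLS *__restrict tls)
 * {
 *   // Each iteration doubles one float; no per-thread chunk is used here.
 *   float *array = (float *)userdata;
 *   array[iter] *= 2.0f;
 *   (void)tls;
 * }
 *
 * static void example_doubling(float *array, int len)
 * {
 *   ParallelRangeSettings settings;
 *   BLI_parallel_range_settings_defaults(&settings);
 *   // Keep small ranges single-threaded to avoid threading overhead.
 *   settings.min_iter_per_thread = 64;
 *   BLI_task_parallel_range(0, len, array, double_elem, &settings);
 * }
 * \endcode
 */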
typedef void (*TaskParallelListbaseFunc)(void *userdata, struct Link *iter, int index);
void BLI_task_parallel_listbase(struct ListBase *listbase,
                                void *userdata,
                                TaskParallelListbaseFunc func,
                                const bool use_threading);

typedef struct MempoolIterData MempoolIterData;
typedef void (*TaskParallelMempoolFunc)(void *userdata, MempoolIterData *iter);
void BLI_task_parallel_mempool(struct BLI_mempool *mempool,
                               void *userdata,
                               TaskParallelMempoolFunc func,
                               const bool use_threading);

/* TODO(sergey): Think of a better place for this. */
BLI_INLINE void BLI_parallel_range_settings_defaults(ParallelRangeSettings *settings)
{
  memset(settings, 0, sizeof(*settings));
  settings->use_threading = true;
  settings->scheduling_mode = TASK_SCHEDULING_STATIC;
  /* NOTE: Current value mimics old behavior, but it's not ideal by any
   * means. Would be cool to find a common value which will work well enough
   * for both static and dynamic scheduling. */
  settings->min_iter_per_thread = 1;
}

#ifdef __cplusplus
}
#endif

#endif /* __BLI_TASK_H__ */