Cleanup: refactor to make number of channels for shader evaluation variable

author: Brecht Van Lommel <brecht@blender.org> 2021-10-13 20:13:35 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-10-15 16:42:44 +0300
commit: 2ba7c3aa650c3c795d903a24998204f67c75b017 (patch)
tree: ef80c7cadbe59d1062dd75818baad4d8ad594bcb /intern
parent: 70376154a0b09dc05fcc5bd79c33fdf7c6acbd9a (diff)
10 files changed, 53 insertions, 36 deletions
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 54b18308544..b5f0d873f30 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -54,7 +54,7 @@ class CPUKernels {
   /* Shader evaluation. */
 
   using ShaderEvalFunction = CPUKernelFunction<void (*)(
-      const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+      const KernelGlobals *kg, const KernelShaderEvalInput *, float *, const int)>;
 
   ShaderEvalFunction shader_eval_displace;
   ShaderEvalFunction shader_eval_background;
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index a14e41ec5be..53546c03872 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -34,9 +34,10 @@ ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), pr
 }
 
 bool ShaderEval::eval(const ShaderEvalType type,
-                      const int max_num_points,
+                      const int max_num_inputs,
+                      const int num_channels,
                       const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
-                      const function<void(device_vector<float4> &)> &read_output)
+                      const function<void(device_vector<float> &)> &read_output)
 {
   bool first_device = true;
   bool success = true;
@@ -50,26 +51,27 @@ bool ShaderEval::eval(const ShaderEvalType type,
     first_device = false;
 
     device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
-    device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+    device_vector<float> output(device, "ShaderEval output", MEM_READ_WRITE);
 
     /* Allocate and copy device buffers. */
     DCHECK_EQ(input.device, device);
     DCHECK_EQ(output.device, device);
     DCHECK_LE(output.size(), input.size());
 
-    input.alloc(max_num_points);
+    input.alloc(max_num_inputs);
     int num_points = fill_input(input);
     if (num_points == 0) {
       return;
     }
 
     input.copy_to_device();
-    output.alloc(num_points);
+    output.alloc(num_points * num_channels);
     output.zero_to_device();
 
     /* Evaluate on CPU or GPU. */
-    success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
-                                                  eval_gpu(device, type, input, output);
+    success = (device->info.type == DEVICE_CPU) ?
+                  eval_cpu(device, type, input, output, num_points) :
+                  eval_gpu(device, type, input, output, num_points);
 
     /* Copy data back from device if not canceled. */
     if (success) {
@@ -87,7 +89,8 @@ bool ShaderEval::eval(const ShaderEvalType type,
 bool ShaderEval::eval_cpu(Device *device,
                           const ShaderEvalType type,
                           device_vector<KernelShaderEvalInput> &input,
-                          device_vector<float4> &output)
+                          device_vector<float> &output,
+                          const int64_t work_size)
 {
   vector<CPUKernelThreadGlobals> kernel_thread_globals;
   device->get_cpu_kernel_thread_globals(kernel_thread_globals);
@@ -96,9 +99,8 @@ bool ShaderEval::eval_cpu(Device *device,
   const CPUKernels &kernels = *(device->get_cpu_kernels());
 
   /* Simple parallel_for over all work items. */
-  const int64_t work_size = output.size();
   KernelShaderEvalInput *input_data = input.data();
-  float4 *output_data = output.data();
+  float *output_data = output.data();
   bool success = true;
 
   tbb::task_arena local_arena(device->info.cpu_threads);
@@ -130,7 +132,8 @@ bool ShaderEval::eval_cpu(Device *device,
 bool ShaderEval::eval_gpu(Device *device,
                           const ShaderEvalType type,
                           device_vector<KernelShaderEvalInput> &input,
-                          device_vector<float4> &output)
+                          device_vector<float> &output,
+                          const int64_t work_size)
 {
   /* Find required kernel function. */
   DeviceKernel kernel;
@@ -151,7 +154,6 @@ bool ShaderEval::eval_gpu(Device *device,
    * TODO : query appropriate size from device.*/
   const int64_t chunk_size = 65536;
 
-  const int64_t work_size = output.size();
   void *d_input = (void *)input.device_pointer;
   void *d_output = (void *)output.device_pointer;
 
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
index 7dbf334b8d7..013fad17d4f 100644
--- a/intern/cycles/integrator/shader_eval.h
+++ b/intern/cycles/integrator/shader_eval.h
@@ -40,19 +40,22 @@ class ShaderEval {
   /* Evaluate shader at points specified by KernelShaderEvalInput and write out
    * RGBA colors to output. */
   bool eval(const ShaderEvalType type,
-            const int max_num_points,
+            const int max_num_inputs,
+            const int num_channels,
             const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
-            const function<void(device_vector<float4> &)> &read_output);
+            const function<void(device_vector<float> &)> &read_output);
 
  protected:
   bool eval_cpu(Device *device,
                 const ShaderEvalType type,
                 device_vector<KernelShaderEvalInput> &input,
-                device_vector<float4> &output);
+                device_vector<float> &output,
+                const int64_t work_size);
   bool eval_gpu(Device *device,
                 const ShaderEvalType type,
                 device_vector<KernelShaderEvalInput> &input,
-                device_vector<float4> &output);
+                device_vector<float> &output,
+                const int64_t work_size);
 
   Device *device_;
   Progress &progress_;
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index 81f328c710b..8b7b0ec0548 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -58,11 +58,11 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
 
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
                                                        const KernelShaderEvalInput *input,
-                                                       float4 *output,
+                                                       float *output,
                                                        const int offset);
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
                                                      const KernelShaderEvalInput *input,
-                                                     float4 *output,
+                                                     float *output,
                                                      const int offset);
 
 /* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 1432abfd330..23e371f165f 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -114,7 +114,7 @@ DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
 
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
                                                      const KernelShaderEvalInput *input,
-                                                     float4 *output,
+                                                     float *output,
                                                      const int offset)
 {
 #ifdef KERNEL_STUB
@@ -126,7 +126,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
 
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
                                                        const KernelShaderEvalInput *input,
-                                                       float4 *output,
+                                                       float *output,
                                                        const int offset)
 {
 #ifdef KERNEL_STUB
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 3379114fc62..21901215757 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -615,7 +615,7 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
-                                    float4 *output,
+                                    float *output,
                                     const int offset,
                                     const int work_size)
 {
@@ -629,7 +629,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
-                                      float4 *output,
+                                      float *output,
                                       const int offset,
                                       const int work_size)
 {
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
index 00d44f0e5ed..3ebd21e4651 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -85,7 +85,8 @@ ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
     if (num_recorded_hits > 0) {
       sort_intersections(isect, num_recorded_hits);
 
-      /* Write intersection result into global integrator state memory. */
+      /* Write intersection result into global integrator state memory.
+       * It may be more efficient to do this directly from the intersection kernel. */
       for (int hit = 0; hit < num_recorded_hits; hit++) {
         integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
       }
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index cfff727d007..6cbb8dcc291 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_displace_evaluate(ccl_global const KernelGlobals *kg,
                                          ccl_global const KernelShaderEvalInput *input,
-                                         ccl_global float4 *output,
+                                         ccl_global float *output,
                                          const int offset)
 {
   /* Setup shader data. */
@@ -53,12 +53,14 @@ ccl_device void kernel_displace_evaluate(ccl_global const KernelGlobals *kg,
   D = ensure_finite3(D);
 
   /* Write output. */
-  output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
+  output[offset * 3 + 0] += D.x;
+  output[offset * 3 + 1] += D.y;
+  output[offset * 3 + 2] += D.z;
 }
 
 ccl_device void kernel_background_evaluate(ccl_global const KernelGlobals *kg,
                                            ccl_global const KernelShaderEvalInput *input,
-                                           ccl_global float4 *output,
+                                           ccl_global float *output,
                                            const int offset)
 {
   /* Setup ray */
@@ -88,7 +90,9 @@ ccl_device void kernel_background_evaluate(ccl_global const KernelGlobals *kg,
   color = ensure_finite3(color);
 
   /* Write output. */
-  output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
+  output[offset * 3 + 0] += color.x;
+  output[offset * 3 + 1] += color.y;
+  output[offset * 3 + 2] += color.z;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index ae1150fc07b..400ed0802a6 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -50,6 +50,7 @@ static void shade_background_pixels(Device *device,
   device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
   const int size = width * height;
+  const int num_channels = 3;
   pixels.resize(size);
 
   /* Evaluate shader on device. */
@@ -57,6 +58,7 @@ static void shade_background_pixels(Device *device,
   shader_eval.eval(
       SHADER_EVAL_BACKGROUND,
       size,
+      num_channels,
       [&](device_vector<KernelShaderEvalInput> &d_input) {
         /* Fill coordinates for shading. */
         KernelShaderEvalInput *d_input_data = d_input.data();
@@ -77,15 +79,15 @@ static void shade_background_pixels(Device *device,
 
         return size;
       },
-      [&](device_vector<float4> &d_output) {
+      [&](device_vector<float> &d_output) {
         /* Copy output to pixel buffer. */
-        float4 *d_output_data = d_output.data();
+        float *d_output_data = d_output.data();
 
         for (int y = 0; y < height; y++) {
           for (int x = 0; x < width; x++) {
-            pixels[y * width + x].x = d_output_data[y * width + x].x;
-            pixels[y * width + x].y = d_output_data[y * width + x].y;
-            pixels[y * width + x].z = d_output_data[y * width + x].z;
+            pixels[y * width + x].x = d_output_data[(y * width + x) * num_channels + 0];
+            pixels[y * width + x].y = d_output_data[(y * width + x) * num_channels + 1];
+            pixels[y * width + x].z = d_output_data[(y * width + x) * num_channels + 2];
           }
         }
       });
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index c00c4c24211..bf8a4585907 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -115,7 +115,7 @@ static int fill_shader_input(const Scene *scene,
 /* Read back mesh displacement shader output. */
 static void read_shader_output(const Scene *scene,
                                Mesh *mesh,
-                               const device_vector<float4> &d_output)
+                               const device_vector<float> &d_output)
 {
   const array<int> &mesh_shaders = mesh->get_shader();
   const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
@@ -125,7 +125,7 @@ static void read_shader_output(const Scene *scene,
   const int num_motion_steps = mesh->get_motion_steps();
   vector<bool> done(num_verts, false);
 
-  const float4 *d_output_data = d_output.data();
+  const float *d_output_data = d_output.data();
   int d_output_index = 0;
 
   Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
@@ -144,7 +144,11 @@ static void read_shader_output(const Scene *scene,
     for (int j = 0; j < 3; j++) {
       if (!done[t.v[j]]) {
         done[t.v[j]] = true;
-        float3 off = float4_to_float3(d_output_data[d_output_index++]);
+        float3 off = make_float3(d_output_data[d_output_index + 0],
+                                 d_output_data[d_output_index + 1],
+                                 d_output_data[d_output_index + 2]);
+        d_output_index += 3;
+
         /* Avoid illegal vertex coordinates. */
         off = ensure_finite3(off);
         mesh_verts[t.v[j]] += off;
@@ -194,6 +198,7 @@ bool GeometryManager::displace(
   ShaderEval shader_eval(device, progress);
   if (!shader_eval.eval(SHADER_EVAL_DISPLACE,
                         num_verts,
+                        3,
                         function_bind(&fill_shader_input, scene, mesh, object_index, _1),
                         function_bind(&read_shader_output, scene, mesh, _1))) {
     return false;
author	Brecht Van Lommel <brecht@blender.org>	2021-10-13 20:13:35 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-10-15 16:42:44 +0300
commit	2ba7c3aa650c3c795d903a24998204f67c75b017 (patch)
tree	ef80c7cadbe59d1062dd75818baad4d8ad594bcb /intern
parent	70376154a0b09dc05fcc5bd79c33fdf7c6acbd9a (diff)