diff options
Diffstat (limited to 'intern/cycles')
62 files changed, 4721 insertions, 2231 deletions
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 94ed3dd4311..3c9c83fec42 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -383,7 +383,6 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.prop(cscene, "use_progressive_refine") subsub = sub.column(align=True) - subsub.enabled = not rd.use_border subsub.prop(rd, "use_save_buffers") col = split.column(align=True) @@ -1599,89 +1598,62 @@ def draw_pause(self, context): def get_panels(): - types = bpy.types - panels = [ - "RENDER_PT_render", - "RENDER_PT_output", - "RENDER_PT_encoding", - "RENDER_PT_dimensions", - "RENDER_PT_stamp", - "RENDER_PT_freestyle", - "RENDERLAYER_PT_layers", - "RENDERLAYER_PT_freestyle", - "RENDERLAYER_PT_freestyle_lineset", - "RENDERLAYER_PT_freestyle_linestyle", - "SCENE_PT_scene", - "SCENE_PT_color_management", - "SCENE_PT_custom_props", - "SCENE_PT_audio", - "SCENE_PT_unit", - "SCENE_PT_keying_sets", - "SCENE_PT_keying_set_paths", - "SCENE_PT_physics", - "WORLD_PT_context_world", - "WORLD_PT_custom_props", - "DATA_PT_context_mesh", - "DATA_PT_context_camera", - "DATA_PT_context_lamp", - "DATA_PT_context_speaker", - "DATA_PT_normals", - "DATA_PT_texture_space", - "DATA_PT_curve_texture_space", - "DATA_PT_mball_texture_space", - "DATA_PT_vertex_groups", - "DATA_PT_shape_keys", - "DATA_PT_uv_texture", - "DATA_PT_vertex_colors", - "DATA_PT_camera", - "DATA_PT_camera_display", - "DATA_PT_camera_stereoscopy", - "DATA_PT_camera_safe_areas", - "DATA_PT_lens", - "DATA_PT_speaker", - "DATA_PT_distance", - "DATA_PT_cone", - "DATA_PT_customdata", - "DATA_PT_custom_props_mesh", - "DATA_PT_custom_props_camera", - "DATA_PT_custom_props_lamp", - "DATA_PT_custom_props_speaker", - "DATA_PT_custom_props_arm", - "DATA_PT_custom_props_curve", - "DATA_PT_custom_props_lattice", - "DATA_PT_custom_props_metaball", - "TEXTURE_PT_preview", - "TEXTURE_PT_custom_props", - "TEXTURE_PT_clouds", - "TEXTURE_PT_wood", - 
"TEXTURE_PT_marble", - "TEXTURE_PT_magic", - "TEXTURE_PT_blend", - "TEXTURE_PT_stucci", - "TEXTURE_PT_image", - "TEXTURE_PT_image_sampling", - "TEXTURE_PT_image_mapping", - "TEXTURE_PT_musgrave", - "TEXTURE_PT_voronoi", - "TEXTURE_PT_distortednoise", - "TEXTURE_PT_voxeldata", - "TEXTURE_PT_pointdensity", - "TEXTURE_PT_pointdensity_turbulence", - "TEXTURE_PT_mapping", - "TEXTURE_PT_ocean", - "TEXTURE_PT_influence", - "TEXTURE_PT_colors", - "SCENE_PT_rigid_body_world", - "SCENE_PT_rigid_body_cache", - "SCENE_PT_rigid_body_field_weights", - "MATERIAL_PT_custom_props", - "MATERIAL_PT_freestyle_line", - "BONE_PT_custom_props", - "OBJECT_PT_custom_props", - ] - - return [getattr(types, p) for p in panels if hasattr(types, p)] - + exclude_panels = { + 'DATA_PT_area', + 'DATA_PT_camera_dof', + 'DATA_PT_falloff_curve', + 'DATA_PT_lamp', + 'DATA_PT_preview', + 'DATA_PT_shadow', + 'DATA_PT_spot', + 'DATA_PT_sunsky', + 'MATERIAL_PT_context_material', + 'MATERIAL_PT_diffuse', + 'MATERIAL_PT_flare', + 'MATERIAL_PT_halo', + 'MATERIAL_PT_mirror', + 'MATERIAL_PT_options', + 'MATERIAL_PT_pipeline', + 'MATERIAL_PT_preview', + 'MATERIAL_PT_shading', + 'MATERIAL_PT_shadow', + 'MATERIAL_PT_specular', + 'MATERIAL_PT_sss', + 'MATERIAL_PT_strand', + 'MATERIAL_PT_transp', + 'MATERIAL_PT_volume_density', + 'MATERIAL_PT_volume_integration', + 'MATERIAL_PT_volume_lighting', + 'MATERIAL_PT_volume_options', + 'MATERIAL_PT_volume_shading', + 'MATERIAL_PT_volume_transp', + 'RENDERLAYER_PT_layer_options', + 'RENDERLAYER_PT_layer_passes', + 'RENDERLAYER_PT_views', + 'RENDER_PT_antialiasing', + 'RENDER_PT_bake', + 'RENDER_PT_motion_blur', + 'RENDER_PT_performance', + 'RENDER_PT_post_processing', + 'RENDER_PT_shading', + 'SCENE_PT_simplify', + 'TEXTURE_PT_context_texture', + 'WORLD_PT_ambient_occlusion', + 'WORLD_PT_environment_lighting', + 'WORLD_PT_gather', + 'WORLD_PT_indirect_lighting', + 'WORLD_PT_mist', + 'WORLD_PT_preview', + 'WORLD_PT_world' + } + + panels = [] + for panel in 
bpy.types.Panel.__subclasses__(): + if hasattr(panel, 'COMPAT_ENGINES') and 'BLENDER_RENDER' in panel.COMPAT_ENGINES: + if panel.__name__ not in exclude_panels: + panels.append(panel) + + return panels def register(): bpy.types.RENDER_PT_render.append(draw_device) @@ -1690,10 +1662,10 @@ def register(): for panel in get_panels(): panel.COMPAT_ENGINES.add('CYCLES') - def unregister(): bpy.types.RENDER_PT_render.remove(draw_device) bpy.types.VIEW3D_HT_header.remove(draw_pause) for panel in get_panels(): - panel.COMPAT_ENGINES.remove('CYCLES') + if 'CYCLES' in panel.COMPAT_ENGINES: + panel.COMPAT_ENGINES.remove('CYCLES') diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 7ca23f23cb4..64559804ccb 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -440,7 +440,7 @@ static ShaderNode *add_node(Scene *scene, glossy->distribution = CLOSURE_BSDF_MICROFACET_GGX_ID; break; case BL::ShaderNodeBsdfGlossy::distribution_ASHIKHMIN_SHIRLEY: - glossy->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID; + glossy->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID; break; case BL::ShaderNodeBsdfGlossy::distribution_MULTI_GGX: glossy->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 5729fa6113d..92e48f0d87f 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -19,6 +19,7 @@ set(SRC bvh_node.cpp bvh_sort.cpp bvh_split.cpp + bvh_unaligned.cpp ) set(SRC_HEADERS @@ -29,6 +30,7 @@ set(SRC_HEADERS bvh_params.h bvh_sort.h bvh_split.h + bvh_unaligned.h ) include_directories(${INC}) diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index fa2b9ae7279..e92526ac1c4 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -24,6 +24,7 @@ #include "bvh_build.h" #include "bvh_node.h" #include "bvh_params.h" +#include "bvh_unaligned.h" 
#include "util_debug.h" #include "util_foreach.h" @@ -121,7 +122,7 @@ void BVH::refit(Progress& progress) /* Triangles */ -void BVH::pack_triangle(int idx, float4 storage[3]) +void BVH::pack_triangle(int idx, float4 tri_verts[3]) { int tob = pack.prim_object[idx]; assert(tob >= 0 && tob < objects.size()); @@ -129,49 +130,58 @@ void BVH::pack_triangle(int idx, float4 storage[3]) int tidx = pack.prim_index[idx]; Mesh::Triangle t = mesh->get_triangle(tidx); - const float3* vpos = &mesh->verts[0]; + const float3 *vpos = &mesh->verts[0]; float3 v0 = vpos[t.v[0]]; float3 v1 = vpos[t.v[1]]; float3 v2 = vpos[t.v[2]]; - storage[0] = float3_to_float4(v0); - storage[1] = float3_to_float4(v1); - storage[2] = float3_to_float4(v2); + tri_verts[0] = float3_to_float4(v0); + tri_verts[1] = float3_to_float4(v1); + tri_verts[2] = float3_to_float4(v2); } void BVH::pack_primitives() { - int nsize = TRI_NODE_SIZE; - size_t tidx_size = pack.prim_index.size(); - - pack.tri_storage.clear(); - pack.tri_storage.resize(tidx_size * nsize); + const size_t tidx_size = pack.prim_index.size(); + size_t num_prim_triangles = 0; + /* Count number of triangles primitives in BVH. */ + for(unsigned int i = 0; i < tidx_size; i++) { + if((pack.prim_index[i] != -1)) { + if ((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + ++num_prim_triangles; + } + } + } + /* Reserve size for arrays. */ + pack.prim_tri_index.clear(); + pack.prim_tri_index.resize(tidx_size); + pack.prim_tri_verts.clear(); + pack.prim_tri_verts.resize(num_prim_triangles * 3); pack.prim_visibility.clear(); pack.prim_visibility.resize(tidx_size); - + /* Fill in all the arrays. 
*/ + size_t prim_triangle_index = 0; for(unsigned int i = 0; i < tidx_size; i++) { if(pack.prim_index[i] != -1) { - float4 storage[3]; + int tob = pack.prim_object[i]; + Object *ob = objects[tob]; - if(pack.prim_type[i] & PRIMITIVE_TRIANGLE) { - pack_triangle(i, storage); + if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]); + pack.prim_tri_index[i] = 3 * prim_triangle_index; + ++prim_triangle_index; } else { - /* Avoid use of uninitialized memory. */ - memset(&storage, 0, sizeof(storage)); + pack.prim_tri_index[i] = -1; } - memcpy(&pack.tri_storage[i * nsize], storage, sizeof(float4)*3); - - int tob = pack.prim_object[i]; - Object *ob = objects[tob]; pack.prim_visibility[i] = ob->visibility; if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) pack.prim_visibility[i] |= PATH_RAY_CURVE; } else { - memset(&pack.tri_storage[i * nsize], 0, sizeof(float4)*3); + pack.prim_tri_index[i] = -1; pack.prim_visibility[i] = 0; } } @@ -183,13 +193,13 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) { /* The BVH's for instances are built separately, but for traversal all * BVH's are stored in global arrays. This function merges them into the - * top level BVH, adjusting indexes and offsets where appropriate. */ - bool use_qbvh = params.use_qbvh; - size_t nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; - size_t nsize_leaf = (use_qbvh)? BVH_QNODE_LEAF_SIZE: BVH_NODE_LEAF_SIZE; + * top level BVH, adjusting indexes and offsets where appropriate. + */ + const bool use_qbvh = params.use_qbvh; - /* adjust primitive index to point to the triangle in the global array, for - * meshes with transform applied and already in the top level BVH */ + /* Adjust primitive index to point to the triangle in the global array, for + * meshes with transform applied and already in the top level BVH. 
+ */ for(size_t i = 0; i < pack.prim_index.size(); i++) if(pack.prim_index[i] != -1) { if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) @@ -208,10 +218,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* reserve */ size_t prim_index_size = pack.prim_index.size(); - size_t tri_storage_size = pack.tri_storage.size(); + size_t prim_tri_verts_size = pack.prim_tri_verts.size(); size_t pack_prim_index_offset = prim_index_size; - size_t pack_tri_storage_offset = tri_storage_size; + size_t pack_prim_tri_verts_offset = prim_tri_verts_size; size_t pack_nodes_offset = nodes_size; size_t pack_leaf_nodes_offset = leaf_nodes_size; size_t object_offset = 0; @@ -225,7 +235,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) if(mesh->need_build_bvh()) { if(mesh_map.find(mesh) == mesh_map.end()) { prim_index_size += bvh->pack.prim_index.size(); - tri_storage_size += bvh->pack.tri_storage.size(); + prim_tri_verts_size += bvh->pack.prim_tri_verts.size(); nodes_size += bvh->pack.nodes.size(); leaf_nodes_size += bvh->pack.leaf_nodes.size(); @@ -240,7 +250,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack.prim_type.resize(prim_index_size); pack.prim_object.resize(prim_index_size); pack.prim_visibility.resize(prim_index_size); - pack.tri_storage.resize(tri_storage_size); + pack.prim_tri_verts.resize(prim_tri_verts_size); + pack.prim_tri_index.resize(prim_index_size); pack.nodes.resize(nodes_size); pack.leaf_nodes.resize(leaf_nodes_size); pack.object_node.resize(objects.size()); @@ -249,7 +260,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL; int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL; uint *pack_prim_visibility = (pack.prim_visibility.size())? &pack.prim_visibility[0]: NULL; - float4 *pack_tri_storage = (pack.tri_storage.size())? 
&pack.tri_storage[0]: NULL; + float4 *pack_prim_tri_verts = (pack.prim_tri_verts.size())? &pack.prim_tri_verts[0]: NULL; + uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL; int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL; int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL; @@ -277,8 +289,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) BVH *bvh = mesh->bvh; - int noffset = nodes_offset/nsize; - int noffset_leaf = nodes_leaf_offset/nsize_leaf; + int noffset = nodes_offset; + int noffset_leaf = nodes_leaf_offset; int mesh_tri_offset = mesh->tri_offset; int mesh_curve_offset = mesh->curve_offset; @@ -290,18 +302,24 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) mesh_map[mesh] = pack.object_node[object_offset-1]; - /* merge primitive and object indexes */ + /* merge primitive, object and triangle indexes */ if(bvh->pack.prim_index.size()) { size_t bvh_prim_index_size = bvh->pack.prim_index.size(); int *bvh_prim_index = &bvh->pack.prim_index[0]; int *bvh_prim_type = &bvh->pack.prim_type[0]; uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0]; + uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0]; for(size_t i = 0; i < bvh_prim_index_size; i++) { - if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) + if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_curve_offset; - else + pack_prim_tri_index[pack_prim_index_offset] = -1; + } + else { pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset; + pack_prim_tri_index[pack_prim_index_offset] = + bvh_prim_tri_index[i] + pack_prim_tri_verts_offset; + } pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i]; pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i]; @@ -310,50 +328,64 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) } } - /* merge triangle intersection data 
*/ - if(bvh->pack.tri_storage.size()) { - memcpy(pack_tri_storage + pack_tri_storage_offset, - &bvh->pack.tri_storage[0], - bvh->pack.tri_storage.size()*sizeof(float4)); - pack_tri_storage_offset += bvh->pack.tri_storage.size(); + /* Merge triangle vertices data. */ + if(bvh->pack.prim_tri_verts.size()) { + const size_t prim_tri_size = bvh->pack.prim_tri_verts.size(); + memcpy(pack_prim_tri_verts + pack_prim_tri_verts_offset, + &bvh->pack.prim_tri_verts[0], + prim_tri_size*sizeof(float4)); + pack_prim_tri_verts_offset += prim_tri_size; } /* merge nodes */ if(bvh->pack.leaf_nodes.size()) { int4 *leaf_nodes_offset = &bvh->pack.leaf_nodes[0]; size_t leaf_nodes_offset_size = bvh->pack.leaf_nodes.size(); - for(size_t i = 0, j = 0; i < leaf_nodes_offset_size; i+=nsize_leaf, j++) { + for(size_t i = 0, j = 0; + i < leaf_nodes_offset_size; + i+= BVH_NODE_LEAF_SIZE, j++) + { int4 data = leaf_nodes_offset[i]; data.x += prim_offset; data.y += prim_offset; pack_leaf_nodes[pack_leaf_nodes_offset] = data; - for(int j = 1; j < nsize_leaf; ++j) { + for(int j = 1; j < BVH_NODE_LEAF_SIZE; ++j) { pack_leaf_nodes[pack_leaf_nodes_offset + j] = leaf_nodes_offset[i + j]; } - pack_leaf_nodes_offset += nsize_leaf; + pack_leaf_nodes_offset += BVH_NODE_LEAF_SIZE; } } if(bvh->pack.nodes.size()) { - /* For QBVH we're packing a child bbox into 6 float4, - * and for regular BVH they're packed into 3 float4. - */ - size_t nsize_bbox = (use_qbvh)? 6: 3; int4 *bvh_nodes = &bvh->pack.nodes[0]; - size_t bvh_nodes_size = bvh->pack.nodes.size(); + size_t bvh_nodes_size = bvh->pack.nodes.size(); + + for(size_t i = 0, j = 0; i < bvh_nodes_size; j++) { + size_t nsize, nsize_bbox; + if(bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) { + nsize = use_qbvh + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_UNALIGNED_NODE_SIZE; + nsize_bbox = (use_qbvh)? 13: 0; + } + else { + nsize = (use_qbvh)? BVH_QNODE_SIZE: BVH_NODE_SIZE; + nsize_bbox = (use_qbvh)? 
7: 0; + } - for(size_t i = 0, j = 0; i < bvh_nodes_size; i+=nsize, j++) { - memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox*sizeof(int4)); + memcpy(pack_nodes + pack_nodes_offset, + bvh_nodes + i, + nsize_bbox*sizeof(int4)); - /* modify offsets into arrays */ + /* Modify offsets into arrays */ int4 data = bvh_nodes[i + nsize_bbox]; - data.x += (data.x < 0)? -noffset_leaf: noffset; - data.y += (data.y < 0)? -noffset_leaf: noffset; + data.z += (data.z < 0)? -noffset_leaf: noffset; + data.w += (data.w < 0)? -noffset_leaf: noffset; if(use_qbvh) { - data.z += (data.z < 0)? -noffset_leaf: noffset; - data.w += (data.w < 0)? -noffset_leaf: noffset; + data.x += (data.x < 0)? -noffset_leaf: noffset; + data.y += (data.y < 0)? -noffset_leaf: noffset; } pack_nodes[pack_nodes_offset + nsize_bbox] = data; @@ -366,6 +398,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) sizeof(int4) * (nsize - (nsize_bbox+1))); pack_nodes_offset += nsize; + i += nsize; } } @@ -377,12 +410,20 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) /* Regular BVH */ +static bool node_bvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + return node0->is_unaligned() || node1->is_unaligned(); +} + RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { } -void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +void RegularBVH::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) { float4 data[BVH_NODE_LEAF_SIZE]; memset(data, 0, sizeof(data)); @@ -401,54 +442,130 @@ void RegularBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); } - memcpy(&pack.leaf_nodes[e.idx * BVH_NODE_LEAF_SIZE], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); +} + +void RegularBVH::pack_inner(const 
BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + if (e0.node->is_unaligned() || e1.node->is_unaligned()) { + pack_unaligned_inner(e, e0, e1); + } else { + pack_aligned_inner(e, e0, e1); + } } -void RegularBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1) +void RegularBVH::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { - pack_node(e.idx, e0.node->m_bounds, e1.node->m_bounds, e0.encodeIdx(), e1.encodeIdx(), e0.node->m_visibility, e1.node->m_visibility); + pack_aligned_node(e.idx, + e0.node->m_bounds, e1.node->m_bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED, + e1.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED); } -void RegularBVH::pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int c0, int c1, uint visibility0, uint visibility1) +void RegularBVH::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1) { int4 data[BVH_NODE_SIZE] = { + make_int4(visibility0, visibility1, c0, c1), make_int4(__float_as_int(b0.min.x), __float_as_int(b1.min.x), __float_as_int(b0.max.x), __float_as_int(b1.max.x)), make_int4(__float_as_int(b0.min.y), __float_as_int(b1.min.y), __float_as_int(b0.max.y), __float_as_int(b1.max.y)), make_int4(__float_as_int(b0.min.z), __float_as_int(b1.min.z), __float_as_int(b0.max.z), __float_as_int(b1.max.z)), - make_int4(c0, c1, visibility0, visibility1) }; - memcpy(&pack.nodes[idx * BVH_NODE_SIZE], data, sizeof(int4)*BVH_NODE_SIZE); + memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); } -void RegularBVH::pack_nodes(const BVHNode *root) +void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { - size_t tot_node_size = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - size_t node_size = 
tot_node_size - leaf_node_size; + pack_unaligned_node(e.idx, + e0.node->get_aligned_space(), + e1.node->get_aligned_space(), + e0.node->m_bounds, + e1.node->m_bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->m_visibility, e1.node->m_visibility); +} - /* resize arrays */ - pack.nodes.clear(); +void RegularBVH::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + float4 data[BVH_UNALIGNED_NODE_SIZE]; + Transform space0 = BVHUnaligned::compute_node_transform(bounds0, + aligned_space0); + Transform space1 = BVHUnaligned::compute_node_transform(bounds1, + aligned_space1); + data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(c0), + __int_as_float(c1)); + + data[1] = space0.x; + data[2] = space0.y; + data[3] = space0.z; + data[4] = space1.x; + data[5] = space1.y; + data[6] = space1.z; + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); +} - /* for top level BVH, first merge existing BVH's so we know the offsets */ +void RegularBVH::pack_nodes(const BVHNode *root) +{ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_NODE_SIZE; + } + /* Resize arrays */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. 
*/ if(params.top_level) { - pack_instances(node_size*BVH_NODE_SIZE, - leaf_node_size*BVH_NODE_LEAF_SIZE); + pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); } else { - pack.nodes.resize(node_size*BVH_NODE_SIZE); - pack.leaf_nodes.resize(leaf_node_size*BVH_NODE_LEAF_SIZE); + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); } int nextNodeIdx = 0, nextLeafNodeIdx = 0; vector<BVHStackEntry> stack; stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) + if(root->is_leaf()) { stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - else - stack.push_back(BVHStackEntry(root, nextNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_bvh_is_unaligned(root) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } while(stack.size()) { BVHStackEntry e = stack.back(); @@ -456,20 +573,31 @@ void RegularBVH::pack_nodes(const BVHNode *root) if(e.node->is_leaf()) { /* leaf node */ - const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node); + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); pack_leaf(e, leaf); } else { /* innner node */ - int idx0 = (e.node->get_child(0)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++); - int idx1 = (e.node->get_child(1)->is_leaf())? (nextLeafNodeIdx++) : (nextNodeIdx++); - stack.push_back(BVHStackEntry(e.node->get_child(0), idx0)); - stack.push_back(BVHStackEntry(e.node->get_child(1), idx1)); + int idx[2]; + for (int i = 0; i < 2; ++i) { + if (e.node->get_child(i)->is_leaf()) { + idx[i] = nextLeafNodeIdx++; + } + else { + idx[i] = nextNodeIdx; + nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + ? 
BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + } + + stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); + stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); } } - + assert(node_size == nextNodeIdx); /* root index to start traversal at, to handle case of single leaf node */ pack.root_index = (root->is_leaf())? -1: 0; } @@ -486,7 +614,7 @@ void RegularBVH::refit_nodes() void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) { if(leaf) { - int4 *data = &pack.leaf_nodes[idx*BVH_NODE_LEAF_SIZE]; + int4 *data = &pack.leaf_nodes[idx]; int c0 = data[0].x; int c1 = data[0].y; /* refit leaf node */ @@ -565,9 +693,9 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility sizeof(float4)*BVH_NODE_LEAF_SIZE); } else { - int4 *data = &pack.nodes[idx*BVH_NODE_SIZE]; - int c0 = data[3].x; - int c1 = data[3].y; + int4 *data = &pack.nodes[idx]; + int c0 = data[0].z; + int c1 = data[0].w; /* refit inner node, set bbox from children */ BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; uint visibility0 = 0, visibility1 = 0; @@ -575,7 +703,7 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); - pack_node(idx, bbox0, bbox1, c0, c1, visibility0, visibility1); + pack_aligned_node(idx, bbox0, bbox1, c0, c1, visibility0, visibility1); bbox.grow(bbox0); bbox.grow(bbox1); @@ -585,6 +713,33 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility /* QBVH */ +/* Can we avoid this somehow or make more generic? + * + * Perhaps we can merge nodes in actual tree and make our + * life easier all over the place. 
+ */ +static bool node_qbvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + bool has_unaligned = false; + if(node0->is_leaf()) { + has_unaligned |= node0->is_unaligned(); + } + else { + has_unaligned |= node0->get_child(0)->is_unaligned(); + has_unaligned |= node0->get_child(1)->is_unaligned(); + } + if(node1->is_leaf()) { + has_unaligned |= node1->is_unaligned(); + } + else { + has_unaligned |= node1->get_child(0)->is_unaligned(); + has_unaligned |= node1->get_child(1)->is_unaligned(); + } + return has_unaligned; +} + QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { @@ -610,66 +765,153 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); } - memcpy(&pack.leaf_nodes[e.idx * BVH_QNODE_LEAF_SIZE], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); +} + +void QBVH::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned()) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. 
+ */ + pack_aligned_inner(e, en, num); + } } -void QBVH::pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num) +void QBVH::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) { float4 data[BVH_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + data[0].x = __uint_as_float(e.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED); for(int i = 0; i < num; i++) { float3 bb_min = en[i].node->m_bounds.min; float3 bb_max = en[i].node->m_bounds.max; - data[0][i] = bb_min.x; - data[1][i] = bb_max.x; - data[2][i] = bb_min.y; - data[3][i] = bb_max.y; - data[4][i] = bb_min.z; - data[5][i] = bb_max.z; + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; - data[6][i] = __int_as_float(en[i].encodeIdx()); + data[7][i] = __int_as_float(en[i].encodeIdx()); } for(int i = num; i < 4; i++) { /* We store BB which would never be recorded as intersection * so kernel might safely assume there are always 4 child nodes. 
*/ - data[0][i] = FLT_MAX; - data[1][i] = -FLT_MAX; + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_QNODE_SIZE); +} + +void QBVH::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + float4 data[BVH_UNALIGNED_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(e.node->m_visibility | PATH_RAY_NODE_UNALIGNED); + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + en[i].node->m_bounds, + en[i].node->get_aligned_space()); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; - data[2][i] = FLT_MAX; - data[3][i] = -FLT_MAX; + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; - data[4][i] = FLT_MAX; - data[5][i] = -FLT_MAX; + data[13][i] = __int_as_float(en[i].encodeIdx()); + } - data[6][i] = __int_as_float(0); + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + for(int j = 1; j < 13; ++j) { + data[j][i] = 0.0f; + } + data[13][i] = __int_as_float(0); } - memcpy(&pack.nodes[e.idx * BVH_QNODE_SIZE], data, sizeof(float4)*BVH_QNODE_SIZE); + memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); } /* Quad SIMD Nodes */ void QBVH::pack_nodes(const BVHNode *root) { - size_t tot_node_size = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); - size_t leaf_node_size = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - size_t node_size = tot_node_size - leaf_node_size; - - /* resize arrays */ + /* Calculate size of the arrays required. 
*/ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_QNODE_SIZE; + } + /* Resize arrays. */ pack.nodes.clear(); pack.leaf_nodes.clear(); - - /* for top level BVH, first merge existing BVH's so we know the offsets */ + /* For top level BVH, first merge existing BVH's so we know the offsets. */ if(params.top_level) { - pack_instances(node_size*BVH_QNODE_SIZE, - leaf_node_size*BVH_QNODE_LEAF_SIZE); + pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); } else { - pack.nodes.resize(node_size*BVH_QNODE_SIZE); - pack.leaf_nodes.resize(leaf_node_size*BVH_QNODE_LEAF_SIZE); + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); } int nextNodeIdx = 0, nextLeafNodeIdx = 0; @@ -680,7 +922,10 @@ void QBVH::pack_nodes(const BVHNode *root) stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); } else { - stack.push_back(BVHStackEntry(root, nextNodeIdx++)); + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_qbvh_is_unaligned(root) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; } while(stack.size()) { @@ -689,19 +934,17 @@ void QBVH::pack_nodes(const BVHNode *root) if(e.node->is_leaf()) { /* leaf node */ - const LeafNode* leaf = reinterpret_cast<const LeafNode*>(e.node); + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); pack_leaf(e, leaf); } else { - /* inner node */ + /* Inner node. 
*/ const BVHNode *node = e.node; const BVHNode *node0 = node->get_child(0); const BVHNode *node1 = node->get_child(1); - - /* collect nodes */ + /* Collect nodes. */ const BVHNode *nodes[4]; int numnodes = 0; - if(node0->is_leaf()) { nodes[numnodes++] = node0; } @@ -709,7 +952,6 @@ void QBVH::pack_nodes(const BVHNode *root) nodes[numnodes++] = node0->get_child(0); nodes[numnodes++] = node0->get_child(1); } - if(node1->is_leaf()) { nodes[numnodes++] = node1; } @@ -717,25 +959,26 @@ void QBVH::pack_nodes(const BVHNode *root) nodes[numnodes++] = node1->get_child(0); nodes[numnodes++] = node1->get_child(1); } - - /* push entries on the stack */ - for(int i = 0; i < numnodes; i++) { + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { int idx; if(nodes[i]->is_leaf()) { idx = nextLeafNodeIdx++; } else { - idx = nextNodeIdx++; + idx = nextNodeIdx; + nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; } stack.push_back(BVHStackEntry(nodes[i], idx)); } - - /* set node */ + /* Set node. */ pack_inner(e, &stack[stack.size()-numnodes], numnodes); } } - - /* root index to start traversal at, to handle case of single leaf node */ + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ pack.root_index = (root->is_leaf())? -1: 0; } @@ -751,7 +994,7 @@ void QBVH::refit_nodes() void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) { if(leaf) { - int4 *data = &pack.leaf_nodes[idx*BVH_QNODE_LEAF_SIZE]; + int4 *data = &pack.leaf_nodes[idx]; int4 c = data[0]; /* Refit leaf node. 
*/ for(int prim = c.x; prim < c.y; prim++) { @@ -833,13 +1076,18 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) leaf_data[0].y = __int_as_float(c.y); leaf_data[0].z = __uint_as_float(visibility); leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx * BVH_QNODE_LEAF_SIZE], - leaf_data, - sizeof(float4)*BVH_QNODE_LEAF_SIZE); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); } else { - int4 *data = &pack.nodes[idx*BVH_QNODE_SIZE]; - int4 c = data[6]; + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[13]; + } + else { + c = data[7]; + } /* Refit inner node, set bbox from children. */ BoundBox child_bbox[4] = {BoundBox::empty, BoundBox::empty, @@ -858,21 +1106,62 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) } } - float4 inner_data[BVH_QNODE_SIZE]; - for(int i = 0; i < 4; ++i) { - float3 bb_min = child_bbox[i].min; - float3 bb_max = child_bbox[i].max; - inner_data[0][i] = bb_min.x; - inner_data[1][i] = bb_max.x; - inner_data[2][i] = bb_min.y; - inner_data[3][i] = bb_max.y; - inner_data[4][i] = bb_min.z; - inner_data[5][i] = bb_max.z; - inner_data[6][i] = __int_as_float(c[i]); + /* TODO(sergey): To be de-duplicated with pack_inner(), + * but for that need some sort of pack_node(). which operates with + * direct data, not stack element. 
+ */ + if(is_unaligned) { + Transform aligned_space = transform_identity(); + float4 inner_data[BVH_UNALIGNED_QNODE_SIZE]; + inner_data[0] = make_float4( + __int_as_float(visibility | PATH_RAY_NODE_UNALIGNED), + 0.0f, + 0.0f, + 0.0f); + for(int i = 0; i < 4; ++i) { + Transform space = BVHUnaligned::compute_node_transform( + child_bbox[i], + aligned_space); + inner_data[1][i] = space.x.x; + inner_data[2][i] = space.x.y; + inner_data[3][i] = space.x.z; + + inner_data[4][i] = space.y.x; + inner_data[5][i] = space.y.y; + inner_data[6][i] = space.y.z; + + inner_data[7][i] = space.z.x; + inner_data[8][i] = space.z.y; + inner_data[9][i] = space.z.z; + + inner_data[10][i] = space.x.w; + inner_data[11][i] = space.y.w; + inner_data[12][i] = space.z.w; + + inner_data[13][i] = __int_as_float(c[i]); + } + memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); + } + else { + float4 inner_data[BVH_QNODE_SIZE]; + inner_data[0] = make_float4( + __int_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED), + 0.0f, + 0.0f, + 0.0f); + for(int i = 0; i < 4; ++i) { + float3 bb_min = child_bbox[i].min; + float3 bb_max = child_bbox[i].max; + inner_data[1][i] = bb_min.x; + inner_data[2][i] = bb_max.x; + inner_data[3][i] = bb_min.y; + inner_data[4][i] = bb_max.y; + inner_data[5][i] = bb_min.z; + inner_data[6][i] = bb_max.z; + inner_data[7][i] = __int_as_float(c[i]); + } + memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_QNODE_SIZE); } - memcpy(&pack.nodes[idx * BVH_QNODE_SIZE], - inner_data, - sizeof(float4)*BVH_QNODE_SIZE); } } diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 6076c25ca31..16752076f6a 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -35,11 +35,14 @@ class Progress; #define BVH_NODE_SIZE 4 #define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 7 +#define BVH_QNODE_SIZE 8 #define BVH_QNODE_LEAF_SIZE 1 #define BVH_ALIGN 4096 #define TRI_NODE_SIZE 3 +#define BVH_UNALIGNED_NODE_SIZE 7 +#define 
BVH_UNALIGNED_QNODE_SIZE 14 + /* Packed BVH * * BVH stored as it will be used for traversal on the rendering device. */ @@ -52,8 +55,10 @@ struct PackedBVH { array<int4> leaf_nodes; /* object index to BVH node index mapping for instances */ array<int> object_node; - /* Aligned triangle storage for fatser lookup in the kernel. */ - array<float4> tri_storage; + /* Mapping from primitive index to index in triangle array. */ + array<uint> prim_tri_index; + /* Continuous storage of triangle vertices. */ + array<float4> prim_tri_verts; /* primitive type - triangle or strand */ array<int> prim_type; /* visibility visibilitys for primitives */ @@ -91,7 +96,7 @@ public: protected: BVH(const BVHParams& params, const vector<Object*>& objects); - /* triangles and strands*/ + /* triangles and strands */ void pack_primitives(); void pack_triangle(int idx, float4 storage[3]); @@ -115,9 +120,32 @@ protected: /* pack */ void pack_nodes(const BVHNode *root); - void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, const BVHStackEntry& e0, const BVHStackEntry& e1); - void pack_node(int idx, const BoundBox& b0, const BoundBox& b1, int c0, int c1, uint visibility0, uint visibility1); + + void pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); /* refit */ void refit_nodes(); @@ -136,9 
+164,17 @@ protected: /* pack */ void pack_nodes(const BVHNode *root); + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + /* refit */ void refit_nodes(); void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index b07e870d759..5ddd7349f7b 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -52,12 +52,35 @@ __forceinline int get_best_dimension(const float4& bestSAH) /* BVH Object Binning */ -BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) -: BVHRange(job), splitSAH(FLT_MAX), dim(0), pos(0) +BVHObjectBinning::BVHObjectBinning(const BVHRange& job, + BVHReference *prims, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) +: BVHRange(job), + splitSAH(FLT_MAX), + dim(0), + pos(0), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { + if(aligned_space_ == NULL) { + bounds_ = bounds(); + cent_bounds_ = cent_bounds(); + } + else { + /* TODO(sergey): With some additional storage we can avoid + * need in re-calculating this. 
+ */ + bounds_ = unaligned_heuristic->compute_aligned_boundbox( + *this, + prims, + *aligned_space, + ¢_bounds_); + } + /* compute number of bins to use and precompute scaling factor for binning */ num_bins = min(size_t(MAX_BINS), size_t(4.0f + 0.05f*size())); - scale = rcp(cent_bounds().size()) * make_float3((float)num_bins); + scale = rcp(cent_bounds_.size()) * make_float3((float)num_bins); /* initialize binning counter and bounds */ BoundBox bin_bounds[MAX_BINS][4]; /* bounds for every bin in every dimension */ @@ -79,30 +102,34 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) const BVHReference& prim0 = prims[start() + i + 0]; const BVHReference& prim1 = prims[start() + i + 1]; - int4 bin0 = get_bin(prim0.bounds()); - int4 bin1 = get_bin(prim1.bounds()); + BoundBox bounds0 = get_prim_bounds(prim0); + BoundBox bounds1 = get_prim_bounds(prim1); + + int4 bin0 = get_bin(bounds0); + int4 bin1 = get_bin(bounds1); /* increase bounds for bins for even primitive */ - int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds()); - int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds()); - int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds()); + int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(bounds0); + int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(bounds0); + int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(bounds0); /* increase bounds of bins for odd primitive */ - int b10 = (int)extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(prim1.bounds()); - int b11 = (int)extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(prim1.bounds()); - int b12 = (int)extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(prim1.bounds()); + int b10 = (int)extract<0>(bin1); bin_count[b10][0]++; bin_bounds[b10][0].grow(bounds1); + 
int b11 = (int)extract<1>(bin1); bin_count[b11][1]++; bin_bounds[b11][1].grow(bounds1); + int b12 = (int)extract<2>(bin1); bin_count[b12][2]++; bin_bounds[b12][2].grow(bounds1); } /* for uneven number of primitives */ if(i < ssize_t(size())) { /* map primitive to bin */ const BVHReference& prim0 = prims[start() + i]; - int4 bin0 = get_bin(prim0.bounds()); + BoundBox bounds0 = get_prim_bounds(prim0); + int4 bin0 = get_bin(bounds0); /* increase bounds of bins */ - int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(prim0.bounds()); - int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(prim0.bounds()); - int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(prim0.bounds()); + int b00 = (int)extract<0>(bin0); bin_count[b00][0]++; bin_bounds[b00][0].grow(bounds0); + int b01 = (int)extract<1>(bin0); bin_count[b01][1]++; bin_bounds[b01][1].grow(bounds0); + int b02 = (int)extract<2>(bin0); bin_count[b02][2]++; bin_bounds[b02][2].grow(bounds0); } } @@ -151,17 +178,19 @@ BVHObjectBinning::BVHObjectBinning(const BVHRange& job, BVHReference *prims) bestSAH = min(sah,bestSAH); } - int4 mask = float3_to_float4(cent_bounds().size()) <= make_float4(0.0f); + int4 mask = float3_to_float4(cent_bounds_.size()) <= make_float4(0.0f); bestSAH = insert<3>(select(mask, make_float4(FLT_MAX), bestSAH), FLT_MAX); /* find best dimension */ dim = get_best_dimension(bestSAH); splitSAH = bestSAH[dim]; pos = bestSplit[dim]; - leafSAH = bounds().half_area() * blocks(size()); + leafSAH = bounds_.half_area() * blocks(size()); } -void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const +void BVHObjectBinning::split(BVHReference* prims, + BVHObjectBinning& left_o, + BVHObjectBinning& right_o) const { size_t N = size(); @@ -176,10 +205,12 @@ void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHO prefetch_L2(&prims[start() + l + 8]); 
prefetch_L2(&prims[start() + r - 8]); - const BVHReference& prim = prims[start() + l]; + BVHReference prim = prims[start() + l]; + BoundBox unaligned_bounds = get_prim_bounds(prim); + float3 unaligned_center = unaligned_bounds.center2(); float3 center = prim.bounds().center2(); - if(get_bin(center)[dim] < pos) { + if(get_bin(unaligned_center)[dim] < pos) { lgeom_bounds.grow(prim.bounds()); lcent_bounds.grow(center); l++; @@ -191,7 +222,6 @@ void BVHObjectBinning::split(BVHReference* prims, BVHObjectBinning& left_o, BVHO r--; } } - /* finish */ if(l != 0 && N-1-r != 0) { right_o = BVHObjectBinning(BVHRange(rgeom_bounds, rcent_bounds, start() + l, N-1-r), prims); diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 60742157055..52955f70151 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -19,11 +19,14 @@ #define __BVH_BINNING_H__ #include "bvh_params.h" +#include "bvh_unaligned.h" #include "util_types.h" CCL_NAMESPACE_BEGIN +class BVHBuild; + /* Single threaded object binner. Finds the split with the best SAH heuristic * by testing for each dimension multiple partitionings for regular spaced * partition locations. 
A partitioning for a partition location is computed, @@ -34,10 +37,18 @@ CCL_NAMESPACE_BEGIN class BVHObjectBinning : public BVHRange { public: - __forceinline BVHObjectBinning() {} - BVHObjectBinning(const BVHRange& job, BVHReference *prims); + __forceinline BVHObjectBinning() : leafSAH(FLT_MAX) {} + + BVHObjectBinning(const BVHRange& job, + BVHReference *prims, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); - void split(BVHReference *prims, BVHObjectBinning& left_o, BVHObjectBinning& right_o) const; + void split(BVHReference *prims, + BVHObjectBinning& left_o, + BVHObjectBinning& right_o) const; + + __forceinline const BoundBox& unaligned_bounds() { return bounds_; } float splitSAH; /* SAH cost of the best split */ float leafSAH; /* SAH cost of creating a leaf */ @@ -48,13 +59,20 @@ protected: size_t num_bins; /* actual number of bins to use */ float3 scale; /* scaling factor to compute bin */ + /* Effective bounds and centroid bounds. */ + BoundBox bounds_; + BoundBox cent_bounds_; + + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; + enum { MAX_BINS = 32 }; enum { LOG_BLOCK_SIZE = 2 }; /* computes the bin numbers for each dimension for a box. */ __forceinline int4 get_bin(const BoundBox& box) const { - int4 a = make_int4((box.center2() - cent_bounds().min)*scale - make_float3(0.5f)); + int4 a = make_int4((box.center2() - cent_bounds_.min)*scale - make_float3(0.5f)); int4 mn = make_int4(0); int4 mx = make_int4((int)num_bins-1); @@ -64,7 +82,7 @@ protected: /* computes the bin numbers for each dimension for a point. */ __forceinline int4 get_bin(const float3& c) const { - return make_int4((c - cent_bounds().min)*scale - make_float3(0.5f)); + return make_int4((c - cent_bounds_.min)*scale - make_float3(0.5f)); } /* compute the number of blocks occupied for each dimension. 
*/ @@ -78,6 +96,17 @@ protected: { return (int)((a+((1LL << LOG_BLOCK_SIZE)-1)) >> LOG_BLOCK_SIZE); } + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } }; CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 3f687224eee..67ffb6853d6 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -33,6 +33,7 @@ #include "util_stack_allocator.h" #include "util_simd.h" #include "util_time.h" +#include "util_queue.h" CCL_NAMESPACE_BEGIN @@ -99,7 +100,8 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_, prim_object(prim_object_), params(params_), progress(progress_), - progress_start_time(0.0) + progress_start_time(0.0), + unaligned_heuristic(objects_) { spatial_min_overlap = 0.0f; } @@ -112,70 +114,74 @@ BVHBuild::~BVHBuild() void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i) { - Attribute *attr_mP = NULL; - - if(mesh->has_motion_blur()) - attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + Attribute *attr_mP = NULL; - size_t num_triangles = mesh->num_triangles(); - for(uint j = 0; j < num_triangles; j++) { - Mesh::Triangle t = mesh->get_triangle(j); - BoundBox bounds = BoundBox::empty; - PrimitiveType type = PRIMITIVE_TRIANGLE; + if(mesh->has_motion_blur()) + attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + size_t num_triangles = mesh->num_triangles(); + for(uint j = 0; j < num_triangles; j++) { + Mesh::Triangle t = mesh->get_triangle(j); + BoundBox bounds = BoundBox::empty; + PrimitiveType type = PRIMITIVE_TRIANGLE; - t.bounds_grow(&mesh->verts[0], bounds); + t.bounds_grow(&mesh->verts[0], bounds); - /* motion triangles */ - if(attr_mP) { - size_t mesh_size = 
mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr_mP->data_float3(); + /* motion triangles */ + if(attr_mP) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr_mP->data_float3(); - for(size_t i = 0; i < steps; i++) - t.bounds_grow(vert_steps + i*mesh_size, bounds); + for(size_t i = 0; i < steps; i++) + t.bounds_grow(vert_steps + i*mesh_size, bounds); - type = PRIMITIVE_MOTION_TRIANGLE; - } + type = PRIMITIVE_MOTION_TRIANGLE; + } - if(bounds.valid()) { - references.push_back(BVHReference(bounds, j, i, type)); - root.grow(bounds); - center.grow(bounds.center2()); + if(bounds.valid()) { + references.push_back(BVHReference(bounds, j, i, type)); + root.grow(bounds); + center.grow(bounds.center2()); + } } } - Attribute *curve_attr_mP = NULL; + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + Attribute *curve_attr_mP = NULL; - if(mesh->has_motion_blur()) - curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(mesh->has_motion_blur()) + curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - size_t num_curves = mesh->num_curves(); - for(uint j = 0; j < num_curves; j++) { - Mesh::Curve curve = mesh->get_curve(j); - PrimitiveType type = PRIMITIVE_CURVE; + size_t num_curves = mesh->num_curves(); + for(uint j = 0; j < num_curves; j++) { + Mesh::Curve curve = mesh->get_curve(j); + PrimitiveType type = PRIMITIVE_CURVE; - for(int k = 0; k < curve.num_keys - 1; k++) { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds); + for(int k = 0; k < curve.num_keys - 1; k++) { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds); - /* motion curve */ - if(curve_attr_mP) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = curve_attr_mP->data_float3(); + /* 
motion curve */ + if(curve_attr_mP) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = curve_attr_mP->data_float3(); - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds); + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds); - type = PRIMITIVE_MOTION_CURVE; - } + type = PRIMITIVE_MOTION_CURVE; + } - if(bounds.valid()) { - int packed_type = PRIMITIVE_PACK_SEGMENT(type, k); - - references.push_back(BVHReference(bounds, j, i, packed_type)); - root.grow(bounds); - center.grow(bounds.center2()); + if(bounds.valid()) { + int packed_type = PRIMITIVE_PACK_SEGMENT(type, k); + + references.push_back(BVHReference(bounds, j, i, packed_type)); + root.grow(bounds); + center.grow(bounds.center2()); + } } } } @@ -209,15 +215,23 @@ void BVHBuild::add_references(BVHRange& root) continue; } if(!ob->mesh->is_instanced()) { - num_alloc_references += ob->mesh->num_triangles(); - num_alloc_references += count_curve_segments(ob->mesh); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + num_alloc_references += ob->mesh->num_triangles(); + } + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + num_alloc_references += count_curve_segments(ob->mesh); + } } else num_alloc_references++; } else { - num_alloc_references += ob->mesh->num_triangles(); - num_alloc_references += count_curve_segments(ob->mesh); + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) { + num_alloc_references += ob->mesh->num_triangles(); + } + if(params.primitive_mask & PRIMITIVE_ALL_CURVE) { + num_alloc_references += count_curve_segments(ob->mesh); + } } } @@ -340,6 +354,8 @@ BVHNode* BVHBuild::run() << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)) << "\n" << " Number of leaf nodes: " << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)) << "\n" + << " Number of unaligned 
nodes: " + << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_UNALIGNED_COUNT)) << "\n" << " Allocation slop factor: " << ((prim_type.capacity() != 0) ? (float)prim_type.size() / prim_type.capacity() @@ -445,10 +461,11 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) float leafSAH = params.sah_primitive_cost * range.leafSAH; float splitSAH = params.sah_node_cost * range.bounds().half_area() + params.sah_primitive_cost * range.splitSAH; - /* have at least one inner node on top level, for performance and correct - * visibility tests, since object instances do not check visibility flag */ + /* Have at least one inner node on top level, for performance and correct + * visibility tests, since object instances do not check visibility flag. + */ if(!(range.size() > 0 && params.top_level && level == 0)) { - /* make leaf node when threshold reached or SAH tells us */ + /* Make leaf node when threshold reached or SAH tells us. */ if((params.small_enough_for_leaf(size, level)) || (range_within_max_leaf_size(range, references) && leafSAH < splitSAH)) { @@ -456,28 +473,70 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) } } - /* perform split */ + BVHObjectBinning unaligned_range; + float unalignedSplitSAH = FLT_MAX; + float unalignedLeafSAH = FLT_MAX; + Transform aligned_space; + if(params.use_unaligned_nodes && + splitSAH > params.unaligned_split_threshold*leafSAH) + { + aligned_space = unaligned_heuristic.compute_aligned_space( + range, &references[0]); + unaligned_range = BVHObjectBinning(range, + &references[0], + &unaligned_heuristic, + &aligned_space); + unalignedSplitSAH = params.sah_node_cost * unaligned_range.unaligned_bounds().half_area() + + params.sah_primitive_cost * unaligned_range.splitSAH; + unalignedLeafSAH = params.sah_primitive_cost * unaligned_range.leafSAH; + if(!(range.size() > 0 && params.top_level && level == 0)) { + if(unalignedLeafSAH < unalignedSplitSAH && unalignedSplitSAH < splitSAH 
&& + range_within_max_leaf_size(range, references)) + { + return create_leaf_node(range, references); + } + } + } + + /* Perform split. */ BVHObjectBinning left, right; - range.split(&references[0], left, right); + if(unalignedSplitSAH < splitSAH) { + unaligned_range.split(&references[0], left, right); + } + else { + range.split(&references[0], left, right); + } - /* create inner node. */ - InnerNode *inner; + BoundBox bounds; + if(unalignedSplitSAH < splitSAH) { + bounds = unaligned_heuristic.compute_aligned_boundbox( + range, &references[0], aligned_space); + } + else { + bounds = range.bounds(); + } + /* Create inner node. */ + InnerNode *inner; if(range.size() < THREAD_TASK_SIZE) { /* local build */ BVHNode *leftnode = build_node(left, level + 1); BVHNode *rightnode = build_node(right, level + 1); - inner = new InnerNode(range.bounds(), leftnode, rightnode); + inner = new InnerNode(bounds, leftnode, rightnode); } else { - /* threaded build */ - inner = new InnerNode(range.bounds()); + /* Threaded build */ + inner = new InnerNode(bounds); task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true); task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true); } + if(unalignedSplitSAH < splitSAH) { + inner->set_aligned_space(aligned_space); + } + return inner; } @@ -516,16 +575,54 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, return create_leaf_node(range, *references); } } + float leafSAH = params.sah_primitive_cost * split.leafSAH; + float splitSAH = params.sah_node_cost * range.bounds().half_area() + + params.sah_primitive_cost * split.nodeSAH; + + BVHMixedSplit unaligned_split; + float unalignedSplitSAH = FLT_MAX; + /* float unalignedLeafSAH = FLT_MAX; */ + Transform aligned_space; + if(params.use_unaligned_nodes && + splitSAH > params.unaligned_split_threshold*leafSAH) + { + aligned_space = + unaligned_heuristic.compute_aligned_space(range, &references->at(0)); + unaligned_split = BVHMixedSplit(this, + storage, + range, + 
references, + level, + &unaligned_heuristic, + &aligned_space); + /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */ + unalignedSplitSAH = params.sah_node_cost * unaligned_split.bounds.half_area() + + params.sah_primitive_cost * unaligned_split.nodeSAH; + /* TOOD(sergey): Check we can create leaf already. */ + } /* Do split. */ BVHRange left, right; - split.split(this, left, right, range); + if(unalignedSplitSAH < splitSAH) { + unaligned_split.split(this, left, right, range); + } + else { + split.split(this, left, right, range); + } progress_total += left.size() + right.size() - range.size(); + BoundBox bounds; + if(unalignedSplitSAH < splitSAH) { + bounds = unaligned_heuristic.compute_aligned_boundbox( + range, &references->at(0), aligned_space); + } + else { + bounds = range.bounds(); + } + /* Create inner node. */ InnerNode *inner; - if(range.size() < THREAD_TASK_SIZE) { /* Local build. */ @@ -539,11 +636,11 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, /* Build right node. */ BVHNode *rightnode = build_node(right, ©, level + 1, thread_id); - inner = new InnerNode(range.bounds(), leftnode, rightnode); + inner = new InnerNode(bounds, leftnode, rightnode); } else { /* Threaded build. */ - inner = new InnerNode(range.bounds()); + inner = new InnerNode(bounds); task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 0, @@ -560,6 +657,10 @@ BVHNode* BVHBuild::build_node(const BVHRange& range, true); } + if(unalignedSplitSAH < splitSAH) { + inner->set_aligned_space(aligned_space); + } + return inner; } @@ -616,6 +717,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; + vector<BVHReference, LeafStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; /* TODO(sergey): In theory we should be able to store references. 
*/ typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; @@ -634,6 +736,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, const BVHReference& ref = references[range.start() + i]; if(ref.prim_index() != -1) { int type_index = bitscan(ref.prim_type() & PRIMITIVE_ALL); + p_ref[type_index].push_back(ref); p_type[type_index].push_back(ref.prim_type()); p_index[type_index].push_back(ref.prim_index()); p_object[type_index].push_back(ref.prim_object()); @@ -674,16 +777,38 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, if(num != 0) { assert(p_type[i].size() == p_index[i].size()); assert(p_type[i].size() == p_object[i].size()); + Transform aligned_space; + bool alignment_found = false; for(int j = 0; j < num; ++j) { const int index = start_index + j; local_prim_type[index] = p_type[i][j]; local_prim_index[index] = p_index[i][j]; local_prim_object[index] = p_object[i][j]; + if(params.use_unaligned_nodes && !alignment_found) { + alignment_found = + unaligned_heuristic.compute_aligned_space(p_ref[i][j], + &aligned_space); + } + } + LeafNode *leaf_node = new LeafNode(bounds[i], + visibility[i], + start_index, + start_index + num); + if(alignment_found) { + /* Need to recalculate leaf bounds with new alignment. */ + leaf_node->m_bounds = BoundBox::empty; + for(int j = 0; j < num; ++j) { + const BVHReference &ref = p_ref[i][j]; + BoundBox ref_bounds = + unaligned_heuristic.compute_aligned_prim_boundbox( + ref, + aligned_space); + leaf_node->m_bounds.grow(ref_bounds); + } + /* Set alignment space. */ + leaf_node->set_aligned_space(aligned_space); } - leaves[num_leaves++] = new LeafNode(bounds[i], - visibility[i], - start_index, - start_index + num); + leaves[num_leaves++] = leaf_node; start_index += num; } } @@ -765,6 +890,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, ++num_leaves; } + /* TODO(sergey): Need to take care of alignment when number of leaves + * is more than 1. 
+ */ if(num_leaves == 1) { /* Simplest case: single leaf, just return it. * In all the rest cases we'll be creating intermediate inner node with diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index a015b89d72f..64180349935 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -22,6 +22,7 @@ #include "bvh.h" #include "bvh_binning.h" +#include "bvh_unaligned.h" #include "util_boundbox.h" #include "util_task.h" @@ -59,13 +60,14 @@ protected: friend class BVHSpatialSplit; friend class BVHBuildTask; friend class BVHSpatialSplitBuildTask; + friend class BVHObjectBinning; - /* adding references */ + /* Adding references. */ void add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i); void add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i); void add_references(BVHRange& root); - /* building */ + /* Building. */ BVHNode *build_node(const BVHRange& range, vector<BVHReference> *references, int level, @@ -78,7 +80,7 @@ protected: bool range_within_max_leaf_size(const BVHRange& range, const vector<BVHReference>& references) const; - /* threads */ + /* Threads. */ enum { THREAD_TASK_SIZE = 4096 }; void thread_build_node(InnerNode *node, int child, @@ -92,41 +94,44 @@ protected: int thread_id); thread_mutex build_mutex; - /* progress */ + /* Progress. */ void progress_update(); - /* tree rotations */ + /* Tree rotations. */ void rotate(BVHNode *node, int max_depth); void rotate(BVHNode *node, int max_depth, int iterations); - /* objects and primitive references */ + /* Objects and primitive references. */ vector<Object*> objects; vector<BVHReference> references; int num_original_references; - /* output primitive indexes and objects */ + /* Output primitive indexes and objects. */ array<int>& prim_type; array<int>& prim_index; array<int>& prim_object; - /* build parameters */ + /* Build parameters. */ BVHParams params; - /* progress reporting */ + /* Progress reporting. 
*/ Progress& progress; double progress_start_time; size_t progress_count; size_t progress_total; size_t progress_original_total; - /* spatial splitting */ + /* Spatial splitting. */ float spatial_min_overlap; vector<BVHSpatialStorage> spatial_storage; size_t spatial_free_index; thread_spin_lock spatial_spin_lock; - /* threads */ + /* Threads. */ TaskPool task_pool; + + /* Unaligned building. */ + BVHUnaligned unaligned_heuristic; }; CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 8294690da7d..f5cd699bdf4 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -61,6 +61,76 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } } return cnt; + case BVH_STAT_ALIGNED_COUNT: + if(!is_unaligned()) { + cnt = 1; + } + break; + case BVH_STAT_UNALIGNED_COUNT: + if(is_unaligned()) { + cnt = 1; + } + break; + case BVH_STAT_ALIGNED_INNER_COUNT: + if(!is_leaf()) { + bool has_unaligned = false; + for(int j = 0; j < num_children(); j++) { + has_unaligned |= get_child(j)->is_unaligned(); + } + cnt += has_unaligned? 0: 1; + } + break; + case BVH_STAT_UNALIGNED_INNER_COUNT: + if(!is_leaf()) { + bool has_unaligned = false; + for(int j = 0; j < num_children(); j++) { + has_unaligned |= get_child(j)->is_unaligned(); + } + cnt += has_unaligned? 1: 0; + } + break; + case BVH_STAT_ALIGNED_INNER_QNODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned(); + } + else { + for(int j = 0; j < node->num_children(); j++) { + cnt += node->get_child(j)->getSubtreeSize(stat); + has_unaligned |= node->get_child(j)->is_unaligned(); + } + } + } + cnt += has_unaligned? 
0: 1; + } + return cnt; + case BVH_STAT_UNALIGNED_INNER_QNODE_COUNT: + { + bool has_unaligned = false; + for(int i = 0; i < num_children(); i++) { + BVHNode *node = get_child(i); + if(node->is_leaf()) { + has_unaligned |= node->is_unaligned(); + } + else { + for(int j = 0; j < node->num_children(); j++) { + cnt += node->get_child(j)->getSubtreeSize(stat); + has_unaligned |= node->get_child(j)->is_unaligned(); + } + } + } + cnt += has_unaligned? 1: 0; + } + return cnt; + case BVH_STAT_ALIGNED_LEAF_COUNT: + cnt = (is_leaf() && !is_unaligned()) ? 1 : 0; + break; + case BVH_STAT_UNALIGNED_LEAF_COUNT: + cnt = (is_leaf() && is_unaligned()) ? 1 : 0; + break; default: assert(0); /* unknown mode */ } diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index d476fb917ed..f2965a785e6 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -31,6 +31,14 @@ enum BVH_STAT { BVH_STAT_TRIANGLE_COUNT, BVH_STAT_CHILDNODE_COUNT, BVH_STAT_QNODE_COUNT, + BVH_STAT_ALIGNED_COUNT, + BVH_STAT_UNALIGNED_COUNT, + BVH_STAT_ALIGNED_INNER_COUNT, + BVH_STAT_UNALIGNED_INNER_COUNT, + BVH_STAT_ALIGNED_INNER_QNODE_COUNT, + BVH_STAT_UNALIGNED_INNER_QNODE_COUNT, + BVH_STAT_ALIGNED_LEAF_COUNT, + BVH_STAT_UNALIGNED_LEAF_COUNT, }; class BVHParams; @@ -38,16 +46,41 @@ class BVHParams; class BVHNode { public: - BVHNode() + BVHNode() : m_is_unaligned(false), + m_aligned_space(NULL) { } - virtual ~BVHNode() {} + virtual ~BVHNode() + { + delete m_aligned_space; + } + virtual bool is_leaf() const = 0; virtual int num_children() const = 0; virtual BVHNode *get_child(int i) const = 0; virtual int num_triangles() const { return 0; } virtual void print(int depth = 0) const = 0; + bool is_unaligned() const { return m_is_unaligned; } + + inline void set_aligned_space(const Transform& aligned_space) + { + m_is_unaligned = true; + if (m_aligned_space == NULL) { + m_aligned_space = new Transform(aligned_space); + } + else { + *m_aligned_space = aligned_space; + } + } + + 
inline Transform get_aligned_space() const + { + if(m_aligned_space == NULL) { + return transform_identity(); + } + return *m_aligned_space; + } BoundBox m_bounds; uint m_visibility; @@ -58,12 +91,20 @@ public: void deleteSubtree(); uint update_visibility(); + + bool m_is_unaligned; + + // TODO(sergey): Can be stored as 3x3 matrix, but better to have some + // utilities and type defines in util_transform first. + Transform *m_aligned_space; }; class InnerNode : public BVHNode { public: - InnerNode(const BoundBox& bounds, BVHNode* child0, BVHNode* child1) + InnerNode(const BoundBox& bounds, + BVHNode* child0, + BVHNode* child1) { m_bounds = bounds; children[0] = child0; diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index cf683df1b31..2e698a80742 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -20,6 +20,8 @@ #include "util_boundbox.h" +#include "kernel_types.h" + CCL_NAMESPACE_BEGIN /* BVH Parameters */ @@ -31,6 +33,9 @@ public: bool use_spatial_split; float spatial_split_alpha; + /* Unaligned nodes creation threshold */ + float unaligned_split_threshold; + /* SAH costs */ float sah_node_cost; float sah_primitive_cost; @@ -46,6 +51,14 @@ public: /* QBVH */ bool use_qbvh; + /* Mask of primitives to be included into the BVH. */ + int primitive_mask; + + /* Use unaligned bounding boxes. + * Only used for curves BVH. + */ + bool use_unaligned_nodes; + /* fixed parameters */ enum { MAX_DEPTH = 64, @@ -58,6 +71,8 @@ public: use_spatial_split = true; spatial_split_alpha = 1e-5f; + unaligned_split_threshold = 0.7f; + /* todo: see if splitting up primitive cost to be separate for triangles * and curves can help. so far in tests it doesn't help, but why? 
*/ sah_node_cost = 1.0f; @@ -69,6 +84,9 @@ public: top_level = false; use_qbvh = false; + use_unaligned_nodes = false; + + primitive_mask = PRIMITIVE_ALL; } /* SAH costs */ diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index e9032c61c3b..e5bcf9995bf 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -26,23 +26,27 @@ CCL_NAMESPACE_BEGIN static const int BVH_SORT_THRESHOLD = 4096; -/* Silly workaround for float extended precision that happens when compiling - * on x86, due to one float staying in 80 bit precision register and the other - * not, which causes the strictly weak ordering to break. - */ -#if !defined(__i386__) -# define NO_EXTENDED_PRECISION -#else -# define NO_EXTENDED_PRECISION volatile -#endif - struct BVHReferenceCompare { public: int dim; + const BVHUnaligned *unaligned_heuristic; + const Transform *aligned_space; + + BVHReferenceCompare(int dim, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) + : dim(dim), + unaligned_heuristic(unaligned_heuristic), + aligned_space(aligned_space) + { + } - explicit BVHReferenceCompare(int dim_) + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const { - dim = dim_; + return (aligned_space != NULL) + ? unaligned_heuristic->compute_aligned_prim_boundbox( + prim, *aligned_space) + : prim.bounds(); } /* Compare two references. 
@@ -52,8 +56,10 @@ public: __forceinline int compare(const BVHReference& ra, const BVHReference& rb) const { - NO_EXTENDED_PRECISION float ca = ra.bounds().min[dim] + ra.bounds().max[dim]; - NO_EXTENDED_PRECISION float cb = rb.bounds().min[dim] + rb.bounds().max[dim]; + BoundBox ra_bounds = get_prim_bounds(ra), + rb_bounds = get_prim_bounds(rb); + float ca = ra_bounds.min[dim] + ra_bounds.max[dim]; + float cb = rb_bounds.min[dim] + rb_bounds.max[dim]; if(ca < cb) return -1; else if(ca > cb) return 1; @@ -171,10 +177,15 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool, } } -void bvh_reference_sort(int start, int end, BVHReference *data, int dim) +void bvh_reference_sort(int start, + int end, + BVHReference *data, + int dim, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) { const int count = end - start; - BVHReferenceCompare compare(dim); + BVHReferenceCompare compare(dim, unaligned_heuristic, aligned_space); if(count < BVH_SORT_THRESHOLD) { /* It is important to not use any mutex if array is small enough, * otherwise we end up in situation when we're going to sleep far diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index 18aafb5f1ff..b49ca02eb60 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -20,7 +20,15 @@ CCL_NAMESPACE_BEGIN -void bvh_reference_sort(int start, int end, BVHReference *data, int dim); +class BVHUnaligned; +struct Transform; + +void bvh_reference_sort(int start, + int end, + BVHReference *data, + int dim, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index bf68b41021f..d0d5fbe5a7a 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -32,14 +32,18 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> 
*references, - float nodeSAH) + float nodeSAH, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) : sah(FLT_MAX), dim(0), num_left(0), left_bounds(BoundBox::empty), right_bounds(BoundBox::empty), storage_(storage), - references_(references) + references_(references), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { const BVHReference *ref_ptr = &references_->at(range.start()); float min_sah = FLT_MAX; @@ -51,12 +55,15 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, bvh_reference_sort(range.start(), range.end(), &references_->at(0), - dim); + dim, + unaligned_heuristic_, + aligned_space_); /* sweep right to left and determine bounds. */ BoundBox right_bounds = BoundBox::empty; for(int i = range.size() - 1; i > 0; i--) { - right_bounds.grow(ref_ptr[i].bounds()); + BoundBox prim_bounds = get_prim_bounds(ref_ptr[i]); + right_bounds.grow(prim_bounds); storage_->right_bounds[i - 1] = right_bounds; } @@ -64,7 +71,8 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, BoundBox left_bounds = BoundBox::empty; for(int i = 1; i < range.size(); i++) { - left_bounds.grow(ref_ptr[i - 1].bounds()); + BoundBox prim_bounds = get_prim_bounds(ref_ptr[i - 1]); + left_bounds.grow(prim_bounds); right_bounds = storage_->right_bounds[i - 1]; float sah = nodeSAH + @@ -88,16 +96,37 @@ void BVHObjectSplit::split(BVHRange& left, BVHRange& right, const BVHRange& range) { + assert(references_->size() > 0); /* sort references according to split */ bvh_reference_sort(range.start(), range.end(), &references_->at(0), - this->dim); + this->dim, + unaligned_heuristic_, + aligned_space_); + + BoundBox effective_left_bounds, effective_right_bounds; + const int num_right = range.size() - this->num_left; + if(aligned_space_ == NULL) { + effective_left_bounds = left_bounds; + effective_right_bounds = right_bounds; + } + else { + effective_left_bounds = BoundBox::empty; + effective_right_bounds = BoundBox::empty; + for(int i = 0; i < 
this->num_left; ++i) { + BoundBox prim_boundbox = references_->at(range.start() + i).bounds(); + effective_left_bounds.grow(prim_boundbox); + } + for(int i = 0; i < num_right; ++i) { + BoundBox prim_boundbox = references_->at(range.start() + this->num_left + i).bounds(); + effective_right_bounds.grow(prim_boundbox); + } + } /* split node ranges */ - left = BVHRange(this->left_bounds, range.start(), this->num_left); - right = BVHRange(this->right_bounds, left.end(), range.size() - this->num_left); - + left = BVHRange(effective_left_bounds, range.start(), this->num_left); + right = BVHRange(effective_right_bounds, left.end(), num_right); } /* Spatial Split */ @@ -106,16 +135,31 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH) + float nodeSAH, + const BVHUnaligned *unaligned_heuristic, + const Transform *aligned_space) : sah(FLT_MAX), dim(0), pos(0.0f), storage_(storage), - references_(references) + references_(references), + unaligned_heuristic_(unaligned_heuristic), + aligned_space_(aligned_space) { /* initialize bins. */ - float3 origin = range.bounds().min; - float3 binSize = (range.bounds().max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS); + BoundBox range_bounds; + if(aligned_space == NULL) { + range_bounds = range.bounds(); + } + else { + range_bounds = unaligned_heuristic->compute_aligned_boundbox( + range, + &references->at(0), + *aligned_space); + } + + float3 origin = range_bounds.min; + float3 binSize = (range_bounds.max - origin) * (1.0f / (float)BVHParams::NUM_SPATIAL_BINS); float3 invBinSize = 1.0f / binSize; for(int dim = 0; dim < 3; dim++) { @@ -131,8 +175,9 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, /* chop references into bins. 
*/ for(unsigned int refIdx = range.start(); refIdx < range.end(); refIdx++) { const BVHReference& ref = references_->at(refIdx); - float3 firstBinf = (ref.bounds().min - origin) * invBinSize; - float3 lastBinf = (ref.bounds().max - origin) * invBinSize; + BoundBox prim_bounds = get_prim_bounds(ref); + float3 firstBinf = (prim_bounds.min - origin) * invBinSize; + float3 lastBinf = (prim_bounds.max - origin) * invBinSize; int3 firstBin = make_int3((int)firstBinf.x, (int)firstBinf.y, (int)firstBinf.z); int3 lastBin = make_int3((int)lastBinf.x, (int)lastBinf.y, (int)lastBinf.z); @@ -140,7 +185,10 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild& builder, lastBin = clamp(lastBin, firstBin, BVHParams::NUM_SPATIAL_BINS - 1); for(int dim = 0; dim < 3; dim++) { - BVHReference currRef = ref; + BVHReference currRef(get_prim_bounds(ref), + ref.prim_index(), + ref.prim_object(), + ref.prim_type()); for(int i = firstBin[dim]; i < lastBin[dim]; i++) { BVHReference leftRef, rightRef; @@ -209,14 +257,15 @@ void BVHSpatialSplit::split(BVHBuild *builder, BoundBox right_bounds = BoundBox::empty; for(int i = left_end; i < right_start; i++) { - if(refs[i].bounds().max[this->dim] <= this->pos) { + BoundBox prim_bounds = get_prim_bounds(refs[i]); + if(prim_bounds.max[this->dim] <= this->pos) { /* entirely on the left-hand side */ - left_bounds.grow(refs[i].bounds()); + left_bounds.grow(prim_bounds); swap(refs[i], refs[left_end++]); } - else if(refs[i].bounds().min[this->dim] >= this->pos) { + else if(prim_bounds.min[this->dim] >= this->pos) { /* entirely on the right-hand side */ - right_bounds.grow(refs[i].bounds()); + right_bounds.grow(prim_bounds); swap(refs[i--], refs[--right_start]); } } @@ -231,8 +280,12 @@ void BVHSpatialSplit::split(BVHBuild *builder, new_refs.reserve(right_start - left_end); while(left_end < right_start) { /* split reference. 
*/ + BVHReference curr_ref(get_prim_bounds(refs[left_end]), + refs[left_end].prim_index(), + refs[left_end].prim_object(), + refs[left_end].prim_type()); BVHReference lref, rref; - split_reference(*builder, lref, rref, refs[left_end], this->dim, this->pos); + split_reference(*builder, lref, rref, curr_ref, this->dim, this->pos); /* compute SAH for duplicate/unsplit candidates. */ BoundBox lub = left_bounds; // Unsplit to left: new left-hand bounds. @@ -240,8 +293,8 @@ void BVHSpatialSplit::split(BVHBuild *builder, BoundBox ldb = left_bounds; // Duplicate: new left-hand bounds. BoundBox rdb = right_bounds; // Duplicate: new right-hand bounds. - lub.grow(refs[left_end].bounds()); - rub.grow(refs[left_end].bounds()); + lub.grow(curr_ref.bounds()); + rub.grow(curr_ref.bounds()); ldb.grow(lref.bounds()); rdb.grow(rref.bounds()); @@ -280,6 +333,17 @@ void BVHSpatialSplit::split(BVHBuild *builder, new_refs.begin(), new_refs.end()); } + if(aligned_space_ != NULL) { + left_bounds = right_bounds = BoundBox::empty; + for(int i = left_start; i < left_end - left_start; ++i) { + BoundBox prim_boundbox = references_->at(i).bounds(); + left_bounds.grow(prim_boundbox); + } + for(int i = right_start; i < right_end - right_start; ++i) { + BoundBox prim_boundbox = references_->at(i).bounds(); + right_bounds.grow(prim_boundbox); + } + } left = BVHRange(left_bounds, left_start, left_end - left_start); right = BVHRange(right_bounds, right_start, right_end - right_start); } @@ -295,11 +359,13 @@ void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh, Mesh::Triangle t = mesh->get_triangle(prim_index); const float3 *verts = &mesh->verts[0]; float3 v1 = tfm ? transform_point(tfm, verts[t.v[2]]) : verts[t.v[2]]; + v1 = get_unaligned_point(v1); for(int i = 0; i < 3; i++) { float3 v0 = v1; int vindex = t.v[i]; v1 = tfm ? 
transform_point(tfm, verts[vindex]) : verts[vindex]; + v1 = get_unaligned_point(v1); float v0p = v0[dim]; float v1p = v1[dim]; @@ -339,6 +405,8 @@ void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh, v0 = transform_point(tfm, v0); v1 = transform_point(tfm, v1); } + v0 = get_unaligned_point(v0); + v1 = get_unaligned_point(v1); float v0p = v0[dim]; float v1p = v1[dim]; @@ -473,6 +541,7 @@ void BVHSpatialSplit::split_reference(const BVHBuild& builder, /* intersect with original bounds. */ left_bounds.max[dim] = pos; right_bounds.min[dim] = pos; + left_bounds.intersect(ref.bounds()); right_bounds.intersect(ref.bounds()); diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index aea8b2565e0..dbdb51f1a5b 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -24,6 +24,7 @@ CCL_NAMESPACE_BEGIN class BVHBuild; +struct Transform; /* Object Split */ @@ -41,7 +42,9 @@ public: BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH); + float nodeSAH, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); void split(BVHRange& left, BVHRange& right, @@ -50,6 +53,19 @@ public: protected: BVHSpatialStorage *storage_; vector<BVHReference> *references_; + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } }; /* Spatial Split */ @@ -70,7 +86,9 @@ public: BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - float nodeSAH); + float nodeSAH, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL); void split(BVHBuild *builder, BVHRange& left, @@ -87,6 +105,8 @@ public: protected: BVHSpatialStorage *storage_; 
vector<BVHReference> *references_; + const BVHUnaligned *unaligned_heuristic_; + const Transform *aligned_space_; /* Lower-level functions which calculates boundaries of left and right nodes * needed for spatial split. @@ -132,6 +152,27 @@ protected: float pos, BoundBox& left_bounds, BoundBox& right_bounds); + + __forceinline BoundBox get_prim_bounds(const BVHReference& prim) const + { + if(aligned_space_ == NULL) { + return prim.bounds(); + } + else { + return unaligned_heuristic_->compute_aligned_prim_boundbox( + prim, *aligned_space_); + } + } + + __forceinline float3 get_unaligned_point(const float3& point) const + { + if(aligned_space_ == NULL) { + return point; + } + else { + return transform_point(aligned_space_, point); + } + } }; /* Mixed Object-Spatial Split */ @@ -148,19 +189,40 @@ public: bool no_split; + BoundBox bounds; + + BVHMixedSplit() {} + __forceinline BVHMixedSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange& range, vector<BVHReference> *references, - int level) + int level, + const BVHUnaligned *unaligned_heuristic = NULL, + const Transform *aligned_space = NULL) { + if(aligned_space == NULL) { + bounds = range.bounds(); + } + else { + bounds = unaligned_heuristic->compute_aligned_boundbox( + range, + &references->at(0), + *aligned_space); + } /* find split candidates. 
*/ - float area = range.bounds().safe_area(); + float area = bounds.safe_area(); leafSAH = area * builder->params.primitive_cost(range.size()); nodeSAH = area * builder->params.node_cost(2); - object = BVHObjectSplit(builder, storage, range, references, nodeSAH); + object = BVHObjectSplit(builder, + storage, + range, + references, + nodeSAH, + unaligned_heuristic, + aligned_space); if(builder->params.use_spatial_split && level < BVHParams::MAX_SPATIAL_DEPTH) { BoundBox overlap = object.left_bounds; @@ -171,7 +233,9 @@ public: storage, range, references, - nodeSAH); + nodeSAH, + unaligned_heuristic, + aligned_space); } } @@ -181,7 +245,10 @@ public: builder->range_within_max_leaf_size(range, *references)); } - __forceinline void split(BVHBuild *builder, BVHRange& left, BVHRange& right, const BVHRange& range) + __forceinline void split(BVHBuild *builder, + BVHRange& left, + BVHRange& right, + const BVHRange& range) { if(builder->params.use_spatial_split && minSAH == spatial.sah) spatial.split(builder, left, right, range); @@ -193,4 +260,3 @@ public: CCL_NAMESPACE_END #endif /* __BVH_SPLIT_H__ */ - diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp new file mode 100644 index 00000000000..a876c670914 --- /dev/null +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -0,0 +1,178 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "bvh_unaligned.h" + +#include "mesh.h" +#include "object.h" + +#include "bvh_binning.h" +#include "bvh_params.h" + +#include "util_boundbox.h" +#include "util_debug.h" +#include "util_transform.h" + +CCL_NAMESPACE_BEGIN + + +BVHUnaligned::BVHUnaligned(const vector<Object*>& objects) + : objects_(objects) +{ +} + +Transform BVHUnaligned::compute_aligned_space( + const BVHObjectBinning& range, + const BVHReference *references) const +{ + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + Transform aligned_space; + /* Use first primitive which defines correct direction to define + * the orientation space. + */ + if(compute_aligned_space(ref, &aligned_space)) { + return aligned_space; + } + } + return transform_identity(); +} + +Transform BVHUnaligned::compute_aligned_space( + const BVHRange& range, + const BVHReference *references) const +{ + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + Transform aligned_space; + /* Use first primitive which defines correct direction to define + * the orientation space. 
+ */ + if(compute_aligned_space(ref, &aligned_space)) { + return aligned_space; + } + } + return transform_identity(); +} + +bool BVHUnaligned::compute_aligned_space(const BVHReference& ref, + Transform *aligned_space) const +{ + const Object *object = objects_[ref.prim_object()]; + const int packed_type = ref.prim_type(); + const int type = (packed_type & PRIMITIVE_ALL); + if(type & PRIMITIVE_CURVE) { + const int curve_index = ref.prim_index(); + const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); + const Mesh *mesh = object->mesh; + const Mesh::Curve& curve = mesh->get_curve(curve_index); + const int key = curve.first_key + segment; + const float3 v1 = mesh->curve_keys[key], + v2 = mesh->curve_keys[key + 1]; + float length; + const float3 axis = normalize_len(v2 - v1, &length); + if(length > 1e-6f) { + *aligned_space = make_transform_frame(axis); + return true; + } + } + *aligned_space = transform_identity(); + return false; +} + +BoundBox BVHUnaligned::compute_aligned_prim_boundbox( + const BVHReference& prim, + const Transform& aligned_space) const +{ + BoundBox bounds = BoundBox::empty; + const Object *object = objects_[prim.prim_object()]; + const int packed_type = prim.prim_type(); + const int type = (packed_type & PRIMITIVE_ALL); + if(type & PRIMITIVE_CURVE) { + const int curve_index = prim.prim_index(); + const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); + const Mesh *mesh = object->mesh; + const Mesh::Curve& curve = mesh->get_curve(curve_index); + curve.bounds_grow(segment, + &mesh->curve_keys[0], + &mesh->curve_radius[0], + aligned_space, + bounds); + } + else { + bounds = prim.bounds().transformed(&aligned_space); + } + return bounds; +} + +BoundBox BVHUnaligned::compute_aligned_boundbox( + const BVHObjectBinning& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds) const +{ + BoundBox bounds = BoundBox::empty; + if(cent_bounds != NULL) { + *cent_bounds = BoundBox::empty; + } + for(int i = 
range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + BoundBox ref_bounds = compute_aligned_prim_boundbox(ref, aligned_space); + bounds.grow(ref_bounds); + if(cent_bounds != NULL) { + cent_bounds->grow(ref_bounds.center2()); + } + } + return bounds; +} + +BoundBox BVHUnaligned::compute_aligned_boundbox( + const BVHRange& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds) const +{ + BoundBox bounds = BoundBox::empty; + if(cent_bounds != NULL) { + *cent_bounds = BoundBox::empty; + } + for(int i = range.start(); i < range.end(); ++i) { + const BVHReference& ref = references[i]; + BoundBox ref_bounds = compute_aligned_prim_boundbox(ref, aligned_space); + bounds.grow(ref_bounds); + if(cent_bounds != NULL) { + cent_bounds->grow(ref_bounds.center2()); + } + } + return bounds; +} + +Transform BVHUnaligned::compute_node_transform( + const BoundBox& bounds, + const Transform& aligned_space) +{ + Transform space = aligned_space; + space.x.w -= bounds.min.x; + space.y.w -= bounds.min.y; + space.z.w -= bounds.min.z; + float3 dim = bounds.max - bounds.min; + return transform_scale(1.0f / max(1e-18f, dim.x), + 1.0f / max(1e-18f, dim.y), + 1.0f / max(1e-18f, dim.z)) * space; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h new file mode 100644 index 00000000000..4d0872f4a39 --- /dev/null +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -0,0 +1,81 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH_UNALIGNED_H__ +#define __BVH_UNALIGNED_H__ + +#include "util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BoundBox; +class BVHObjectBinning; +class BVHRange; +class BVHReference; +struct Transform; +class Object; + +/* Helper class to perform calculations needed for unaligned nodes. */ +class BVHUnaligned { +public: + BVHUnaligned(const vector<Object*>& objects); + + /* Calculate alignment for the oriented node for a given range. */ + Transform compute_aligned_space( + const BVHObjectBinning& range, + const BVHReference *references) const; + Transform compute_aligned_space( + const BVHRange& range, + const BVHReference *references) const; + + /* Calculate alignment for the oriented node for a given reference. + * + * Return true when space was calculated successfully. + */ + bool compute_aligned_space(const BVHReference& ref, + Transform *aligned_space) const; + + /* Calculate primitive's bounding box in given space. */ + BoundBox compute_aligned_prim_boundbox( + const BVHReference& prim, + const Transform& aligned_space) const; + + /* Calculate bounding box in given space. */ + BoundBox compute_aligned_boundbox( + const BVHObjectBinning& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds = NULL) const; + BoundBox compute_aligned_boundbox( + const BVHRange& range, + const BVHReference *references, + const Transform& aligned_space, + BoundBox *cent_bounds = NULL) const; + + /* Calculate affine transform for node packing. + * Bounds will be in the range of 0..1. + */ + static Transform compute_node_transform(const BoundBox& bounds, + const Transform& aligned_space); +protected: + /* List of objects BVH is being created for. 
*/ + const vector<Object*>& objects_; +}; + +CCL_NAMESPACE_END + +#endif /* __BVH_UNALIGNED_H__ */ + diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index f0adbc03e22..bd3969b2889 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -28,6 +28,22 @@ set(SRC kernels/cuda/kernel.cu ) +set(SRC_BVH_HEADERS + bvh/bvh.h + bvh/bvh_nodes.h + bvh/bvh_shadow_all.h + bvh/bvh_subsurface.h + bvh/bvh_traversal.h + bvh/bvh_volume.h + bvh/bvh_volume_all.h + bvh/qbvh_nodes.h + bvh/qbvh_shadow_all.h + bvh/qbvh_subsurface.h + bvh/qbvh_traversal.h + bvh/qbvh_volume.h + bvh/qbvh_volume_all.h +) + set(SRC_HEADERS kernel_accumulate.h kernel_bake.h @@ -140,23 +156,11 @@ set(SRC_SVM_HEADERS set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h - geom/geom_bvh.h - geom/geom_bvh_shadow.h - geom/geom_bvh_subsurface.h - geom/geom_bvh_traversal.h - geom/geom_bvh_volume.h - geom/geom_bvh_volume_all.h geom/geom_curve.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_object.h geom/geom_primitive.h - geom/geom_qbvh.h - geom/geom_qbvh_shadow.h - geom/geom_qbvh_subsurface.h - geom/geom_qbvh_traversal.h - geom/geom_qbvh_volume.h - geom/geom_qbvh_volume_all.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h @@ -212,7 +216,14 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS}) + set(cuda_sources kernels/cuda/kernel.cu + ${SRC_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch experimental) @@ -312,6 +323,7 @@ add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -346,6 +358,7 @@ 
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteratio delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/bvh/bvh.h index d0eedd3396a..59881738195 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -35,6 +35,13 @@ CCL_NAMESPACE_BEGIN # define ccl_device_intersect ccl_device_inline #endif +/* bottom-most stack entry, indicating the end of traversal */ +#define ENTRYPOINT_SENTINEL 0x76543210 + +/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ +#define BVH_STACK_SIZE 192 +#define BVH_QSTACK_SIZE 384 + /* BVH intersection function variations */ #define BVH_INSTANCING 1 @@ -72,71 +79,73 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. 
*/ #ifdef __QBVH__ -# include "geom_qbvh.h" +# include "qbvh_nodes.h" #endif /* Regular BVH traversal */ +#include "bvh_nodes.h" + #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "geom_bvh_traversal.h" +#include "bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "geom_bvh_traversal.h" +# include "bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_subsurface.h" #endif #if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion -# define BVH_FUNCTION_FEATURES BVH_MOTION -# include "geom_bvh_subsurface.h" +# define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR +# include "bvh_subsurface.h" #endif /* Volume BVH traversal */ #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume -# define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__INSTANCING__) # define 
BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume.h" #endif #if defined(__VOLUME__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume.h" #endif /* Record all intersections - Shadow BVH traversal */ @@ -144,51 +153,51 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif #if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "geom_bvh_shadow.h" +# include "bvh_shadow_all.h" #endif /* Record all intersections - Volume BVH traversal */ #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all -# define BVH_FUNCTION_FEATURES 0 -# include 
"geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR +# include "bvh_volume_all.h" #endif #if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "geom_bvh_volume_all.h" +# define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR +# include "bvh_volume_all.h" #endif #undef BVH_FEATURE diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h new file mode 100644 index 00000000000..db2275b0ff8 --- /dev/null +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -0,0 +1,656 @@ +/* + * Copyright 2011-2016, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and +// 3-vector which might be faster. 
+ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, + int node_addr, + int child) +{ + Transform space; + const int child_addr = node_addr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); + space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); + return space; +} + +#if !defined(__KERNEL_SSE2__) +ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? 
*/ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); + float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); + float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + + if(difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + 
extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +#ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); +#else + return ((c0max >= c0min)? 1: 0) | + ((c1max >= c1min)? 2: 0); +#endif +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 lower_xyz = aligned_P * nrdir; + float3 upper_xyz = lower_xyz - nrdir; + const float near_x = min(lower_xyz.x, upper_xyz.x); + const float near_y = min(lower_xyz.y, upper_xyz.y); + const float near_z = min(lower_xyz.z, upper_xyz.z); + const float far_x = max(lower_xyz.x, upper_xyz.x); + const float far_y = max(lower_xyz.y, upper_xyz.y); + const float far_z = max(lower_xyz.z, upper_xyz.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + return tnear <= tfar; +} + +ccl_device_inline bool bvh_unaligned_node_intersect_child_robust( + KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + int node_addr, + int child, + float dist[2]) +{ + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); + const float near_y = min(tLowerXYZ.y, 
tUpperXYZ.y); + const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); + const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); + const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); + const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + if(difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down*tnear <= round_up*tfar; + } + else { + return tnear <= tfar; + } +} + +ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.x) & visibility)) +#endif + { + mask |= 1; + } + } + if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(cnodes.y) & 
visibility)) +#endif + { + mask |= 2; + } + } + return mask; +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + idir, + t, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + idir, + t, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + idir, + t, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_inline bvh_aligned_node_intersect( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], 
shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_aligned_node_intersect_robust( + KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) +{ + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if(difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + float4 *tminmaxview = (float4*)&tminmax; + 
float& c0min = tminmaxview->x, &c1min = tminmaxview->y; + float& c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + 
upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask = tnear <= tfar; + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) +{ + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir);; + float3 aligned_P0 = transform_point(&space0, P), + aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, + aligned_P1.x * nrdir1.x, + 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, + aligned_P1.y * nrdir1.y, + 0.0f, + 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, + aligned_P1.z * nrdir1.z, + 0.0f, + 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 
0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + sseb vmask; + if(difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down*tnear <= round_up*tfar; + } + else { + vmask = tnear <= tfar; + } + + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 
2: 0); + return cmask; +# else + return mask & 3; +# endif +} + +ccl_device_inline int bvh_node_intersect(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, + P, + dir, + isect_near, + isect_far, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } +} + +ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg, + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) +{ + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust(kg, + P, + dir, + isect_near, + isect_far, + difl, + node_addr, + visibility, + dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } +} +#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 4005489f77d..1869457f0c3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_shadow.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_shadow.h" +# include "qbvh_shadow_all.h" +#endif + +#if 
BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function, where various features can be @@ -41,14 +47,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -72,9 +78,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -93,130 +102,87 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_ahild1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = 
kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, # endif - + idir, + isect_t, + node_addr, + PATH_RAY_SHADOW, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? 
*/ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, # endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_SHADOW, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_ahild1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_ahild1; + node_addr_ahild1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_ahild1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_ahild1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -226,22 +192,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, 
PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -253,6 +254,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* shadow ray early termination */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. */ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -283,23 +287,20 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - - isect_array->t = isect_t; + /* Move on to next entry in intersections array */ + isect_array++; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -317,21 +318,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = 
kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -369,15 +373,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } @@ -410,3 +417,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 915e9415c93..18978efcfa3 100644 --- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_subsurface.h" +# include "qbvh_subsurface.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for subsurface scattering, where @@ -44,12 +50,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal 
variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* ray parameters in registers */ float3 P = ray->P; @@ -84,6 +90,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,127 +109,94 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* traversal loop */ do { - do - { + do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) - { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - 
NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + 
tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + PATH_RAY_ALL_VISIBILITY, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -230,15 +206,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -251,8 +227,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, @@ 
-286,3 +262,4 @@ ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index ae919ef3f86..68a11b65ad7 100644 --- a/intern/cycles/kernel/geom/geom_bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,15 @@ */ #ifdef __QBVH__ -# include "geom_qbvh_traversal.h" +# include "qbvh_traversal.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +# define NODE_INTERSECT_ROBUST bvh_node_intersect_robust +#else +# define NODE_INTERSECT bvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST bvh_aligned_node_intersect_robust #endif /* This is a template BVH traversal function, where various features can be @@ -49,14 +57,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, * - likely and unlikely for if() statements * - test restrict attribute for pointers */ - + /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -79,9 +87,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -100,174 +111,148 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse 
internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - # if 
BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); -# endif - + { + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); + } #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { - float4 *tminmaxview = (float4*)&tminmax; - float 
&c0min = tminmaxview->x, &c1min = tminmaxview->y; - float &c0max = tminmaxview->z, &c1max = tminmaxview->w; - - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } + traverse_mask = NODE_INTERSECT_ROBUST(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); } + else # endif - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility); - traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility); -# else - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); -# endif + { + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); + } #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. 
*/ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. */ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } BVH_DEBUG_NEXT_STEP(); } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + 
visibility, + object, + prim_addr)) + { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif #else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -278,15 +263,26 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) + { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -299,20 +295,47 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { + for(; prim_addr < prim_addr2; prim_addr++) { BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); + if(kernel_data.curve.curveflags & 
CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif # else if(visibility == PATH_RAY_SHADOW_OPAQUE) return true; @@ -327,7 +350,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); @@ -342,24 +365,27 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); BVH_DEBUG_NEXT_INSTANCE(); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -376,16 +402,19 @@ ccl_device bool 
BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -433,3 +462,5 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index f3edf85d723..03499e94347 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume.h" +# include "qbvh_volume.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -43,12 +49,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ float3 P = ray->P; @@ -69,9 +75,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t 
shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect->t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -90,143 +99,124 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) 
* idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect->t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); - - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool 
closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr); } break; } @@ -239,7 +229,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -258,29 +248,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = 
ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ @@ -298,16 +291,19 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect->t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } @@ -337,3 +333,4 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index ec837212471..b5405e8e57b 100644 --- a/intern/cycles/kernel/geom/geom_bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,13 @@ */ #ifdef __QBVH__ -#include "geom_qbvh_volume_all.h" +# include "qbvh_volume_all.h" +#endif + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT bvh_node_intersect +#else +# define NODE_INTERSECT bvh_aligned_node_intersect #endif /* This is a template BVH traversal function for volumes, where @@ -44,12 +50,12 @@ 
ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, */ /* traversal stack in CUDA thread-local memory */ - int traversalStack[BVH_STACK_SIZE]; - traversalStack[0] = ENTRYPOINT_SENTINEL; + int traversal_stack[BVH_STACK_SIZE]; + traversal_stack[0] = ENTRYPOINT_SENTINEL; /* traversal variables in registers */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ const float tmax = ray->t; @@ -73,9 +79,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if defined(__KERNEL_SSE2__) const shuffle_swap_t shuf_identity = shuffle_swap_identity(); const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); ssef Psplat[3], idirsplat[3]; +# if BVH_FEATURE(BVH_HAIR) + ssef tnear(0.0f), tfar(isect_t); +# endif shuffle_swap_t shufflexyz[3]; Psplat[0] = ssef(P.x); @@ -94,137 +103,109 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, do { do { /* traverse internal nodes */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - bool traverseChild0, traverseChild1; - int nodeAddrChild1; + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + int node_addr_child1, traverse_mask; + float dist[2]; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); #if !defined(__KERNEL_SSE2__) - /* Intersect two child bounding boxes, non-SSE version */ - float t = isect_array->t; - - /* fetch node data */ - float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0); - float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1); - float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2); - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); - - /* intersect ray against child nodes */ - NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x; - NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x; - 
NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y; - NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z; - NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - - NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x; - NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y; - NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z; - NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); - - /* decide which nodes to traverse next */ - traverseChild0 = (c0max >= c0min); - traverseChild1 = (c1max >= c1min); - + traverse_mask = NODE_INTERSECT(kg, + P, +# if BVH_FEATURE(BVH_HAIR) + dir, +# endif + idir, + isect_t, + node_addr, + visibility, + dist); #else // __KERNEL_SSE2__ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE; - const float4 cnodes = ((float4*)bvh_nodes)[3]; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, 
tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - /* decide which nodes to traverse next */ - traverseChild0 = (movemask(lrhit) & 1); - traverseChild1 = (movemask(lrhit) & 2); + traverse_mask = NODE_INTERSECT(kg, + P, + dir, +# if BVH_FEATURE(BVH_HAIR) + tnear, + tfar, +# endif + tsplat, + Psplat, + idirsplat, + shufflexyz, + node_addr, + visibility, + dist); #endif // __KERNEL_SSE2__ - nodeAddr = __float_as_int(cnodes.x); - nodeAddrChild1 = __float_as_int(cnodes.y); + node_addr = __float_as_int(cnodes.z); + node_addr_child1 = __float_as_int(cnodes.w); - if(traverseChild0 && traverseChild1) { - /* both children were intersected, push the farther one */ -#if !defined(__KERNEL_SSE2__) - bool closestChild1 = (c1min < c0min); -#else - bool closestChild1 = tminmax[1] < tminmax[0]; -#endif - - if(closestChild1) { - int tmp = nodeAddr; - nodeAddr = nodeAddrChild1; - nodeAddrChild1 = tmp; + if(traverse_mask == 3) { + /* Both children were intersected, push the farther one. */ + bool is_closest_child1 = (dist[1] < dist[0]); + if(is_closest_child1) { + int tmp = node_addr; + node_addr = node_addr_child1; + node_addr_child1 = tmp; } - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = nodeAddrChild1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = node_addr_child1; } else { - /* one child was intersected */ - if(traverseChild1) { - nodeAddr = nodeAddrChild1; + /* One child was intersected. */ + if(traverse_mask == 2) { + node_addr = node_addr_child1; } - else if(!traverseChild0) { - /* neither child was intersected */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + else if(traverse_mask == 0) { + /* Neither child was intersected. 
*/ + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } /* if node is leaf, fetch triangle list */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - const int primAddr2 = __float_as_int(leaf.y); + const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); bool hit; /* pop */ - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; /* primitive intersection */ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -239,6 +220,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -246,23 +230,28 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* intersect ray against primitive */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* only primitives from volume object */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + visibility, + object, + prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. 
*/ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -277,6 +266,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -290,7 +282,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -311,29 +303,32 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif - ++stackPtr; - kernel_assert(stackPtr < BVH_STACK_SIZE); - traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_STACK_SIZE); + traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* pop */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -368,16 +363,19 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, Psplat[2] = ssef(P.z); tsplat = ssef(0.0f, 0.0f, 
-isect_t, -isect_t); +# if BVH_FEATURE(BVH_HAIR) + tfar = ssef(isect_t); +# endif gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); # endif object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr]; - --stackPtr; + node_addr = traversal_stack[stack_ptr]; + --stack_ptr; } #endif /* FEATURE(BVH_MOTION) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } @@ -410,3 +408,4 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, #undef BVH_FUNCTION_NAME #undef BVH_FUNCTION_FEATURES +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h new file mode 100644 index 00000000000..4d8695bedec --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -0,0 +1,433 @@ +/* + * Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +struct QBVHStackItem { + int addr; + float dist; +}; + +/* TODO(sergey): Investigate if using intrinsics helps for both + * stack item swap and float comparison.
+ */ +ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, + QBVHStackItem *ccl_restrict b) +{ + QBVHStackItem tmp = *a; + *a = *b; + *b = tmp; +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } +} + +ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, + QBVHStackItem *ccl_restrict s2, + QBVHStackItem *ccl_restrict s3, + QBVHStackItem *ccl_restrict s4) +{ + if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } + if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } + if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } + if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } + if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } +} + +/* Axis-aligned nodes intersection */ + +ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#else + const sse3f& org, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, 
offset+far_z), idir.z, org_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; +#endif + +#ifdef __KERNEL_SSE41__ + const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); + const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); + const sseb vmask = cast(tnear) > cast(tfar); + int mask = (int)movemask(vmask)^0xf; +#else + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); +#endif + *dist = tnear; + return mask; +} + +ccl_device_inline int qbvh_aligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#else + const sse3f& P, +#endif + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr + 1; +#ifdef __KERNEL_AVX2__ + const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); + const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); + const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); + const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, 
P_idir.x); + const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); + const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); +#else + const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; + const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; + const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; + const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; + const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; + const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; +#endif + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return (int)movemask(vmask); +} + +/* Unaligned nodes intersection */ + +ccl_device_inline int qbvh_unaligned_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const 
ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, + aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, + aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y 
= max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); +#endif +} + +ccl_device_inline int qbvh_unaligned_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); + const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); + const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); + + const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); + const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); + const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); + + const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); + const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); + const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); + + const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); + const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); + const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); + + const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, + aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, + aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; + + const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, + aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, + aligned_P_z = P.x*tfm_z_x + 
P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; + + const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); + const ssef nrdir_x = neg_one / aligned_dir_x, + nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const ssef tlower_x = aligned_P_x * nrdir_x, + tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const ssef tupper_x = tlower_x - nrdir_x, + tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + +#ifdef __KERNEL_SSE41__ + const ssef tnear_x = mini(tlower_x, tupper_x); + const ssef tnear_y = mini(tlower_y, tupper_y); + const ssef tnear_z = mini(tlower_z, tupper_z); + const ssef tfar_x = maxi(tlower_x, tupper_x); + const ssef tfar_y = maxi(tlower_y, tupper_y); + const ssef tfar_z = maxi(tlower_z, tupper_z); +#else + const ssef tnear_x = min(tlower_x, tupper_x); + const ssef tnear_y = min(tlower_y, tupper_y); + const ssef tnear_z = min(tlower_z, tupper_z); + const ssef tfar_x = max(tlower_x, tupper_x); + const ssef tfar_y = max(tlower_y, tupper_y); + const ssef tfar_z = max(tlower_z, tupper_z); +#endif + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const sseb vmask = round_down*tnear <= round_up*tfar; + *dist = tnear; + return movemask(vmask); +} + +/* Intersectors wrappers. + * + * They'll check node type and call appropriate intersection code. 
+ */ + +ccl_device_inline int qbvh_node_intersect( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& org_idir, +#endif + const sse3f& org, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#endif + org, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } + else { + return qbvh_aligned_node_intersect(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + org_idir, +#else + org, +#endif + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + dist); + } +} + +ccl_device_inline int qbvh_node_intersect_robust( + KernelGlobals *ccl_restrict kg, + const ssef& isect_near, + const ssef& isect_far, +#ifdef __KERNEL_AVX2__ + const sse3f& P_idir, +#endif + const sse3f& P, + const sse3f& dir, + const sse3f& idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + ssef *ccl_restrict dist) +{ + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return qbvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#endif + P, + dir, + idir, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + dist); + } + else { + return qbvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, +#ifdef __KERNEL_AVX2__ + P_idir, +#else + P, +#endif + idir, + near_x, near_y, near_z, + 
far_x, far_y, far_z, + node_addr, + difl, + dist); + } +} diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index edb5b5c78c3..34753ff067d 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_shadow.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -27,6 +27,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(tmax); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,53 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + +#ifdef __VISIBILITY_FLAG__ + if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +161,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,86 +186,86 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } #endif - int primAddr = __float_as_int(leaf.x); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. 
*/ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. */ - while(primAddr < primAddr2) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + while(prim_addr < prim_addr2) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); bool hit; @@ -241,22 +275,57 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr); + hit = triangle_intersect(kg, + &isect_precalc, + isect_array, + P, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr); + hit = motion_triangle_intersect(kg, + isect_array, + P, + dir, + ray->time, + PATH_RAY_SHADOW, + object, + prim_addr); break; } #endif #if BVH_FEATURE(BVH_HAIR) case PRIMITIVE_CURVE: case PRIMITIVE_MOTION_CURVE: { - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); - else - hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0); + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } + else { + hit = bvh_curve_intersect(kg, + isect_array, + P, + dir, + PATH_RAY_SHADOW, + object, + prim_addr, + ray->time, + type, + NULL, + 0, 0); + } break; } #endif @@ -268,6 +337,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Shadow ray early termination. */ if(hit) { + /* Update number of hits now, so we do proper check on max bounces. 
*/ + (*num_hits)++; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if @@ -298,23 +370,21 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, return true; } - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - + /* Move on to next entry in intersections array */ + isect_array++; isect_array->t = isect_t; } - primAddr++; + prim_addr++; } } #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); @@ -329,28 +399,33 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != 
ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); if(num_hits_in_instance) { @@ -383,21 +458,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(tmax); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return false; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index 84512a8783c..03794e3a882 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -25,6 +25,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, SubsurfaceIntersection *ss_isect, @@ -41,12 +47,12 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_tex_fetch(__object_node, subsurface_object); + int stack_ptr = 0; + int node_addr = kernel_tex_fetch(__object_node, subsurface_object); /* Ray parameters in registers. */ float3 P = ray->P; @@ -82,13 +88,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -106,29 +116,43 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. 
*/ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); + if(child_mask != 0) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. 
*/ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -137,24 +161,24 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -162,82 +186,82 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. 
*/ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { /* Intersect ray against primitive, */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, &isect_precalc, ss_isect, P, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -247,15 +271,15 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { /* Intersect ray against primitive. */ - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); motion_triangle_intersect_subsurface(kg, ss_isect, P, dir, ray->time, object, - primAddr, + prim_addr, isect_t, lcg_state, max_hits); @@ -267,6 +291,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, break; } } - } while(nodeAddr != ENTRYPOINT_SENTINEL); - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h new file mode 100644 index 00000000000..f82ff661495 --- /dev/null +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -0,0 +1,505 @@ +/* + * Adapted from code Copyright 2009-2010 NVIDIA Corporation, + * and code copyright 2009-2012 Intel Corporation + * + * Modifications Copyright 2011-2014, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is a template BVH traversal function, where various features can be + * enabled/disabled. This way we can compile optimized versions for each case + * without new features slowing things down. + * + * BVH_INSTANCING: object instancing + * BVH_HAIR: hair curve rendering + * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width + * BVH_MOTION: motion blur rendering + * + */ + +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_node_intersect_robust +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +# define NODE_INTERSECT_ROBUST qbvh_aligned_node_intersect_robust +#endif + +ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + const uint visibility +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + ,uint *lcg_state, + float difl, + float extmax +#endif + ) +{ + /* TODO(sergey): + * - Test if pushing distance on the stack helps (for non shadow rays). + * - Separate version for shadow rays. + * - Likely and unlikely for if() statements. + * - Test restrict attribute for pointers. + */ + + /* Traversal stack in CUDA thread-local memory. */ + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; + traversal_stack[0].dist = -FLT_MAX; + + /* Traversal variables in registers. */ + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; + float node_dist = -FLT_MAX; + + /* Ray parameters in registers. 
*/ + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + int object = OBJECT_NONE; + +#if BVH_FEATURE(BVH_MOTION) + Transform ob_itfm; +#endif + +#ifndef __KERNEL_SSE41__ + if(!isfinite(P.x)) { + return false; + } +#endif + + isect->t = ray->t; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + + BVH_DEBUG_INIT(); + + ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif + sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); + +#ifdef __KERNEL_AVX2__ + float3 P_idir = P*idir; + sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +#endif + + /* Offsets to select the side that becomes the lower or upper bound. */ + int near_x, near_y, near_z; + int far_x, far_y, far_z; + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + + IsectPrecalc isect_precalc; + triangle_intersect_precalc(dir, &isect_precalc); + + /* Traversal loop. */ + do { + do { + /* Traverse internal nodes. */ + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + + if(UNLIKELY(node_dist > isect->t) +#ifdef __VISIBILITY_FLAG__ + || (__float_as_uint(inodes.x) & visibility) == 0) +#endif + { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int child_mask; + ssef dist; + + BVH_DEBUG_NEXT_STEP(); + +#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + if(difl != 0.0f) { + /* NOTE: We extend all the child BB instead of fetching + * and checking visibility flags for each of the, + * + * Need to test if doing opposite would be any faster. + */ + child_mask = NODE_INTERSECT_ROBUST(kg, + tnear, + tfar, +# ifdef __KERNEL_AVX2__ + P_idir4, +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +# endif +# if BVH_FEATURE(BVH_HAIR) + dir4, +# endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + difl, + &dist); + } + else +#endif /* BVH_HAIR_MINIMUM_WIDTH */ + { + child_mask = NODE_INTERSECT(kg, + tnear, + tfar, +#ifdef __KERNEL_AVX2__ + P_idir4, +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + } + + if(child_mask != 0) { + float4 cnodes; + /* TODO(sergey): Investigate whether moving cnodes upwards + * gives a speedup (will be different cache pattern but will + * avoid extra check here), + */ +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } + + /* One child is hit, continue with that child. */ + int r = __bscf(child_mask); + float d0 = ((float*)&dist)[r]; + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); + node_dist = d0; + continue; + } + + /* Two children are hit, push far child, and continue with + * closer child. 
+ */ + int c0 = __float_as_int(cnodes[r]); + r = __bscf(child_mask); + int c1 = __float_as_int(cnodes[r]); + float d1 = ((float*)&dist)[r]; + if(child_mask == 0) { + if(d1 < d0) { + node_addr = c1; + node_dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + continue; + } + else { + node_addr = c0; + node_dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + continue; + } + } + + /* Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. + */ + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; + + /* Three children are hit, push all onto stack and sort 3 + * stack items, continue with closest child. + */ + r = __bscf(child_mask); + int c2 = __float_as_int(cnodes[r]); + float d2 = ((float*)&dist)[r]; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + /* Four children are hit, push all onto stack and sort 4 + * stack items, continue with closest child. 
+ */ + r = __bscf(child_mask); + int c3 = __float_as_int(cnodes[r]); + float d3 = ((float*)&dist)[r]; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); + } + + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } + + /* If node is leaf, fetch triangle list. */ + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + +#ifdef __VISIBILITY_FLAG__ + if(UNLIKELY((node_dist > isect->t) || + ((__float_as_uint(leaf.z) & visibility) == 0))) +#else + if(UNLIKELY((node_dist > isect->t))) +#endif + { + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + continue; + } + + int prim_addr = __float_as_int(leaf.x); + +#if BVH_FEATURE(BVH_INSTANCING) + if(prim_addr >= 0) { +#endif + int prim_addr2 = __float_as_int(leaf.y); + const uint type = __float_as_int(leaf.w); + + /* Pop. */ + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + + /* Primitive intersection. */ + switch(type & PRIMITIVE_ALL) { + case PRIMITIVE_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(triangle_intersect(kg, + &isect_precalc, + isect, + P, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. 
*/ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#if BVH_FEATURE(BVH_MOTION) + case PRIMITIVE_MOTION_TRIANGLE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + if(motion_triangle_intersect(kg, + isect, + P, + dir, + ray->time, + visibility, + object, + prim_addr)) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_MOTION) */ +#if BVH_FEATURE(BVH_HAIR) + case PRIMITIVE_CURVE: + case PRIMITIVE_MOTION_CURVE: { + for(; prim_addr < prim_addr2; prim_addr++) { + BVH_DEBUG_NEXT_STEP(); + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); + bool hit; + if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { + hit = bvh_cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + else { + hit = bvh_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + type, + lcg_state, + difl, + extmax); + } + if(hit) { + tfar = ssef(isect->t); + /* Shadow ray early termination. */ + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + return true; + } + } + } + break; + } +#endif /* BVH_FEATURE(BVH_HAIR) */ + } + } +#if BVH_FEATURE(BVH_INSTANCING) + else { + /* Instance push. 
*/ + object = kernel_tex_fetch(__prim_object, -prim_addr-1); + +# if BVH_FEATURE(BVH_MOTION) + qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); +# else + qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; + traversal_stack[stack_ptr].dist = -FLT_MAX; + + node_addr = kernel_tex_fetch(__object_node, object); + + BVH_DEBUG_NEXT_INSTANCE(); + } + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + +#if BVH_FEATURE(BVH_INSTANCING) + if(stack_ptr >= 0) { + kernel_assert(object != OBJECT_NONE); + + /* Instance pop. 
*/ +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); +# else + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); +# endif + + if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } + if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } + if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } + tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif + idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# ifdef __KERNEL_AVX2__ + P_idir = P*idir; + P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); +# endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + + triangle_intersect_precalc(dir, &isect_precalc); + + object = OBJECT_NONE; + node_addr = traversal_stack[stack_ptr].addr; + node_dist = traversal_stack[stack_ptr].dist; + --stack_ptr; + } +#endif /* FEATURE(BVH_INSTANCING) */ + } while(node_addr != ENTRYPOINT_SENTINEL); + + return (isect->prim != PRIM_NONE); +} + +#undef NODE_INTERSECT +#undef NODE_INTERSECT_ROBUST diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index ab2e530dd20..b4f334eb842 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect, @@ -38,12 +44,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. 
*/ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ float3 P = ray->P; @@ -68,13 +74,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect->object = OBJECT_NONE; ssef tnear(0.0f), tfar(ray->t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -92,29 +102,52 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -123,24 +156,24 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -148,102 +181,102 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr); + triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr); + motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, prim_addr); } break; } @@ -253,7 +286,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -268,34 +301,39 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -309,21 +347,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect->t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 5546471b0e3..a877e5bb341 100644 --- a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -26,6 +26,12 @@ * */ +#if BVH_FEATURE(BVH_HAIR) +# define NODE_INTERSECT qbvh_node_intersect +#else +# define NODE_INTERSECT qbvh_aligned_node_intersect +#endif + ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, @@ -39,12 +45,12 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; + QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; + traversal_stack[0].addr = ENTRYPOINT_SENTINEL; /* Traversal variables in registers. 
*/ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; + int stack_ptr = 0; + int node_addr = kernel_data.bvh.root; /* Ray parameters in registers. */ const float tmax = ray->t; @@ -72,13 +78,17 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif ssef tnear(0.0f), tfar(isect_t); +#if BVH_FEATURE(BVH_HAIR) + sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +#endif sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); #ifdef __KERNEL_AVX2__ float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); + sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); +#endif +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); #endif /* Offsets to select the side that becomes the lower or upper bound. */ @@ -96,29 +106,52 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, do { do { /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { + while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { +#ifdef __VISIBILITY_FLAG__ + float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + if((__float_as_uint(inodes.x) & visibility) == 0) { + /* Pop. 
*/ + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; + continue; + } +#endif + ssef dist; - int traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, + int child_mask = NODE_INTERSECT(kg, + tnear, + tfar, #ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, + P_idir4, #endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); +#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4, +#endif +#if BVH_FEATURE(BVH_HAIR) + dir4, +#endif + idir4, + near_x, near_y, near_z, + far_x, far_y, far_z, + node_addr, + &dist); + + if(child_mask != 0) { + float4 cnodes; +#if BVH_FEATURE(BVH_HAIR) + if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+13); + } + else +#endif + { + cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+7); + } /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); + int r = __bscf(child_mask); + if(child_mask == 0) { + node_addr = __float_as_int(cnodes[r]); continue; } @@ -127,24 +160,24 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, */ int c0 = __float_as_int(cnodes[r]); float d0 = ((float*)&dist)[r]; - r = __bscf(traverseChild); + r = __bscf(child_mask); int c1 = __float_as_int(cnodes[r]); float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { + if(child_mask == 0) { if(d1 < d0) { - nodeAddr = c1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + node_addr = c1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; continue; } else { - nodeAddr = c0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - 
traversalStack[stackPtr].dist = d1; + node_addr = c0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; continue; } } @@ -152,96 +185,94 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Here starts the slow path for 3 or 4 hit children. We push * all nodes onto the stack to sort them there. */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c1; + traversal_stack[stack_ptr].dist = d1; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c0; + traversal_stack[stack_ptr].dist = d0; /* Three children are hit, push all onto stack and sort 3 * stack items, continue with closest child. */ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c2 = __float_as_int(cnodes[r]); float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + if(child_mask == 0) { + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2]); + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; continue; } /* Four children are hit, push all onto stack and sort 4 * stack items, continue with closest child. 
*/ - r = __bscf(traverseChild); + r = __bscf(child_mask); int c3 = __float_as_int(cnodes[r]); float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c3; + traversal_stack[stack_ptr].dist = d3; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = c2; + traversal_stack[stack_ptr].dist = d2; + qbvh_stack_sort(&traversal_stack[stack_ptr], + &traversal_stack[stack_ptr - 1], + &traversal_stack[stack_ptr - 2], + &traversal_stack[stack_ptr - 3]); } - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - int primAddr = __float_as_int(leaf.x); + if(node_addr < 0) { + float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); + int prim_addr = __float_as_int(leaf.x); #if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { + if(prim_addr >= 0) { #endif - int primAddr2 = __float_as_int(leaf.y); + int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; bool hit; /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; /* Primitive intersection. 
*/ switch(p_type) { case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr); + hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; #if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; #endif - isect_array->t = isect_t; if(num_hits == max_hits) { #if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -256,30 +287,31 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); + for(; prim_addr < prim_addr2; prim_addr++) { + kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object; + uint tri_object = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, prim_addr): object; int object_flag = kernel_tex_fetch(__object_flag, tri_object); if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } /* Intersect ray against primitive. */ - hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr); + hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); if(hit) { - /* Move on to next entry in intersections array. */ - isect_array++; + /* Update number of hits now, so we do proper check on max bounces. */ num_hits++; # if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; # endif - isect_array->t = isect_t; if(num_hits == max_hits) { # if BVH_FEATURE(BVH_INSTANCING) # if BVH_FEATURE(BVH_MOTION) @@ -294,6 +326,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, # endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } + /* Move on to next entry in intersections array */ + isect_array++; + isect_array->t = isect_t; } } break; @@ -304,7 +339,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, #if BVH_FEATURE(BVH_INSTANCING) else { /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); + object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { @@ -320,35 +355,40 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; + ++stack_ptr; + kernel_assert(stack_ptr < BVH_QSTACK_SIZE); + traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - nodeAddr = kernel_tex_fetch(__object_node, object); + node_addr = kernel_tex_fetch(__object_node, object); } else { /* Pop. */ object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } } } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); #if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { + if(stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ @@ -379,23 +419,30 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } tfar = ssef(isect_t); +# if BVH_FEATURE(BVH_HAIR) + dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); +# endif idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); # ifdef __KERNEL_AVX2__ P_idir = P*idir; P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif +# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) + org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); +# endif + triangle_intersect_precalc(dir, &isect_precalc); isect_t = tmax; isect_array->t = isect_t; object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - --stackPtr; + node_addr = traversal_stack[stack_ptr].addr; + --stack_ptr; } #endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); + } while(node_addr != ENTRYPOINT_SENTINEL); return num_hits; } + +#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index c94a5384d1f..d2c7edb11ea 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -15,27 +15,6 @@ * limitations under the License. 
*/ -/* bottom-most stack entry, indicating the end of traversal */ -#define ENTRYPOINT_SENTINEL 0x76543210 - -/* 64 object BVH + 64 mesh BVH + 64 object node splitting */ -#define BVH_STACK_SIZE 192 -#define BVH_QSTACK_SIZE 384 -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 7 -#define BVH_QNODE_LEAF_SIZE 1 -#define TRI_NODE_SIZE 3 - -/* silly workaround for float extended precision that happens when compiling - * without sse support on x86, it results in different results for float ops - * that you would otherwise expect to compare correctly */ -#if !defined(__i386__) || defined(__SSE__) -# define NO_EXTENDED_PRECISION -#else -# define NO_EXTENDED_PRECISION volatile -#endif - #include "geom_attribute.h" #include "geom_object.h" #include "geom_triangle.h" @@ -45,5 +24,4 @@ #include "geom_curve.h" #include "geom_volume.h" #include "geom_primitive.h" -#include "geom_bvh.h" diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8894843997c..292e1bfca0e 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -450,8 +450,8 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect else if(level == 1) { /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ float t = isect->t; float u = 0.0f; float gd = 0.0f; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index ffe55529110..2fb8e219884 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -47,13 +47,13 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint return (attr_map.y == ATTR_ELEMENT_NONE) ? 
(int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) +ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, int numsteps, int step, float3 verts[3]) { if(step == numsteps) { /* center step: regular vertex location */ - verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + verts[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + verts[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } else { /* center step not store in this array */ @@ -62,19 +62,19 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, float3 offset += step*numverts; - verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } -ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float3 tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) +ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4 
tri_vindex, int offset, int numverts, int numsteps, int step, float3 normals[3]) { if(step == numsteps) { /* center step: regular vertex location */ - normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); } else { /* center step not stored in this array */ @@ -83,9 +83,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, float offset += step*numverts; - normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); + normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); } } @@ -107,7 +107,7 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i /* fetch vertex coordinates */ float3 next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); @@ -259,7 +259,7 @@ ccl_device_noinline void 
motion_triangle_shader_setup(KernelGlobals *kg, ShaderD /* fetch vertex coordinates */ float3 verts[3], next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim))); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h deleted file mode 100644 index 2a2d7822eee..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -struct QBVHStackItem { - int addr; - float dist; -}; - -/* TOOD(sergey): Investigate if using intrinsics helps for both - * stack item swap and float comparison. 
- */ -ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a, - QBVHStackItem *__restrict b) -{ - QBVHStackItem tmp = *a; - *a = *b; - *b = tmp; -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1, - QBVHStackItem *__restrict s2, - QBVHStackItem *__restrict s3, - QBVHStackItem *__restrict s4) -{ - if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } -} - -ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& org_idir, -#else - const sse3f& org, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); -#else - const ssef tnear_x = 
(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; -#endif - -#ifdef __KERNEL_SSE41__ - const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear)); - const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar)); - const sseb vmask = cast(tNear) > cast(tFar); - int mask = (int)movemask(vmask)^0xf; -#else - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = tNear <= tFar; - int mask = (int)movemask(vmask); -#endif - *dist = tNear; - return mask; -} - -ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg, - const ssef& tnear, - const ssef& tfar, -#ifdef __KERNEL_AVX2__ - const sse3f& P_idir, -#else - const sse3f& P, -#endif - const sse3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int nodeAddr, - const float difl, - ssef *__restrict dist) -{ - const int offset = nodeAddr*BVH_QNODE_SIZE; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); - const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); - const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); - const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); - 
const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); -#else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; -#endif - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear); - const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar); - const sseb vmask = round_down*tNear <= round_up*tFar; - *dist = tNear; - return (int)movemask(vmask); -} diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h deleted file mode 100644 index 738d08ac6fc..00000000000 --- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Adapted from code Copyright 2009-2010 NVIDIA Corporation, - * and code copyright 2009-2012 Intel Corporation - * - * Modifications Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width - * BVH_MOTION: motion blur rendering - * - */ - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - ,uint *lcg_state, - float difl, - float extmax -#endif - ) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversalStack[BVH_QSTACK_SIZE]; - traversalStack[0].addr = ENTRYPOINT_SENTINEL; - traversalStack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stackPtr = 0; - int nodeAddr = kernel_data.bvh.root; - float nodeDist = -FLT_MAX; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - BVH_DEBUG_INIT(); - - ssef tnear(0.0f), tfar(ray->t); - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P*idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#else - sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. 
*/ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) { - if(UNLIKELY(nodeDist > isect->t)) { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int traverseChild; - ssef dist; - - BVH_DEBUG_NEXT_STEP(); - -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - if(difl != 0.0f) { - /* NOTE: We extend all the child BB instead of fetching - * and checking visibility flags for each of the, - * - * Need to test if doing opposite would be any faster. - */ - traverseChild = qbvh_node_intersect_robust(kg, - tnear, - tfar, -# ifdef __KERNEL_AVX2__ - P_idir4, -# else - org, -# endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - difl, - &dist); - } - else -#endif /* BVH_HAIR_MINIMUM_WIDTH */ - { - traverseChild = qbvh_node_intersect(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#else - org, -#endif - idir4, - near_x, near_y, near_z, - far_x, far_y, far_z, - nodeAddr, - &dist); - } - - if(traverseChild != 0) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6); - - /* One child is hit, continue with that child. */ - int r = __bscf(traverseChild); - float d0 = ((float*)&dist)[r]; - if(traverseChild == 0) { - nodeAddr = __float_as_int(cnodes[r]); - nodeDist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. 
- */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(traverseChild); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float*)&dist)[r]; - if(traverseChild == 0) { - if(d1 < d0) { - nodeAddr = c1; - nodeDist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - continue; - } - else { - nodeAddr = c0; - nodeDist = d0; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c1; - traversalStack[stackPtr].dist = d1; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c0; - traversalStack[stackPtr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(traverseChild); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float*)&dist)[r]; - if(traverseChild == 0) { - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2]); - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. 
- */ - r = __bscf(traverseChild); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float*)&dist)[r]; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c3; - traversalStack[stackPtr].dist = d3; - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = c2; - traversalStack[stackPtr].dist = d2; - qbvh_stack_sort(&traversalStack[stackPtr], - &traversalStack[stackPtr - 1], - &traversalStack[stackPtr - 2], - &traversalStack[stackPtr - 3]); - } - - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } - - /* If node is leaf, fetch triangle list. */ - if(nodeAddr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE); - -#ifdef __VISIBILITY_FLAG__ - if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) -#else - if(UNLIKELY((nodeDist > isect->t))) -#endif - { - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - continue; - } - - int primAddr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if(primAddr >= 0) { -#endif - int primAddr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - - /* Primitive intersection. */ - switch(type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for(; primAddr < primAddr2; primAddr++) { - BVH_DEBUG_NEXT_STEP(); - kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type); - bool hit; - if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) - hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - else - hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax); - if(hit) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) - return true; - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -primAddr-1); - -# if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_itfm); -# else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - ++stackPtr; - kernel_assert(stackPtr < BVH_QSTACK_SIZE); - traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL; - traversalStack[stackPtr].dist = -FLT_MAX; - - nodeAddr = kernel_tex_fetch(__object_node, object); - - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if(stackPtr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. 
*/ -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); -# endif - - if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; } - if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; } - if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; } - tfar = ssef(isect->t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P*idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# else - org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - triangle_intersect_precalc(dir, &isect_precalc); - - object = OBJECT_NONE; - nodeAddr = traversalStack[stackPtr].addr; - nodeDist = traversalStack[stackPtr].dist; - --stackPtr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while(nodeAddr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 995dfac5b09..0c2351e1d1b 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -27,12 +27,11 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = 
float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); - /* return normal */ if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) return normalize(cross(v2 - v0, v1 - v0)); @@ -44,11 +43,10 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute point */ float t = 1.0f - u - v; @@ -71,11 +69,10 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + P[1] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + P[2] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); } /* Interpolate 
smooth vertex normal from vertices */ @@ -83,11 +80,10 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x))); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y))); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); + float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); + float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); } @@ -97,11 +93,10 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) { /* fetch triangle vertex coordinates */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - - float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); - float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); - float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + const float3 p0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); + const float3 p1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); + const float3 p2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* compute derivatives of P w.r.t. 
uv */ *dPdu = (p0 - p2); @@ -119,11 +114,11 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); - float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); - float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z)); + float f0 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.x); + float f1 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.y); + float f2 = kernel_tex_fetch(__attributes_float, offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; @@ -162,11 +157,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); + float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); + float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + 
tri_vindex.y)); + float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index b6dfc769012..fc081bda525 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -106,9 +106,10 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. */ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -202,9 +203,10 @@ ccl_device_inline void triangle_intersect_subsurface( const float Sz = isect_precalc->Sz; /* Calculate vertices relative to ray origin. 
*/ - const float4 tri_a = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, triAddr*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); @@ -324,9 +326,10 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D*t; - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); @@ -381,9 +384,10 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, P = P + D*t; #ifdef __INTERSECTION_REFINE__ - const float4 tri_a = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+0), - tri_b = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+1), - tri_c = kernel_tex_fetch(__tri_storage, isect->prim*TRI_NODE_SIZE+2); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float4 
tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), + tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), + tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 42314756f02..08f6f457805 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -42,6 +42,7 @@ #define ccl_constant #define ccl_may_alias #define ccl_addr_space +#define ccl_restrict __restrict__ /* No assert supported for CUDA */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index a5708448e23..8505cb85576 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_global __global #define ccl_local __local #define ccl_private __private +#define ccl_restrict restrict #ifdef __SPLIT_KERNEL__ # define ccl_addr_space __global diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 736a884f819..93c4bd3f7d5 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -51,8 +51,8 @@ ccl_device float area_light_sample(float3 P, bool sample_coord) { /* In our name system we're using P for the center, - * which is o in the paper. - */ + * which is o in the paper. 
+ */ float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; float axisu_len, axisv_len; diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 3c3503eab8b..d5b31037723 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -25,6 +25,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 94598e2565e..731dc0407c5 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -309,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, const ccl_addr_space PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) { return lcg_init(*rng + state->rng_offset + state->sample*scramble); } diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 245d236ff97..5ba262c1044 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -25,7 +25,8 @@ /* bvh */ KERNEL_TEX(float4, texture_float4, __bvh_nodes) KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __tri_storage) +KERNEL_TEX(float4, texture_float4, __prim_tri_verts) +KERNEL_TEX(uint, texture_uint, __prim_tri_index) KERNEL_TEX(uint, texture_uint, __prim_type) KERNEL_TEX(uint, texture_uint, __prim_visibility) KERNEL_TEX(uint, texture_uint, __prim_index) @@ -39,8 +40,7 @@ KERNEL_TEX(float4, texture_float4, __objects_vector) /* triangles */ KERNEL_TEX(uint, texture_uint, __tri_shader) KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(float4, texture_float4, __tri_vindex) -KERNEL_TEX(float4, texture_float4, __tri_verts) 
+KERNEL_TEX(uint4, texture_uint4, __tri_vindex) /* curves */ KERNEL_TEX(float4, texture_float4, __curves) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 76d2a6b98e6..5de58ba28ed 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -292,11 +292,14 @@ enum PathRayFlag { PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024), + /* Special flag to tag unaligned BVH nodes. */ + PATH_RAY_NODE_UNALIGNED = 2048, - PATH_RAY_MIS_SKIP = 2048, - PATH_RAY_DIFFUSE_ANCESTOR = 4096, - PATH_RAY_SINGLE_PASS_DONE = 8192, + PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + + PATH_RAY_MIS_SKIP = 4096, + PATH_RAY_DIFFUSE_ANCESTOR = 8192, + PATH_RAY_SINGLE_PASS_DONE = 16384, }; /* Closure Label */ @@ -769,7 +772,7 @@ typedef ccl_addr_space struct ShaderData { int type; /* parametric coordinates - * - barycentric weights for triangles */ + * - barycentric weights for triangles */ float u; float v; /* object id if there is one, ~0 otherwise */ @@ -792,14 +795,14 @@ typedef ccl_addr_space struct ShaderData { #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is - * not readily suitable as a tangent for shading on triangles. */ + * not readily suitable as a tangent for shading on triangles. 
*/ float3 dPdu; float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ + * re-interpolating them constantly for shading */ Transform ob_tfm; Transform ob_itfm; #endif @@ -1171,11 +1174,11 @@ typedef ccl_addr_space struct DebugData { #define QUEUE_EMPTY_SLOT -1 /* -* Queue 1 - Active rays -* Queue 2 - Background queue -* Queue 3 - Shadow ray cast kernel - AO -* Queeu 4 - Shadow ray cast kernel - direct lighting -*/ + * Queue 1 - Active rays + * Queue 2 - Background queue + * Queue 3 - Shadow ray cast kernel - AO + * Queeu 4 - Shadow ray cast kernel - direct lighting + */ #define NUM_QUEUES 4 /* Queue names */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index aad06ed5c76..37907cd8fdc 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -35,6 +35,7 @@ # include "../../kernel_montecarlo.h" # include "../../kernel_projection.h" # include "../../geom/geom.h" +# include "../../bvh/bvh.h" # include "../../kernel_accumulate.h" # include "../../kernel_camera.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index ebe739ebd0e..2bb2be5e6b3 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -47,6 +47,7 @@ #include "kernel_camera.h" #include "kernels/cpu/kernel_cpu_image.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_projection.h" #include "kernel_accumulate.h" @@ -912,7 +913,7 @@ bool OSLRenderServices::texture(ustring filename, #endif bool status; - if(filename[0] == '@') { + if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); @@ -993,7 +994,7 @@ bool OSLRenderServices::texture3d(ustring filename, } bool status; - if(filename[0] == '@') { + 
if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 49030f33c26..b43f8402d42 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,6 +81,7 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl index a00401845c8..7cd2922dd4f 100644 --- a/intern/cycles/kernel/shaders/node_image_texture.osl +++ b/intern/cycles/kernel/shaders/node_image_texture.osl @@ -88,7 +88,7 @@ shader node_image_texture( string color_space = "sRGB", string projection = "flat", string interpolation = "smartcubic", - string wrap = "periodic", + string extension = "periodic", float projection_blend = 0.0, int is_float = 1, int use_alpha = 1, @@ -108,7 +108,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "box") { /* object space normal */ @@ -184,7 +184,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[0] * tmp_alpha; } if (weight[1] > 0.0) { @@ -195,7 +195,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[1] * tmp_alpha; } if (weight[2] > 0.0) { @@ -206,7 +206,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); Alpha += weight[2] * tmp_alpha; } } @@ -219,7 +219,7 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } else if (projection == "tube") { point projected = map_to_tube(texco_remap_square(p)); @@ -230,6 +230,6 @@ shader node_image_texture( use_alpha, is_float, interpolation, - wrap); + extension); } } diff 
--git a/intern/cycles/kernel/shaders/node_rgb_to_bw.osl b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl new file mode 100644 index 00000000000..903dfcdc881 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_rgb_to_bw.osl @@ -0,0 +1,25 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_rgb_to_bw( + color Color = 0.0, + output float Val = 0.0) +{ + Val = Color[0] * 0.2126 + Color[1] * 0.7152 + Color[2] * 0.0722; +} + diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index e1c7e2cea99..88d6dab04d0 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -31,6 +31,7 @@ #include "kernel_camera.h" #include "geom/geom.h" +#include "bvh/bvh.h" #include "kernel_accumulate.h" #include "kernel_shader.h" diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index aa9c07c867e..44732734c31 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -72,8 +72,16 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint width = info.x; uint height = info.y; uint offset = info.z; - uint periodic = (info.w & 0x1); - uint interpolation = info.w >> 1; + + /* Image Options */ + uint interpolation = (info.w & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; + uint extension; + if(info.w & (1 << 1)) + extension = EXTENSION_REPEAT; + else if(info.w & (1 << 2)) + extension = EXTENSION_EXTEND; + else + extension = EXTENSION_CLIP; float4 r; int ix, iy, nix, niy; @@ -81,22 +89,26 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, svm_image_texture_frac(x*width, &ix); svm_image_texture_frac(y*height, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); } - else { + else if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else { /* EXTENSION_EXTEND */ ix = svm_image_texture_wrap_clamp(ix, width); iy = svm_image_texture_wrap_clamp(iy, height); - } + r = svm_image_texture_read(kg, id, offset + ix + iy*width); } - else { /* We default to linear interpolation if it is not closest */ + else { /* INTERPOLATION_LINEAR */ float tx = svm_image_texture_frac(x*width - 0.5f, &ix); float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - if(periodic) { + if(extension == EXTENSION_REPEAT) { ix = svm_image_texture_wrap_periodic(ix, width); iy = svm_image_texture_wrap_periodic(iy, height); @@ -104,14 +116,17 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, niy = svm_image_texture_wrap_periodic(iy+1, height); } else { - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - + if(extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } nix = svm_image_texture_wrap_clamp(ix+1, width); niy = svm_image_texture_wrap_clamp(iy+1, height); + ix = svm_image_texture_wrap_clamp(ix, width); + iy = svm_image_texture_wrap_clamp(iy, height); } - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); r += 
(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index ecde2e99a7b..614620c14af 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -1076,6 +1076,26 @@ void ImageManager::device_update_slot(Device *device, } } +uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) +{ + uint8_t options = 0; + + /* Image Options are packed into one uint: + * bit 0 -> Interpolation + * bit 1 + 2 + 3-> Extension */ + if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) + options |= (1 << 0); + + if(images[type][slot]->extension == EXTENSION_REPEAT) + options |= (1 << 1); + else if(images[type][slot]->extension == EXTENSION_EXTEND) + options |= (1 << 2); + else /* EXTENSION_CLIP */ + options |= (1 << 3); + + return options; +} + void ImageManager::device_pack_images(Device *device, DeviceScene *dscene, Progress& /*progess*/) @@ -1107,11 +1127,9 @@ void ImageManager::device_pack_images(Device *device, device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - /* The image options are packed - bit 0 -> periodic - bit 1 + 2 -> interpolation type */ - uint8_t interpolation = (images[type][slot]->interpolation << 1) + 1; - info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, interpolation); + uint8_t options = pack_image_options(type, slot); + + info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); offset += tex_img.size(); @@ -1139,11 +1157,8 @@ void ImageManager::device_pack_images(Device *device, /* todo: support 3D textures, only CPU for now */ - /* The image options are packed - bit 0 -> periodic - bit 1 + 2 -> interpolation type */ - uint8_t 
interpolation = (images[type][slot]->interpolation << 1) + 1; - info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, interpolation); + uint8_t options = pack_image_options(type, slot); + info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); offset += tex_img.size(); diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 01d02f4dbec..07998684b23 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -122,6 +122,8 @@ private: int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); + uint8_t pack_image_options(ImageDataType type, size_t slot); + void device_load_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot, Progress *progess); void device_free_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot); diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 764a925983e..661719ed545 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -73,6 +73,37 @@ void Mesh::Curve::bounds_grow(const int k, const float3 *curve_keys, const float bounds.grow(upper, mr); } +void Mesh::Curve::bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + const Transform& aligned_space, + BoundBox& bounds) const +{ + float3 P[4]; + + P[0] = curve_keys[max(first_key + k - 1,first_key)]; + P[1] = curve_keys[first_key + k]; + P[2] = curve_keys[first_key + k + 1]; + P[3] = curve_keys[min(first_key + k + 2, first_key + num_keys - 1)]; + + P[0] = transform_point(&aligned_space, P[0]); + P[1] = transform_point(&aligned_space, P[1]); + P[2] = transform_point(&aligned_space, P[2]); + P[3] = transform_point(&aligned_space, P[3]); + + float3 lower; + float3 upper; + + curvebounds(&lower.x, 
&upper.x, P, 0); + curvebounds(&lower.y, &upper.y, P, 1); + curvebounds(&lower.z, &upper.z, P, 2); + + float mr = max(curve_radius[first_key + k], curve_radius[first_key + k + 1]); + + bounds.grow(lower, mr); + bounds.grow(upper, mr); +} + /* Mesh */ NODE_DEFINE(Mesh) @@ -472,30 +503,19 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) } } -void Mesh::pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset) +void Mesh::pack_verts(const vector<uint>& tri_prim_index, + uint4 *tri_vindex, + size_t vert_offset, + size_t tri_offset) { - size_t verts_size = verts.size(); - - if(verts_size) { - float3 *verts_ptr = &verts[0]; - - for(size_t i = 0; i < verts_size; i++) { - float3 p = verts_ptr[i]; - tri_verts[i] = make_float4(p.x, p.y, p.z, 0.0f); - } - } - - size_t triangles_size = num_triangles(); - + const size_t triangles_size = num_triangles(); if(triangles_size) { for(size_t i = 0; i < triangles_size; i++) { Triangle t = get_triangle(i); - - tri_vindex[i] = make_float4( - __int_as_float(t.v[0] + vert_offset), - __int_as_float(t.v[1] + vert_offset), - __int_as_float(t.v[2] + vert_offset), - 0); + tri_vindex[i] = make_uint4(t.v[0] + vert_offset, + t.v[1] + vert_offset, + t.v[2] + vert_offset, + tri_prim_index[i + tri_offset]); } } } @@ -533,7 +553,11 @@ void Mesh::pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, s } } -void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total) +void Mesh::compute_bvh(DeviceScene *dscene, + SceneParams *params, + Progress *progress, + int n, + int total) { if(progress->get_cancel()) return; @@ -564,6 +588,7 @@ void Mesh::compute_bvh(SceneParams *params, Progress *progress, int n, int total BVHParams bparams; bparams.use_spatial_split = params->use_bvh_spatial_split; bparams.use_qbvh = params->use_qbvh; + bparams.use_unaligned_nodes = dscene->data.bvh.have_curves; delete bvh; bvh = BVH::create(bparams, objects); @@ -1070,42 +1095,82 @@ void 
MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, } } -void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) +void MeshManager::mesh_calc_offset(Scene *scene) { - /* count and update offsets */ size_t vert_size = 0; size_t tri_size = 0; - size_t curve_key_size = 0; size_t curve_size = 0; foreach(Mesh *mesh, scene->meshes) { mesh->vert_offset = vert_size; mesh->tri_offset = tri_size; - mesh->curvekey_offset = curve_key_size; mesh->curve_offset = curve_size; vert_size += mesh->verts.size(); tri_size += mesh->num_triangles(); - curve_key_size += mesh->curve_keys.size(); curve_size += mesh->num_curves(); } +} +void MeshManager::device_update_mesh(Device *device, + DeviceScene *dscene, + Scene *scene, + bool for_displacement, + Progress& progress) +{ + /* Count. */ + size_t vert_size = 0; + size_t tri_size = 0; + size_t curve_key_size = 0; + size_t curve_size = 0; + foreach(Mesh *mesh, scene->meshes) { + vert_size += mesh->verts.size(); + tri_size += mesh->num_triangles(); + curve_key_size += mesh->curve_keys.size(); + curve_size += mesh->num_curves(); + } + /* Create mapping from triangle to primitive triangle array. */ + vector<uint> tri_prim_index(tri_size); + if(for_displacement) { + /* For displacement kernels we do some trickery to make them believe + * we've got all required data ready. However, that data is different + * from final render kernels since we don't have BVH yet, so can't + * really use same semantic of arrays. + */ + foreach(Mesh *mesh, scene->meshes) { + for(size_t i = 0; i < mesh->num_triangles(); ++i) { + tri_prim_index[i + mesh->tri_offset] = 3 * (i + mesh->tri_offset); + } + } + } + else { + PackedBVH& pack = bvh->pack; + for(size_t i = 0; i < pack.prim_index.size(); ++i) { + if ((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + tri_prim_index[pack.prim_index[i]] = pack.prim_tri_index[i]; + } + } + } + /* Fill in all the arrays. 
*/ if(tri_size != 0) { /* normals */ progress.set_status("Updating Mesh", "Computing normals"); uint *tri_shader = dscene->tri_shader.resize(tri_size); float4 *vnormal = dscene->tri_vnormal.resize(vert_size); - float4 *tri_verts = dscene->tri_verts.resize(vert_size); - float4 *tri_vindex = dscene->tri_vindex.resize(tri_size); + uint4 *tri_vindex = dscene->tri_vindex.resize(tri_size); foreach(Mesh *mesh, scene->meshes) { - mesh->pack_normals(scene, &tri_shader[mesh->tri_offset], &vnormal[mesh->vert_offset]); - mesh->pack_verts(&tri_verts[mesh->vert_offset], &tri_vindex[mesh->tri_offset], mesh->vert_offset); - + mesh->pack_normals(scene, + &tri_shader[mesh->tri_offset], + &vnormal[mesh->vert_offset]); + mesh->pack_verts(tri_prim_index, + &tri_vindex[mesh->tri_offset], + mesh->vert_offset, + mesh->tri_offset); if(progress.get_cancel()) return; } @@ -1114,10 +1179,8 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene device->tex_alloc("__tri_shader", dscene->tri_shader); device->tex_alloc("__tri_vnormal", dscene->tri_vnormal); - device->tex_alloc("__tri_verts", dscene->tri_verts); device->tex_alloc("__tri_vindex", dscene->tri_vindex); } - if(curve_size != 0) { progress.set_status("Updating Mesh", "Copying Strands to device"); @@ -1132,6 +1195,19 @@ void MeshManager::device_update_mesh(Device *device, DeviceScene *dscene, Scene device->tex_alloc("__curve_keys", dscene->curve_keys); device->tex_alloc("__curves", dscene->curves); } + if(for_displacement) { + float4 *prim_tri_verts = dscene->prim_tri_verts.resize(tri_size * 3); + foreach(Mesh *mesh, scene->meshes) { + for(size_t i = 0; i < mesh->num_triangles(); ++i) { + Mesh::Triangle t = mesh->get_triangle(i); + size_t offset = 3 * (i + mesh->tri_offset); + prim_tri_verts[offset + 0] = float3_to_float4(mesh->verts[t.v[0]]); + prim_tri_verts[offset + 1] = float3_to_float4(mesh->verts[t.v[1]]); + prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]); + } + } + 
device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); + } } void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) @@ -1146,6 +1222,7 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * bparams.top_level = true; bparams.use_qbvh = scene->params.use_qbvh; bparams.use_spatial_split = scene->params.use_bvh_spatial_split; + bparams.use_unaligned_nodes = dscene->data.bvh.have_curves; delete bvh; bvh = BVH::create(bparams, scene->objects); @@ -1170,9 +1247,13 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size()); device->tex_alloc("__object_node", dscene->object_node); } - if(pack.tri_storage.size()) { - dscene->tri_storage.reference(&pack.tri_storage[0], pack.tri_storage.size()); - device->tex_alloc("__tri_storage", dscene->tri_storage); + if(pack.prim_tri_index.size()) { + dscene->prim_tri_index.reference((uint*)&pack.prim_tri_index[0], pack.prim_tri_index.size()); + device->tex_alloc("__prim_tri_index", dscene->prim_tri_index); + } + if(pack.prim_tri_verts.size()) { + dscene->prim_tri_verts.reference((float4*)&pack.prim_tri_verts[0], pack.prim_tri_verts.size()); + device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); } if(pack.prim_type.size()) { dscene->prim_type.reference((uint*)&pack.prim_type[0], pack.prim_type.size()); @@ -1273,7 +1354,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen VLOG(1) << "Total " << scene->meshes.size() << " meshes."; - /* update normals */ + /* Update normals. */ foreach(Mesh *mesh, scene->meshes) { foreach(Shader *shader, mesh->used_shaders) { if(shader->need_update_attributes) @@ -1289,17 +1370,17 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen } /* Update images needed for true displacement. 
*/ - bool need_displacement_images = false; + bool true_displacement_used = false; bool old_need_object_flags_update = false; foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update && mesh->displacement_method != Mesh::DISPLACE_BUMP) { - need_displacement_images = true; + true_displacement_used = true; break; } } - if(need_displacement_images) { + if(true_displacement_used) { VLOG(1) << "Updating images used for true displacement."; device_update_displacement_images(device, dscene, scene, progress); old_need_object_flags_update = scene->object_manager->need_flags_update; @@ -1310,49 +1391,52 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen false); } - /* device update */ + /* Device update. */ device_free(device, dscene); - device_update_mesh(device, dscene, scene, progress); + mesh_calc_offset(scene); + if(true_displacement_used) { + device_update_mesh(device, dscene, scene, true, progress); + } if(progress.get_cancel()) return; device_update_attributes(device, dscene, scene, progress); if(progress.get_cancel()) return; - /* update displacement */ + /* Update displacement. */ bool displacement_done = false; - - foreach(Mesh *mesh, scene->meshes) - if(mesh->need_update && displace(device, dscene, scene, mesh, progress)) + foreach(Mesh *mesh, scene->meshes) { + if(mesh->need_update && + displace(device, dscene, scene, mesh, progress)) + { displacement_done = true; + } + } - /* todo: properly handle cancel halfway displacement */ + /* TODO: properly handle cancel halfway displacement */ if(progress.get_cancel()) return; - /* device re-update after displacement */ + /* Device re-update after displacement. */ if(displacement_done) { device_free(device, dscene); - device_update_mesh(device, dscene, scene, progress); - if(progress.get_cancel()) return; - device_update_attributes(device, dscene, scene, progress); if(progress.get_cancel()) return; } - /* update bvh */ + /* Update bvh. 
*/ size_t i = 0, num_bvh = 0; - - foreach(Mesh *mesh, scene->meshes) - if(mesh->need_update && mesh->need_build_bvh()) + foreach(Mesh *mesh, scene->meshes) { + if(mesh->need_update && mesh->need_build_bvh()) { num_bvh++; - + } + } TaskPool pool; - foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update) { pool.push(function_bind(&Mesh::compute_bvh, mesh, + dscene, &scene->params, &progress, i, @@ -1362,14 +1446,14 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen } } } - TaskPool::Summary summary; pool.wait_work(&summary); VLOG(2) << "Objects BVH build pool statistics:\n" << summary.full_report(); - foreach(Shader *shader, scene->shaders) + foreach(Shader *shader, scene->shaders) { shader->need_update_attributes = false; + } #ifdef __OBJECT_MOTION__ Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading); @@ -1378,18 +1462,23 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen bool motion_blur = false; #endif - /* update obejcts */ + /* Update objects. */ vector<Object *> volume_objects; - foreach(Object *object, scene->objects) + foreach(Object *object, scene->objects) { object->compute_bounds(motion_blur); + } if(progress.get_cancel()) return; device_update_bvh(device, dscene, scene, progress); + if(progress.get_cancel()) return; + + device_update_mesh(device, dscene, scene, false, progress); + if(progress.get_cancel()) return; need_update = false; - if(need_displacement_images) { + if(true_displacement_used) { /* Re-tag flags for update, so they're re-evaluated * for meshes with correct bounding boxes. 
* @@ -1405,7 +1494,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->bvh_nodes); device->tex_free(dscene->bvh_leaf_nodes); device->tex_free(dscene->object_node); - device->tex_free(dscene->tri_storage); + device->tex_free(dscene->prim_tri_verts); + device->tex_free(dscene->prim_tri_index); device->tex_free(dscene->prim_type); device->tex_free(dscene->prim_visibility); device->tex_free(dscene->prim_index); @@ -1413,7 +1503,6 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->tri_shader); device->tex_free(dscene->tri_vnormal); device->tex_free(dscene->tri_vindex); - device->tex_free(dscene->tri_verts); device->tex_free(dscene->curves); device->tex_free(dscene->curve_keys); device->tex_free(dscene->attributes_map); @@ -1423,7 +1512,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->bvh_nodes.clear(); dscene->object_node.clear(); - dscene->tri_storage.clear(); + dscene->prim_tri_verts.clear(); + dscene->prim_tri_index.clear(); dscene->prim_type.clear(); dscene->prim_visibility.clear(); dscene->prim_index.clear(); @@ -1431,7 +1521,6 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->tri_shader.clear(); dscene->tri_vnormal.clear(); dscene->tri_vindex.clear(); - dscene->tri_verts.clear(); dscene->curves.clear(); dscene->curve_keys.clear(); dscene->attributes_map.clear(); diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index edad6d32f00..0aea55544f2 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -72,7 +72,15 @@ public: int num_segments() { return num_keys - 1; } - void bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox& bounds) const; + void bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + BoundBox& bounds) const; + void bounds_grow(const int k, + const float3 *curve_keys, + const float *curve_radius, + 
const Transform& aligned_space, + BoundBox& bounds) const; }; Curve get_curve(size_t i) const @@ -167,9 +175,16 @@ public: void add_vertex_normals(); void pack_normals(Scene *scene, uint *shader, float4 *vnormal); - void pack_verts(float4 *tri_verts, float4 *tri_vindex, size_t vert_offset); + void pack_verts(const vector<uint>& tri_prim_index, + uint4 *tri_vindex, + size_t vert_offset, + size_t tri_offset); void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset); - void compute_bvh(SceneParams *params, Progress *progress, int n, int total); + void compute_bvh(DeviceScene *dscene, + SceneParams *params, + Progress *progress, + int n, + int total); bool need_attribute(Scene *scene, AttributeStandard std); bool need_attribute(Scene *scene, ustring name); @@ -213,15 +228,41 @@ public: void update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes); void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_mesh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_attributes(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_bvh(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_displacement_images(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); + void device_free(Device *device, DeviceScene *dscene); void tag_update(Scene *scene); + +protected: + /* Calculate verts/triangles/curves offsets in global arrays. 
*/ + void mesh_calc_offset(Scene *scene); + + void device_update_object(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_mesh(Device *device, + DeviceScene *dscene, + Scene *scene, + bool for_displacement, + Progress& progress); + + void device_update_attributes(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_bvh(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); + + void device_update_displacement_images(Device *device, + DeviceScene *dscene, + Scene *scene, + Progress& progress); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 87020823df9..15b55d17301 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -611,10 +611,10 @@ static float sky_perez_function(float lam[6], float theta, float gamma) static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity) { /* - * We re-use the SunSky struct of the new model, to avoid extra variables - * zenith_Y/x/y is now radiance_x/y/z - * perez_Y/x/y is now config_x/y/z - */ + * We re-use the SunSky struct of the new model, to avoid extra variables + * zenith_Y/x/y is now radiance_x/y/z + * perez_Y/x/y is now config_x/y/z + */ float2 spherical = sky_spherical_coordinates(dir); float theta = spherical.x; @@ -1596,7 +1596,7 @@ void RGBToBWNode::compile(SVMCompiler& compiler) void RGBToBWNode::compile(OSLCompiler& compiler) { - compiler.add(this, "node_convert_from_color"); + compiler.add(this, "node_rgb_to_bw"); } /* Convert */ diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index b821e2b6475..925e84ad96d 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -63,7 +63,8 @@ public: device_vector<float4> bvh_nodes; device_vector<float4> bvh_leaf_nodes; device_vector<uint> object_node; - device_vector<float4> tri_storage; + device_vector<uint> 
prim_tri_index; + device_vector<float4> prim_tri_verts; device_vector<uint> prim_type; device_vector<uint> prim_visibility; device_vector<uint> prim_index; @@ -72,8 +73,7 @@ public: /* mesh */ device_vector<uint> tri_shader; device_vector<float4> tri_vnormal; - device_vector<float4> tri_vindex; - device_vector<float4> tri_verts; + device_vector<uint4> tri_vindex; device_vector<float4> curves; device_vector<float4> curve_keys; diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index cef5adc0a61..599222da9c5 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -151,7 +151,7 @@ public: (isfinite(max.x) && isfinite(max.y) && isfinite(max.z)); } - BoundBox transformed(const Transform *tfm) + BoundBox transformed(const Transform *tfm) const { BoundBox result = BoundBox::empty; diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index f01db64a79b..6fed18a3db8 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -127,6 +127,19 @@ ccl_device_inline Transform make_transform(float a, float b, float c, float d, return t; } +/* Constructs a coordinate frame from a normalized normal. */ +ccl_device_inline Transform make_transform_frame(float3 N) +{ + const float3 dx0 = cross(make_float3(1.0f, 0.0f, 0.0f), N); + const float3 dx1 = cross(make_float3(0.0f, 1.0f, 0.0f), N); + const float3 dx = normalize((dot(dx0,dx0) > dot(dx1,dx1))? 
dx0: dx1); + const float3 dy = normalize(cross(N, dx)); + return make_transform(dx.x, dx.y, dx.z, 0.0f, + dy.x, dy.y, dy.z, 0.0f, + N.x , N.y, N.z, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f); +} + #ifndef __KERNEL_GPU__ ccl_device_inline Transform operator*(const Transform a, const Transform b) diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 972befa185b..257c6ad7491 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,7 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ #if defined(_WIN32) && !defined(FREE_WINDOWS) |