#pragma once #include "intrinsics.h" #include "types.h" #include "utils.h" #include namespace intgemm { template using PostprocessPipeline = std::tuple; template constexpr std::tuple CreatePostprocessPipeline(const Stages&... stages) { return std::make_tuple(stages...); } template class PostprocessImpl; namespace { // anonymous namespace template using input_register_type = typename std::tuple_element< 0, std::tuple >::type::InputRegister; template using output_register_type = typename std::tuple_element< std::tuple_size>::value - 1, std::tuple >::type::OutputRegister; template constexpr std::tuple DropFirstStage(const std::tuple& pipeline) { return make_subtuple(pipeline, sequence_popfront>()); } template constexpr std::tuple<> InitPostprocessPipelineImpl(std::tuple<> pipeline) { return std::tuple<>(); } template constexpr std::tuple, PostprocessImpl...> InitPostprocessPipelineImpl(std::tuple pipeline) { return std::tuple_cat( std::tuple>(PostprocessImpl(std::get<0>(pipeline))), InitPostprocessPipelineImpl(DropFirstStage(pipeline)) ); } template struct RunPostprocessPipelineImpl; #define RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(attribute, cpu_type) \ template <> \ struct RunPostprocessPipelineImpl { \ template \ attribute static constexpr output_register_type \ run(std::tuple pipeline, input_register_type input, Index offset) { \ return std::get<0>(pipeline).run(input, offset); \ } \ template \ attribute static constexpr output_register_type \ run(std::tuple pipeline, input_register_type input, Index offset) { \ return run( \ DropFirstStage(pipeline), \ std::get<0>(pipeline).run(input, offset), offset); \ } \ }; RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_SSE2, CPUType::SSE2) RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_SSSE3, CPUType::SSSE3) RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_AVX2, CPUType::AVX2) RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_AVX512BW, CPUType::AVX512BW) } // anonymous namespace template class InitedPostprocessPipeline {}; template constexpr InitedPostprocessPipeline InitPostprocessPipeline(std::tuple pipeline) { return InitedPostprocessPipeline(pipeline); } #define INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(attribute, cpu_type) \ template \ class InitedPostprocessPipeline { \ public: \ using InputRegister = input_register_type...>; \ using OutputRegister = output_register_type...>; \ InitedPostprocessPipeline(std::tuple pipeline) \ : inited_pipeline(InitPostprocessPipelineImpl(pipeline)) {} \ attribute inline OutputRegister run(InputRegister input, Index offset) { \ return RunPostprocessPipelineImpl::run(inited_pipeline, input, offset); \ } \ attribute inline void run(const InputRegister* input, unsigned length, OutputRegister* output) { \ for (unsigned i = 0; i < length; ++i) \ output[i] = RunPostprocessPipelineImpl::run(inited_pipeline, input[i], i * sizeof(InputRegister)); \ } \ private: \ const std::tuple...> inited_pipeline; \ }; INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_SSE2, CPUType::SSE2) INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_SSSE3, CPUType::SSSE3) INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_AVX2, CPUType::AVX2) INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_AVX512BW, CPUType::AVX512BW) }