1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
#pragma once
#include "intrinsics.h"
#include "types.h"
#include "utils.h"
#include <tuple>
namespace intgemm {
// A postprocess pipeline is a std::tuple holding one stage object per entry.
// Element 0 is the stage that is run first; the remaining elements follow in
// tuple order.
template <typename... Stages>
using PostprocessPipeline = std::tuple<Stages...>;
// Packs the given stage objects into a pipeline tuple.
//
// Every argument is copied into the returned tuple, preserving argument order.
// Calling it with no arguments yields an empty std::tuple<>.
template <typename... Stages>
constexpr std::tuple<Stages...> CreatePostprocessPipeline(const Stages&... stages) {
  return std::tuple<Stages...>(stages...);
}
// Forward declaration of the per-CPU implementation of the stage `Postprocess`.
// No definition is given in this file. The code below relies on each
// PostprocessImpl being constructible from its stage object, exposing
// InputRegister / OutputRegister member types, and providing
// run(input, offset).
// NOTE(review): specializations are presumably defined elsewhere in the
// project - confirm.
template <typename Postprocess, CPUType CpuType>
class PostprocessImpl;
namespace { // anonymous namespace
// Register type consumed by a pipeline made of `Stages...`: the InputRegister
// member type of the first stage in the pack.
template <typename... Stages>
using input_register_type =
    typename std::tuple_element<0, std::tuple<Stages...>>::type::InputRegister;
// Register type produced by a pipeline made of `Stages...`: the OutputRegister
// member type of the last stage in the pack. The pack must not be empty.
template <typename... Stages>
using output_register_type =
    typename std::tuple_element<sizeof...(Stages) - 1, std::tuple<Stages...>>::type::OutputRegister;
template <typename FirstStage, typename... RestStages>
constexpr std::tuple<RestStages...> DropFirstStage(const std::tuple<FirstStage, RestStages...>& pipeline) {
return make_subtuple(pipeline, sequence_popfront<make_sequence<sizeof...(RestStages) + 1>>());
}
// Recursion terminator: an empty pipeline initializes to an empty tuple.
// The argument only takes part in overload selection; its value is not used.
template <CPUType CpuType>
constexpr std::tuple<> InitPostprocessPipelineImpl(std::tuple<> /*pipeline*/) {
  return std::tuple<>{};
}
// Converts a tuple of stage objects into a tuple of their implementations for
// `CpuType`.
//
// The first stage is turned into PostprocessImpl<FirstStage, CpuType> by
// constructing it from the stage object; the remaining stages are handled by
// recursing on the tuple with its first element dropped, and the two results
// are concatenated.
template <CPUType CpuType, typename FirstStage, typename... RestStages>
constexpr std::tuple<PostprocessImpl<FirstStage, CpuType>, PostprocessImpl<RestStages, CpuType>...>
InitPostprocessPipelineImpl(std::tuple<FirstStage, RestStages...> pipeline) {
  using HeadImpl = PostprocessImpl<FirstStage, CpuType>;
  return std::tuple_cat(
      std::tuple<HeadImpl>(HeadImpl(std::get<0>(pipeline))),
      InitPostprocessPipelineImpl<CpuType, RestStages...>(DropFirstStage(pipeline)));
}
// Runs an initialized pipeline on a single register. Only declared here; the
// definitions are the explicit specializations generated by the macro below.
template <CPUType CpuType>
struct RunPostprocessPipelineImpl;

// Generates RunPostprocessPipelineImpl<cpu_type>. `attribute` is placed in
// front of each run() overload.
//
// The pipeline tuple is taken by value, so run() is invoked on a non-const copy
// of each stage.
#define RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(attribute, cpu_type) \
  template <> \
  struct RunPostprocessPipelineImpl<cpu_type> { \
    /* Single remaining stage: run it and return its result. */ \
    template <typename LastStage> \
    attribute static constexpr output_register_type<LastStage> \
    run(std::tuple<LastStage> pipeline, input_register_type<LastStage> input, Index offset) { \
      return std::get<0>(pipeline).run(input, offset); \
    } \
    /* Several stages: feed the first stage's result into the rest of the */ \
    /* pipeline, passing the same offset to every stage. */ \
    template <typename... Stages> \
    attribute static constexpr output_register_type<Stages...> \
    run(std::tuple<Stages...> pipeline, input_register_type<Stages...> input, Index offset) { \
      return run( \
          DropFirstStage(pipeline), \
          std::get<0>(pipeline).run(input, offset), \
          offset); \
    } \
  };
// Generate one RunPostprocessPipelineImpl specialization per supported CPU
// type, each paired with its INTGEMM_* attribute macro.
// NOTE(review): the INTGEMM_* macros are not defined in this file; presumably
// they are function target attributes from one of the included headers - confirm.
RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_SSE2, CPUType::SSE2)
RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_SSSE3, CPUType::SSSE3)
RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_AVX2, CPUType::AVX2)
RUN_POSTPROCESS_PIPELINE_IMPL_INSERT_IMPL(INTGEMM_AVX512BW, CPUType::AVX512BW)
} // anonymous namespace
// Primary template: an empty class. The working implementations are the
// partial specializations generated further down by
// INITED_POSTPROCESS_PIPELINE_INSERT_IMPL for SSE2, SSSE3, AVX2 and AVX512BW;
// any other CpuType value selects this empty class.
template <CPUType CpuType, typename... Stages>
class InitedPostprocessPipeline {};
// Builds the initialized pipeline for `CpuType` from a tuple of stage objects.
// CpuType must be given explicitly; Stages are deduced from the tuple.
template <CPUType CpuType, typename... Stages>
constexpr InitedPostprocessPipeline<CpuType, Stages...> InitPostprocessPipeline(std::tuple<Stages...> pipeline) {
  using Inited = InitedPostprocessPipeline<CpuType, Stages...>;
  return Inited(pipeline);
}
// Generates the InitedPostprocessPipeline partial specialization for
// `cpu_type`. `attribute` is placed in front of both run() overloads.
//
// The class stores one PostprocessImpl<Stage, cpu_type> per stage, built once
// in the constructor, and offers two ways to run them: on a single register,
// or over an array of registers.
#define INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(attribute, cpu_type) \
  template <typename... Stages> \
  class InitedPostprocessPipeline<cpu_type, Stages...> { \
  public: \
    /* Input type of the first stage / output type of the last stage. */ \
    using InputRegister = input_register_type<PostprocessImpl<Stages, cpu_type>...>; \
    using OutputRegister = output_register_type<PostprocessImpl<Stages, cpu_type>...>; \
    /* Initializes every stage implementation from its stage object. */ \
    InitedPostprocessPipeline(std::tuple<Stages...> pipeline) \
      : inited_pipeline(InitPostprocessPipelineImpl<cpu_type, Stages...>(pipeline)) {} \
    /* Runs all stages on one register; `offset` is forwarded to each stage. */ \
    attribute inline OutputRegister run(InputRegister input, Index offset) { \
      return RunPostprocessPipelineImpl<cpu_type>::run(inited_pipeline, input, offset); \
    } \
    /* Runs all stages on input[0..length) and stores the results in output. */ \
    /* The offset passed for register k is k * sizeof(InputRegister). */ \
    attribute inline void run(const InputRegister* input, unsigned length, OutputRegister* output) { \
      for (unsigned reg = 0; reg < length; ++reg) { \
        output[reg] = RunPostprocessPipelineImpl<cpu_type>::run( \
            inited_pipeline, input[reg], reg * sizeof(InputRegister)); \
      } \
    } \
  private: \
    const std::tuple<PostprocessImpl<Stages, cpu_type>...> inited_pipeline; \
  };
// Generate one InitedPostprocessPipeline partial specialization per supported
// CPU type, using the same attribute / CPU type pairs as the
// RunPostprocessPipelineImpl specializations above.
INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_SSE2, CPUType::SSE2)
INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_SSSE3, CPUType::SSSE3)
INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_AVX2, CPUType::AVX2)
INITED_POSTPROCESS_PIPELINE_INSERT_IMPL(INTGEMM_AVX512BW, CPUType::AVX512BW)
}
|