media-libs/zimg/files/zimg-2.7.5-sse2.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231

From e30112df0ca703be82ed2c852511916fc46defbd Mon Sep 17 00:00:00 2001
From: sekrit-twc <noreply@example.com>
Date: Fri, 22 Mar 2019 18:51:14 -0700
Subject: [PATCH] colorspace: use bfloat16 for SSE2 linear-to-gamma LUT

On Skylake, processing 512 pixel array:

direction	cycles/sample	cycles/pxvector
     g->l	         2.35	           9.43
     l->g	         2.49	           9.97
---
 .../colorspace/x86/operation_impl_avx2.cpp    |   3 +-
 .../colorspace/x86/operation_impl_sse2.cpp    | 104 ++++++++++++++++--
 test/colorspace/x86/colorspace_sse2_test.cpp  |  14 +--
 3 files changed, 100 insertions(+), 21 deletions(-)

diff --git a/src/zimg/colorspace/x86/operation_impl_avx2.cpp b/src/zimg/colorspace/x86/operation_impl_avx2.cpp
index bbbbf896..f0e7f792 100644
--- a/src/zimg/colorspace/x86/operation_impl_avx2.cpp
+++ b/src/zimg/colorspace/x86/operation_impl_avx2.cpp
@@ -114,8 +114,7 @@ class ToGammaLutOperationAVX2 final : public Operation {
 	{
 		EnsureSinglePrecision x87;
 
-		// Allocate an extra LUT entry so that indexing can be done by multipying by a power of 2.
-		for (unsigned long i = 0; i <= UINT16_MAX; ++i) {
+		for (size_t i = 0; i <= UINT16_MAX; ++i) {
 			uint16_t half = static_cast<uint16_t>(i);
 			float x = _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(half)));
 			m_lut[i] = func(x * prescale);
diff --git a/src/zimg/colorspace/x86/operation_impl_sse2.cpp b/src/zimg/colorspace/x86/operation_impl_sse2.cpp
index 48645031..da9d4dbc 100644
--- a/src/zimg/colorspace/x86/operation_impl_sse2.cpp
+++ b/src/zimg/colorspace/x86/operation_impl_sse2.cpp
@@ -3,11 +3,13 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <type_traits>
 #include <vector>
 #include <emmintrin.h>
 #include "common/align.h"
 #include "common/ccdep.h"
 #include "common/make_unique.h"
+#include "common/x86/sse2_util.h"
 #include "colorspace/gamma.h"
 #include "colorspace/operation.h"
 #include "colorspace/operation_impl.h"
@@ -20,14 +22,25 @@ namespace {
 
 constexpr unsigned LUT_DEPTH = 16;
 
-void lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, float prescale, const float *src, float *dst, unsigned left, unsigned right)
+template <class T, class U>
+T bit_cast(const U &x) noexcept
+{
+	static_assert(sizeof(T) == sizeof(U), "object sizes must match");
+	static_assert(std::is_pod<T>::value && std::is_pod<U>::value, "object types must be POD");
+
+	T ret;
+	std::copy_n(reinterpret_cast<const char *>(&x), sizeof(x), reinterpret_cast<char *>(&ret));
+	return ret;
+}
+
+void to_linear_lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, const float *src, float *dst, unsigned left, unsigned right)
 {
 	unsigned vec_left = ceil_n(left, 4);
 	unsigned vec_right = floor_n(right, 4);
 
 	const int32_t lut_limit = static_cast<int32_t>(1) << lut_depth;
 
-	const __m128 scale = _mm_set_ps1(0.5f * prescale * lut_limit);
+	const __m128 scale = _mm_set_ps1(0.5f * lut_limit);
 	const __m128 offset = _mm_set_ps1(0.25f * lut_limit);
 	const __m128i limit = _mm_set1_epi16(std::min(lut_limit + INT16_MIN, static_cast<int32_t>(INT16_MAX)));
 	const __m128i bias_epi16 = _mm_set1_epi16(INT16_MIN);
@@ -73,16 +86,61 @@ void lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, float presca
 	}
 }
 
+void to_gamma_lut_filter_line(const float *RESTRICT lut, const float *src, float *dst, unsigned left, unsigned right)
+{
+	unsigned vec_left = ceil_n(left, 4);
+	unsigned vec_right = floor_n(right, 4);
+
+	for (unsigned j = left; j < vec_left; ++j) {
+		__m128i x = _mm_castps_si128(_mm_load_ss(src + j));
+		__m128i msb = _mm_srli_epi32(x, 16);
+		__m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1));
+		x = mm_packus_epi32(msb, lsb);
+		x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+
+		dst[j] = lut[_mm_cvtsi128_si32(x)];
+	}
+	for (unsigned j = vec_left; j < vec_right; j += 4) {
+		__m128i x = _mm_castps_si128(_mm_load_ps(src + j));
+		__m128i msb = _mm_srli_epi32(x, 16);
+		__m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1));
+		x = mm_packus_epi32(msb, lsb);
+		x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+
+#if SIZE_MAX >= UINT64_MAX
+		uint64_t tmp = _mm_cvtsi128_si64(x);
+		dst[j + 0] = lut[tmp & 0xFFFFU];
+		dst[j + 1] = lut[(tmp >> 16) & 0xFFFFU];
+		dst[j + 2] = lut[(tmp >> 32) & 0xFFFFU];
+		dst[j + 3] = lut[tmp >> 48];
+#else
+		uint32_t tmp0 = _mm_cvtsi128_si32(x);
+		uint32_t tmp1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 0, 1)));
+		dst[j + 0] = lut[tmp0 & 0xFFFFU];
+		dst[j + 1] = lut[tmp0 >> 16];
+		dst[j + 2] = lut[tmp1 & 0xFFFFU];
+		dst[j + 3] = lut[tmp1 >> 16];
+#endif
+	}
+	for (unsigned j = vec_right; j < right; ++j) {
+		__m128i x = _mm_castps_si128(_mm_load_ss(src + j));
+		__m128i msb = _mm_srli_epi32(x, 16);
+		__m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1));
+		x = mm_packus_epi32(msb, lsb);
+		x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
+
+		dst[j] = lut[_mm_cvtsi128_si32(x)];
+	}
+}
+
 
-class LutOperationSSE2 final : public Operation {
+class ToLinearLutOperationSSE2 final : public Operation {
 	std::vector<float> m_lut;
 	unsigned m_lut_depth;
-	float m_prescale;
 public:
-	LutOperationSSE2(gamma_func func, unsigned lut_depth, float prescale, float postscale) :
+	ToLinearLutOperationSSE2(gamma_func func, unsigned lut_depth, float postscale) :
 		m_lut((1UL << lut_depth) + 1),
-		m_lut_depth{ lut_depth },
-		m_prescale{ static_cast<float>(prescale) }
+		m_lut_depth{ lut_depth }
 	{
 		EnsureSinglePrecision x87;
 
@@ -95,9 +153,31 @@ class LutOperationSSE2 final : public Operation {
 
 	void process(const float * const *src, float * const *dst, unsigned left, unsigned right) const override
 	{
-		lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[0], dst[0], left, right);
-		lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[1], dst[1], left, right);
-		lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[2], dst[2], left, right);
+		to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[0], dst[0], left, right);
+		to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[1], dst[1], left, right);
+		to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[2], dst[2], left, right);
+	}
+};
+
+class ToGammaLutOperationSSE2 final : public Operation {
+	std::vector<float> m_lut;
+public:
+	ToGammaLutOperationSSE2(gamma_func func, float prescale) :
+		m_lut(static_cast<uint32_t>(UINT16_MAX) + 1)
+	{
+		EnsureSinglePrecision x87;
+
+		for (size_t i = 0; i <= UINT16_MAX; ++i) {
+			float x = bit_cast<float>(static_cast<uint32_t>(i << 16));
+			m_lut[i] = func(x * prescale);
+		}
+	}
+
+	void process(const float * const *src, float * const *dst, unsigned left, unsigned right) const override
+	{
+		to_gamma_lut_filter_line(m_lut.data(), src[0], dst[0], left, right);
+		to_gamma_lut_filter_line(m_lut.data(), src[1], dst[1], left, right);
+		to_gamma_lut_filter_line(m_lut.data(), src[2], dst[2], left, right);
 	}
 };
 
@@ -109,7 +189,7 @@ std::unique_ptr<Operation> create_gamma_operation_sse2(const TransferFunction &t
 	if (!params.approximate_gamma)
 		return nullptr;
 
-	return ztd::make_unique<LutOperationSSE2>(transfer.to_gamma, LUT_DEPTH, transfer.to_gamma_scale, 1.0f);
+	return ztd::make_unique<ToGammaLutOperationSSE2>(transfer.to_gamma, transfer.to_gamma_scale);
 }
 
 std::unique_ptr<Operation> create_inverse_gamma_operation_sse2(const TransferFunction &transfer, const OperationParams &params)
@@ -117,7 +197,7 @@ std::unique_ptr<Operation> create_inverse_gamma_operation_sse2(const TransferFun
 	if (!params.approximate_gamma)
 		return nullptr;
 
-	return ztd::make_unique<LutOperationSSE2>(transfer.to_linear, LUT_DEPTH, 1.0f, transfer.to_linear_scale);
+	return ztd::make_unique<ToLinearLutOperationSSE2>(transfer.to_linear, LUT_DEPTH, transfer.to_linear_scale);
 }
 
 } // namespace colorspace
diff --git a/test/colorspace/x86/colorspace_sse2_test.cpp b/test/colorspace/x86/colorspace_sse2_test.cpp
index d5130868..ecaa05e7 100644
--- a/test/colorspace/x86/colorspace_sse2_test.cpp
+++ b/test/colorspace/x86/colorspace_sse2_test.cpp
@@ -53,9 +53,9 @@ TEST(ColorspaceConversionSSE2Test, test_transfer_lut)
 			"162687e701627cdc17283a32c36ea711d28a953e"
 		},
 		{
-			"492587e7ed75b7e3ab868bead6ade7a4137c6ea1",
-			"3b0694e9fbce61466cb5a575f300d784089b6cad",
-			"b68f103f52ccafae867d664d7f27fe56ae9208af"
+			"95f2715bd0d417028bebd5c5377180fcd5b01119",
+			"76f7c88b198f1ab08167f8162c1237b54f22007a",
+			"1099c3ae187c0a9f79acb9445761b6056218c779"
 		},
 		{
 			"4c0b5ffe768a7812d1ef102b4d8d52614838bc8e",
@@ -63,13 +63,13 @@ TEST(ColorspaceConversionSSE2Test, test_transfer_lut)
 			"85a277a80dfca2e21789cedd76aaee307dbc4562"
 		},
 		{
-			"df546ce0ad6f859499a96d2d697d896067e60e38",
-			"f0041b8a008ab45f0ea1319090ac7e8be0990d92",
-			"06880efb598e41f96fa79e04dbdfcccd50d6dc6f"
+			"5e35786d313e936566d9873ba7a08a8d6005b2ee",
+			"829fa88acfbbb26801871bf3cadf5cc2eb6830c9",
+			"f82fcad18a19b548d419a1952b6a7a423a684b62"
 		},
 	};
 	const double expected_tolinear_snr = 80.0;
-	const double expected_togamma_snr = 40.0;
+	const double expected_togamma_snr = 60.0;
 
 	SCOPED_TRACE("tolinear 709");
 	test_case({ MatrixCoefficients::RGB, TransferCharacteristics::REC_709, ColorPrimaries::UNSPECIFIED },