FFmpeg
dsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC DSP init for x86
3  *
4  * Copyright (C) 2022-2024 Nuo Mi
5  * Copyright (c) 2023-2024 Wu Jianhua
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vvc/dec.h"
#include "libavcodec/vvc/ctu.h"
#include "libavcodec/vvc/dsp.h"
#include "libavcodec/x86/h26x/h2656dsp.h"
33 
34 #if ARCH_X86_64
35 
/* Symbol name <func>_<depth>_<isa>, e.g. bf(foo, 10, avx2) -> foo_10_avx2. */
#define bf(func, depth, isa) func ## _ ## depth ## _ ## isa
/* Symbol name <func>_<bits>bpc_<isa>, e.g. BF(foo, 16, avx2) -> foo_16bpc_avx2. */
#define BF(func, bits, isa)  func ## _ ## bits ## bpc_ ## isa
38 
/*
 * Declare a single DMVR assembly routine. The four variants share one
 * signature; only the infix between "ff_vvc_dmvr" and the bit depth
 * ("_", "_h_", "_v_" or "_hv_") differs.
 */
#define DMVR_PROTOTYPE(infix, bd, opt) \
void ff_vvc_dmvr ## infix ## bd ## _ ## opt(int16_t *dst, \
    const uint8_t *src, ptrdiff_t src_stride, int height, \
    intptr_t mx, intptr_t my, int width);

/* Declare the plain, h, v and hv DMVR routines for one bit depth. */
#define DMVR_PROTOTYPES(bd, opt) \
    DMVR_PROTOTYPE(_,    bd, opt) \
    DMVR_PROTOTYPE(_h_,  bd, opt) \
    DMVR_PROTOTYPE(_v_,  bd, opt) \
    DMVR_PROTOTYPE(_hv_, bd, opt)

DMVR_PROTOTYPES( 8, avx2)
DMVR_PROTOTYPES(10, avx2)
DMVR_PROTOTYPES(12, avx2)
52 
/*
 * Declare ff_vvc_apply_bdof_<depth>_<isa>() and store it in
 * c->inter.apply_bdof. A pointer named `c` must be in scope where the
 * macro is used.
 */
#define OF_INIT(depth, isa) do { \
    void ff_vvc_apply_bdof_ ## depth ## _ ## isa(uint8_t *dst, \
        ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, \
        int w, int h); \
    c->inter.apply_bdof = ff_vvc_apply_bdof_ ## depth ## _ ## isa; \
} while (0)
59 
60 #define ALF_BPC_PROTOTYPES(bpc, opt) \
61 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
62  const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
63 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
64  intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
65 
66 ALF_BPC_PROTOTYPES(8, avx2)
67 ALF_BPC_PROTOTYPES(16, avx2)
68 
69 #if ARCH_X86_64
/*
 * Generate the static adapter vvc_put_<fn>_<bits>_<isa>(), which forwards
 * to ff_h2656_put_<fn>_<bits>_<isa>(). The adapter inserts the constant
 * 2 * MAX_PB_SIZE as the second argument (presumably the destination
 * stride -- confirm against the ff_h2656_put_* prototypes) and passes
 * everything else through unchanged.
 */
#define FW_PUT(fn, bits, isa) \
static void vvc_put_ ## fn ## _ ## bits ## _ ## isa(int16_t *dst, \
    const uint8_t *src, ptrdiff_t srcstride, int height, \
    const int8_t *hf, const int8_t *vf, int width) \
{ \
    ff_h2656_put_ ## fn ## _ ## bits ## _ ## isa(dst, 2 * MAX_PB_SIZE, \
        src, srcstride, height, hf, vf, width); \
}
76 
#if HAVE_SSE4_EXTERNAL
/* One adapter per size suffix 4, 8, 16, 32, 64 and 128 of a filter kind. */
#define FW_PUT_TAP(kind, bits, isa) \
    FW_PUT(kind ## 4,   bits, isa) \
    FW_PUT(kind ## 8,   bits, isa) \
    FW_PUT(kind ## 16,  bits, isa) \
    FW_PUT(kind ## 32,  bits, isa) \
    FW_PUT(kind ## 64,  bits, isa) \
    FW_PUT(kind ## 128, bits, isa)

/* Same set as FW_PUT_TAP, preceded by the additional size suffix 2. */
#define FW_PUT_4TAP(kind, bits, isa) \
    FW_PUT(kind ## 2, bits, isa) \
    FW_PUT_TAP(kind, bits, isa)

/* SSE4 adapters that include size 2: pixels and the 4-tap h / v / hv kinds. */
#define FW_PUT_4TAP_SSE4(bits) \
    FW_PUT_4TAP(pixels,  bits, sse4) \
    FW_PUT_4TAP(4tap_h,  bits, sse4) \
    FW_PUT_4TAP(4tap_v,  bits, sse4) \
    FW_PUT_4TAP(4tap_hv, bits, sse4)

/* SSE4 adapters for the 8-tap h / v / hv kinds (sizes 4..128 only). */
#define FW_PUT_8TAP_SSE4(bits) \
    FW_PUT_TAP(8tap_h,  bits, sse4) \
    FW_PUT_TAP(8tap_v,  bits, sse4) \
    FW_PUT_TAP(8tap_hv, bits, sse4)

/* Every SSE4 adapter for one bit depth. */
#define FW_PUT_SSE4(bits) \
    FW_PUT_4TAP_SSE4(bits) \
    FW_PUT_8TAP_SSE4(bits)

FW_PUT_SSE4( 8)
FW_PUT_SSE4(10)
FW_PUT_SSE4(12)
#endif
109 
110 #if HAVE_AVX2_EXTERNAL
111 #define FW_PUT_TAP_AVX2(n, bitd) \
112  FW_PUT(n ## tap_h32, bitd, avx2) \
113  FW_PUT(n ## tap_h64, bitd, avx2) \
114  FW_PUT(n ## tap_h128, bitd, avx2) \
115  FW_PUT(n ## tap_v32, bitd, avx2) \
116  FW_PUT(n ## tap_v64, bitd, avx2) \
117  FW_PUT(n ## tap_v128, bitd, avx2)
118 
119 #define FW_PUT_AVX2(bitd) \
120  FW_PUT(pixels32, bitd, avx2) \
121  FW_PUT(pixels64, bitd, avx2) \
122  FW_PUT(pixels128, bitd, avx2) \
123  FW_PUT_TAP_AVX2(4, bitd) \
124  FW_PUT_TAP_AVX2(8, bitd) \
125 
126 FW_PUT_AVX2( 8)
127 FW_PUT_AVX2(10)
128 FW_PUT_AVX2(12)
129 
130 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
131  FW_PUT(n ## tap_h16, bitd, avx2) \
132  FW_PUT(n ## tap_v16, bitd, avx2) \
133  FW_PUT(n ## tap_hv16, bitd, avx2) \
134  FW_PUT(n ## tap_hv32, bitd, avx2) \
135  FW_PUT(n ## tap_hv64, bitd, avx2) \
136  FW_PUT(n ## tap_hv128, bitd, avx2)
137 
138 #define FW_PUT_16BPC_AVX2(bitd) \
139  FW_PUT(pixels16, bitd, avx2) \
140  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
141  FW_PUT_TAP_16BPC_AVX2(8, bitd)
142 
143 FW_PUT_16BPC_AVX2(10)
144 FW_PUT_16BPC_AVX2(12)
145 
146 #define ALF_FUNCS(bpc, bd, opt) \
147 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
148  const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
149 { \
150  BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
151  BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
152 } \
153 
154 ALF_FUNCS(8, 8, avx2)
155 ALF_FUNCS(16, 10, avx2)
156 ALF_FUNCS(16, 12, avx2)
157 
158 #endif
159 
/*
 * Declare the SAO band and edge assembly routines for one width suffix.
 * The band routine takes separate destination and source strides; the
 * edge routine takes a destination stride only.
 */
#define SAO_FILTER_FUNC(wd, bitd, opt) \
void ff_vvc_sao_band_filter_ ## wd ## _ ## bitd ## _ ## opt(uint8_t *dst, \
    const uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, \
    const int16_t *sao_offset_val, int sao_left_class, \
    int width, int height); \
void ff_vvc_sao_edge_filter_ ## wd ## _ ## bitd ## _ ## opt(uint8_t *dst, \
    const uint8_t *src, ptrdiff_t stride_dst, \
    const int16_t *sao_offset_val, int eo, int width, int height);

/* Declare both routines for all nine width suffixes of one bit depth. */
#define SAO_FILTER_FUNCS(bitd, opt) \
    SAO_FILTER_FUNC(  8, bitd, opt) \
    SAO_FILTER_FUNC( 16, bitd, opt) \
    SAO_FILTER_FUNC( 32, bitd, opt) \
    SAO_FILTER_FUNC( 48, bitd, opt) \
    SAO_FILTER_FUNC( 64, bitd, opt) \
    SAO_FILTER_FUNC( 80, bitd, opt) \
    SAO_FILTER_FUNC( 96, bitd, opt) \
    SAO_FILTER_FUNC(112, bitd, opt) \
    SAO_FILTER_FUNC(128, bitd, opt)

SAO_FILTER_FUNCS( 8, avx2)
SAO_FILTER_FUNCS(10, avx2)
SAO_FILTER_FUNCS(12, avx2)
180 
/* Store the <type> SAO routine with width suffix <wd> in slot <idx>. */
#define SAO_FILTER_SLOT(type, idx, wd, bitd, opt) \
    c->sao.type ## _filter[idx] = \
        ff_vvc_sao_ ## type ## _filter_ ## wd ## _ ## bitd ## _ ## opt

/*
 * Fill c->sao.<type>_filter[0..8] with the routines for width suffixes
 * 8, 16, 32, 48, 64, 80, 96, 112 and 128, in that order. A pointer named
 * `c` must be in scope where the macro is used.
 */
#define SAO_FILTER_INIT(type, bitd, opt) do { \
    SAO_FILTER_SLOT(type, 0,   8, bitd, opt); \
    SAO_FILTER_SLOT(type, 1,  16, bitd, opt); \
    SAO_FILTER_SLOT(type, 2,  32, bitd, opt); \
    SAO_FILTER_SLOT(type, 3,  48, bitd, opt); \
    SAO_FILTER_SLOT(type, 4,  64, bitd, opt); \
    SAO_FILTER_SLOT(type, 5,  80, bitd, opt); \
    SAO_FILTER_SLOT(type, 6,  96, bitd, opt); \
    SAO_FILTER_SLOT(type, 7, 112, bitd, opt); \
    SAO_FILTER_SLOT(type, 8, 128, bitd, opt); \
} while (0)

/* Install both SAO tables (band, then edge) for one bit depth. */
#define SAO_INIT(bitd, opt) do { \
    SAO_FILTER_INIT(band, bitd, opt); \
    SAO_FILTER_INIT(edge, bitd, opt); \
} while (0)
197 
/*
 * Declare ff_vvc_avg_<bits>_<isa>() and ff_vvc_w_avg_<bits>_<isa>() and
 * store them in c->inter.avg and c->inter.w_avg. A pointer named `c`
 * must be in scope where the macro is used.
 */
#define AVG_INIT(bits, isa) do { \
    void bf(ff_vvc_avg, bits, isa)(uint8_t *dst, ptrdiff_t dst_stride, \
        const int16_t *src0, const int16_t *src1, \
        int width, int height); \
    void bf(ff_vvc_w_avg, bits, isa)(uint8_t *dst, ptrdiff_t dst_stride, \
        const int16_t *src0, const int16_t *src1, \
        int width, int height, \
        int denom, int w0, int w1, int o); \
    c->inter.avg   = bf(ff_vvc_avg, bits, isa); \
    c->inter.w_avg = bf(ff_vvc_w_avg, bits, isa); \
} while (0)
207 
/*
 * Fill the 2x2 table c->inter.dmvr with the AVX2 DMVR routines for one
 * bit depth: [0][0] plain, [0][1] h, [1][0] v, [1][1] hv. A pointer
 * named `c` must be in scope where the macro is used.
 */
#define DMVR_INIT(bits) do { \
    c->inter.dmvr[0][0] = ff_vvc_dmvr_    ## bits ## _avx2; \
    c->inter.dmvr[0][1] = ff_vvc_dmvr_h_  ## bits ## _avx2; \
    c->inter.dmvr[1][0] = ff_vvc_dmvr_v_  ## bits ## _avx2; \
    c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_ ## bits ## _avx2; \
} while (0)
214 
/*
 * Fill one slot of a put table and the same slot of its "_uni" sibling:
 * the first with the local vvc_put_* adapter, the second with the
 * matching ff_h2656_put_uni_* routine. Expands to two complete
 * statements, so it is not safe as the body of an unbraced if/else.
 */
#define PEL_LINK(tbl, plane, size, v, h, name, bits, isa) \
    tbl[plane][size][v][h] = vvc_put_ ## name ## _ ## bits ## _ ## isa; \
    tbl ## _uni[plane][size][v][h] = ff_h2656_put_uni_ ## name ## _ ## bits ## _ ## isa;

/* Link size slots 1..6 (suffixes 4, 8, 16, 32, 64, 128) of one filter kind. */
#define MC_TAP_LINKS(tbl, plane, v, h, kind, bits, isa) \
    PEL_LINK(tbl, plane, 1, v, h, kind ## 4,   bits, isa); \
    PEL_LINK(tbl, plane, 2, v, h, kind ## 8,   bits, isa); \
    PEL_LINK(tbl, plane, 3, v, h, kind ## 16,  bits, isa); \
    PEL_LINK(tbl, plane, 4, v, h, kind ## 32,  bits, isa); \
    PEL_LINK(tbl, plane, 5, v, h, kind ## 64,  bits, isa); \
    PEL_LINK(tbl, plane, 6, v, h, kind ## 128, bits, isa);

/* The 8-tap kinds are linked into the LUMA plane of the table. */
#define MC_8TAP_LINKS(tbl, v, h, kind, bits, isa) \
    MC_TAP_LINKS(tbl, LUMA, v, h, kind, bits, isa)

/* LUMA: [0][0] pixels, [0][1] 8tap_h, [1][0] 8tap_v, [1][1] 8tap_hv. */
#define MC_8TAP_LINKS_SSE4(bits) \
    MC_8TAP_LINKS(c->inter.put, 0, 0, pixels,  bits, sse4); \
    MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h,  bits, sse4); \
    MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v,  bits, sse4); \
    MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bits, sse4)

/* The 4-tap kinds go into the CHROMA plane, which also has size slot 0 (suffix 2). */
#define MC_4TAP_LINKS(tbl, v, h, kind, bits, isa) \
    PEL_LINK(tbl, CHROMA, 0, v, h, kind ## 2, bits, isa); \
    MC_TAP_LINKS(tbl, CHROMA, v, h, kind, bits, isa)

/* CHROMA: [0][0] pixels, [0][1] 4tap_h, [1][0] 4tap_v, [1][1] 4tap_hv. */
#define MC_4TAP_LINKS_SSE4(bits) \
    MC_4TAP_LINKS(c->inter.put, 0, 0, pixels,  bits, sse4); \
    MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h,  bits, sse4); \
    MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v,  bits, sse4); \
    MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bits, sse4)

/* Link every SSE4 put routine for one bit depth: chroma first, then luma. */
#define MC_LINK_SSE4(bits) \
    MC_4TAP_LINKS_SSE4(bits) \
    MC_8TAP_LINKS_SSE4(bits)
249 
/*
 * Link the AVX2 pixels, h and v routines of a <taps>-tap filter into
 * size slots 4, 5 and 6 (suffixes 32, 64, 128) of one plane.
 */
#define MC_TAP_LINKS_AVX2(plane, taps, bits) do { \
    PEL_LINK(c->inter.put, plane, 4, 0, 0, pixels32,         bits, avx2) \
    PEL_LINK(c->inter.put, plane, 5, 0, 0, pixels64,         bits, avx2) \
    PEL_LINK(c->inter.put, plane, 6, 0, 0, pixels128,        bits, avx2) \
    PEL_LINK(c->inter.put, plane, 4, 0, 1, taps ## tap_h32,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 5, 0, 1, taps ## tap_h64,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 6, 0, 1, taps ## tap_h128, bits, avx2) \
    PEL_LINK(c->inter.put, plane, 4, 1, 0, taps ## tap_v32,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 5, 1, 0, taps ## tap_v64,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 6, 1, 0, taps ## tap_v128, bits, avx2) \
    } while (0)

/* LUMA gets the 8-tap routines, CHROMA the 4-tap ones. */
#define MC_LINKS_AVX2(bits) \
    MC_TAP_LINKS_AVX2(LUMA,   8, bits); \
    MC_TAP_LINKS_AVX2(CHROMA, 4, bits);

/*
 * Link the additional AVX2 routines: size slot 3 (suffix 16) for
 * pixels, h, v and hv, plus hv in size slots 4, 5 and 6.
 */
#define MC_TAP_LINKS_16BPC_AVX2(plane, taps, bits) do { \
    PEL_LINK(c->inter.put, plane, 3, 0, 0, pixels16,          bits, avx2) \
    PEL_LINK(c->inter.put, plane, 3, 0, 1, taps ## tap_h16,   bits, avx2) \
    PEL_LINK(c->inter.put, plane, 3, 1, 0, taps ## tap_v16,   bits, avx2) \
    PEL_LINK(c->inter.put, plane, 3, 1, 1, taps ## tap_hv16,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 4, 1, 1, taps ## tap_hv32,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 5, 1, 1, taps ## tap_hv64,  bits, avx2) \
    PEL_LINK(c->inter.put, plane, 6, 1, 1, taps ## tap_hv128, bits, avx2) \
    } while (0)

/* LUMA gets the 8-tap routines, CHROMA the 4-tap ones. */
#define MC_LINKS_16BPC_AVX2(bits) \
    MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bits); \
    MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bits);
279 
/* AVX2 SAD routine; only its prototype appears in this file. */
int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1,
                    int dx, int dy, int block_w, int block_h);
/* Store it in c->inter.sad; a pointer named `c` must be in scope. */
#define SAD_INIT() (c->inter.sad = ff_vvc_sad_avx2)
282 
/*
 * Declare the ALF luma and chroma filter assembly routines for one bit
 * depth and install them in c->alf.filter[LUMA] / [CHROMA]. Also sets
 * c->alf.classify to the static vvc_alf_classify_<bits>_<isa>() wrapper
 * generated by ALF_FUNCS. A pointer named `c` must be in scope where
 * the macro is used.
 */
#define ALF_INIT(bits, isa) do { \
    void bf(ff_vvc_alf_filter_luma, bits, isa)(uint8_t *dst, \
        ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
        int width, int height, \
        const int16_t *filter, const int16_t *clip, int vb_pos); \
    void bf(ff_vvc_alf_filter_chroma, bits, isa)(uint8_t *dst, \
        ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
        int width, int height, \
        const int16_t *filter, const int16_t *clip, int vb_pos); \
    c->alf.filter[LUMA]   = bf(ff_vvc_alf_filter_luma, bits, isa); \
    c->alf.filter[CHROMA] = bf(ff_vvc_alf_filter_chroma, bits, isa); \
    c->alf.classify       = bf(vvc_alf_classify, bits, isa); \
} while (0)
294 
295 #endif
296 
297 
298 #endif // ARCH_X86_64
299 
300 av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
301 {
302 #if ARCH_X86_64
303  const int cpu_flags = av_get_cpu_flags();
304 
305  switch (bd) {
306  case 8:
307 #if HAVE_SSE4_EXTERNAL
308  if (EXTERNAL_SSE4(cpu_flags)) {
309  MC_LINK_SSE4(8);
310  }
311 #endif
312 #if HAVE_AVX2_EXTERNAL
314  // inter
315  AVG_INIT(8, avx2);
316  DMVR_INIT(8);
317  MC_LINKS_AVX2(8);
318  OF_INIT(8, avx2);
319  SAD_INIT();
320 
321  // filter
322  ALF_INIT(8, avx2);
323  SAO_INIT(8, avx2);
324  }
325 #endif
326  break;
327  case 10:
328 #if HAVE_SSE4_EXTERNAL
329  if (EXTERNAL_SSE4(cpu_flags)) {
330  MC_LINK_SSE4(10);
331  }
332 #endif
333 #if HAVE_AVX2_EXTERNAL
335  // inter
336  AVG_INIT(10, avx2);
337  DMVR_INIT(10);
338  MC_LINKS_AVX2(10);
339  MC_LINKS_16BPC_AVX2(10);
340  OF_INIT(10, avx2);
341  SAD_INIT();
342 
343  // filter
344  ALF_INIT(10, avx2);
345  SAO_INIT(10, avx2);
346  }
347 #endif
348  break;
349  case 12:
350 #if HAVE_SSE4_EXTERNAL
351  if (EXTERNAL_SSE4(cpu_flags)) {
352  MC_LINK_SSE4(12);
353  }
354 #endif
355 #if HAVE_AVX2_EXTERNAL
357  // inter
358  AVG_INIT(12, avx2);
359  DMVR_INIT(12);
360  MC_LINKS_AVX2(12);
361  MC_LINKS_16BPC_AVX2(12);
362  OF_INIT(12, avx2);
363  SAD_INIT();
364 
365  // filter
366  ALF_INIT(12, avx2);
367  SAO_INIT(12, avx2);
368  }
369 #endif
370  break;
371  default:
372  break;
373  }
374 #endif
375 }
cpu.h
src1
const pixel * src1
Definition: h264pred_template.c:420
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:73
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
DMVR_PROTOTYPES
#define DMVR_PROTOTYPES(bd, opt)
Definition: dsp_init.c:39
av_cold
#define av_cold
Definition: attributes.h:111
dsp.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
cpu.h
DMVR_INIT
#define DMVR_INIT(bd)
Definition: dsp_init.c:208
attributes.h
h2656dsp.h
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:62
src0
const pixel *const src0
Definition: h264pred_template.c:419
ctu.h
ff_vvc_dsp_init_x86
av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: dsp_init.c:300
dec.h
VVCDSPContext
Definition: dsp.h:170