FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Declare a static SwsOpEntry with the given pixel type and component mask */
#define DECL_ENTRY(TYPE, MASK, NAME, ...) \
    static const SwsOpEntry op_##NAME = { \
        .type = SWS_PIXEL_##TYPE, \
        .mask = MASK, \
        __VA_ARGS__ \
    }

/* Like DECL_ENTRY, but also forward-declares the asm symbol ff_##NAME and
 * installs it as the entry's implementation function */
#define DECL_ASM(TYPE, MASK, NAME, ...) \
    void ff_##NAME(void); \
    DECL_ENTRY(TYPE, MASK, NAME, \
        .func = ff_##NAME, \
        __VA_ARGS__)

/* Declare an asm entry specialized for a specific component presence pattern;
 * X, Y, Z, W are 0/1 flags encoded into both the symbol name and the mask */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...) \
    DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), p##X##Y##Z##W##_##NAME, \
        __VA_ARGS__ \
    )

/* Reference an entry previously declared with DECL_PATTERN */
#define REF_PATTERN(NAME, X, Y, Z, W) \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four commonly used component patterns (1000, 1001, 1110, 1111 —
 * presumably single-component, single+alpha, three-component, and all four;
 * verify against the asm kernels) */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...) \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__); \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__) \

/* Reference all four entries declared by a matching DECL_COMMON_PATTERNS */
#define REF_COMMON_PATTERNS(NAME) \
    REF_PATTERN(NAME, 1, 0, 0, 0), \
    REF_PATTERN(NAME, 1, 0, 0, 1), \
    REF_PATTERN(NAME, 1, 1, 1, 0), \
    REF_PATTERN(NAME, 1, 1, 1, 1)
60 
61 static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
62 {
63  const SwsOp *op = params->op;
64 
65  /* 3-component reads/writes process one extra garbage word */
66  if (op->rw.packed && op->rw.elems == 3) {
67  switch (op->op) {
68  case SWS_OP_READ: out->over_read = sizeof(uint32_t); break;
69  case SWS_OP_WRITE: out->over_write = sizeof(uint32_t); break;
70  }
71  }
72 
73  return 0;
74 }
75 
/* Declare a read or write entry for a given element count / packing / frac
 * shift, with setup_rw reporting any over-read/over-write */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC) \
    DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##EXT, \
        .op = SWS_OP_##OP, \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC }, \
        .setup = setup_rw, \
    );

/* Declare packed read/write entries for 2, 3 and 4 interleaved components
 * at the given bit depth */
#define DECL_PACKED_RW(EXT, DEPTH) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed, READ, 4, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0) \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0) \

/* Declare a matched pair of bit-pack and bit-unpack entries for the given
 * per-component bit widths X, Y, Z, W */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W) \
    DECL_ASM(TYPE, SWS_COMP(0), pack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_PACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \
    \
    DECL_ASM(TYPE, SWS_COMP_MASK(X, Y, Z, W), unpack_##X##Y##Z##W##EXT, \
        .op = SWS_OP_UNPACK, \
        .pack.pattern = {X, Y, Z, W}, \
    ); \
101 
102 static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
103 {
104  const int mask = ff_sws_pixel_type_size(params->op->type) - 1;
105  for (int i = 0; i < 16; i++)
106  out->priv.u8[i] = (i & ~mask) | (mask - (i & mask));
107  return 0;
108 }
109 
/* Declare a byte-swap entry; reuses the generic per-pattern shuffle kernel,
 * with setup_swap_bytes providing the byte-reversal shuffle mask */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W) \
    DECL_ENTRY(TYPE, SWS_COMP_MASK(X, Y, Z, W), \
               p##X##Y##Z##W##_swap_bytes_##TYPE##EXT, \
        .op = SWS_OP_SWAP_BYTES, \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT, \
        .setup = setup_swap_bytes, \
    );

/* Declare an entry clearing component IDX to the all-ones value {-1, 1} */
#define DECL_CLEAR_ALPHA(EXT, IDX) \
    DECL_ASM(U8, SWS_COMP_ALL, clear_alpha##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear.mask = SWS_COMP(IDX), \
        .clear.value[IDX] = { -1, 1 }, \
    ); \

/* Declare an entry clearing component IDX to zero */
#define DECL_CLEAR_ZERO(EXT, IDX) \
    DECL_ASM(U8, SWS_COMP_ALL, clear_zero##IDX##EXT, \
        .op = SWS_OP_CLEAR, \
        .clear.mask = SWS_COMP(IDX), \
        .clear.value[IDX] = { 0, 1 }, \
    );
131 
132 static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
133 {
134  const SwsOp *op = params->op;
135  for (int i = 0; i < 4; i++)
136  out->priv.u32[i] = (uint32_t) op->clear.value[i].num;
137  return 0;
138 }
139 
/* Declare a generic clear entry for a per-component mask pattern; the actual
 * clear constants are provided at runtime by setup_clear */
#define DECL_CLEAR(EXT, X, Y, Z, W) \
    DECL_ASM(U8, SWS_COMP_ALL, p##X##Y##Z##W##_clear##EXT, \
        .op = SWS_OP_CLEAR, \
        .setup = setup_clear, \
        .clear.mask = SWS_COMP_MASK(X, Y, Z, W), \
    );

/* Declare a component swizzle entry with the fixed input order X, Y, Z, W */
#define DECL_SWIZZLE(EXT, X, Y, Z, W) \
    DECL_ASM(U8, SWS_COMP_ALL, swizzle_##X##Y##Z##W##EXT, \
        .op = SWS_OP_SWIZZLE, \
        .swizzle.in = {X, Y, Z, W}, \
    );

/* Declare pixel type conversion entries for all common component patterns */
#define DECL_CONVERT(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
    );

/* Like DECL_CONVERT, but with bit expansion (e.g. replicating into the
 * widened value) enabled */
#define DECL_EXPAND(EXT, FROM, TO) \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT, \
        .op = SWS_OP_CONVERT, \
        .convert.to = SWS_PIXEL_##TO, \
        .convert.expand = true, \
    );
165 
166 static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
167 {
168  out->priv.u16[0] = params->op->shift.amount;
169  return 0;
170 }
171 
/* Declare left/right shift entries for 16-bit data; .flexible marks them as
 * parametrized via their setup callback rather than fixed constants */
#define DECL_SHIFT16(EXT) \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT, \
        .op = SWS_OP_LSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT, \
        .op = SWS_OP_RSHIFT, \
        .setup = setup_shift, \
        .flexible = true, \
    );

/* Declare float min/max (clamp) entries, set up via the shared clamp helper */
#define DECL_MIN_MAX(EXT) \
    DECL_COMMON_PATTERNS(F32, min##EXT, \
        .op = SWS_OP_MIN, \
        .setup = ff_sws_setup_clamp, \
        .flexible = true, \
    ); \
    \
    DECL_COMMON_PATTERNS(F32, max##EXT, \
        .op = SWS_OP_MAX, \
        .setup = ff_sws_setup_clamp, \
        .flexible = true, \
    );

/* Declare float scale entries, set up via the shared scale helper */
#define DECL_SCALE(EXT) \
    DECL_COMMON_PATTERNS(F32, scale##EXT, \
        .op = SWS_OP_SCALE, \
        .setup = ff_sws_setup_scale, \
        .flexible = true, \
    );

/* Declare a fixed-ratio scale entry multiplying by (2^BITS - 1) */
#define DECL_EXPAND_BITS(EXT, BITS) \
    DECL_ASM(U##BITS, SWS_COMP(0), expand_bits##BITS##EXT, \
        .op = SWS_OP_SCALE, \
        .scale = { .num = ((1 << (BITS)) - 1), .den = 1 }, \
    );
210 
/* Prepare the private data for a dither operation: a single float constant
 * for a 1x1 matrix, or otherwise a float copy of the full dither matrix with
 * extra replicated rows so the asm can apply per-component row offsets by
 * over-reading, plus the per-component byte offsets into those rows. */
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
{
    const SwsOp *op = params->op;
    /* 1x1 matrix / single constant */
    if (!op->dither.size_log2) {
        const AVRational k = op->dither.matrix[0];
        out->priv.f32[0] = (float) k.num / k.den;
        return 0;
    }

    const int size = 1 << op->dither.size_log2;
    const int8_t *off = op->dither.y_offset;
    int max_offset = 0;
    /* Largest wrapped row offset; negative entries mean "no offset" for
     * that component and are skipped */
    for (int i = 0; i < 4; i++) {
        if (off[i] >= 0)
            max_offset = FFMAX(max_offset, off[i] & (size - 1));
    }

    /* Allocate extra rows to allow over-reading for row offsets. Note that
     * max_offset is currently never larger than 5, so the extra space needed
     * for this over-allocation is bounded by 5 * size * sizeof(float),
     * typically 320 bytes for a 16x16 dither matrix. */
    const int stride = size * sizeof(float);
    const int num_rows = size + max_offset;
    float *matrix = out->priv.ptr = av_mallocz(num_rows * stride);
    if (!matrix)
        return AVERROR(ENOMEM);
    out->free = ff_op_priv_free;

    /* Convert the rational matrix entries to floats */
    for (int i = 0; i < size * size; i++)
        matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;

    /* Replicate the first max_offset rows past the end of the matrix, so any
     * row index offset by up to max_offset stays inside the allocation */
    memcpy(&matrix[size * size], matrix, max_offset * stride);

    /* Store relative pointer offset to each row inside extra space */
    static_assert(sizeof(out->priv.ptr) <= sizeof(int16_t[4]),
                  ">8 byte pointers not supported");
    assert(max_offset * stride <= INT16_MAX);
    int16_t *off_out = &out->priv.i16[4];
    for (int i = 0; i < 4; i++)
        off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;

    return 0;
}
255 
/* Declare dither entries for the 1x1 (constant) dither case */
#define DECL_DITHER0(EXT) \
    DECL_COMMON_PATTERNS(F32, dither0##EXT, \
        .op = SWS_OP_DITHER, \
        .setup = setup_dither, \
    );

/* Declare a dither entry for a 2^SIZE x 2^SIZE matrix */
#define DECL_DITHER(EXT, SIZE) \
    DECL_ASM(F32, SWS_COMP_ALL, dither##SIZE##EXT, \
        .op = SWS_OP_DITHER, \
        .setup = setup_dither, \
        .dither_size = SIZE, \
    );
268 
269 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
270 {
271  const SwsOp *op = params->op;
272 
273  float *matrix = out->priv.ptr = av_mallocz(sizeof(float[4][5]));
274  if (!matrix)
275  return AVERROR(ENOMEM);
276  out->free = ff_op_priv_free;
277 
278  for (int y = 0; y < 4; y++) {
279  for (int x = 0; x < 5; x++)
280  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
281  }
282 
283  return 0;
284 }
285 
/* Declare a linear (matrix multiply) entry restricted to the coefficient
 * positions given by MASK; the float matrix is built by setup_linear */
#define DECL_LINEAR(EXT, NAME, MASK) \
    DECL_ASM(F32, SWS_COMP_ALL, NAME##EXT, \
        .op = SWS_OP_LINEAR, \
        .setup = setup_linear, \
        .linear_mask = (MASK), \
    );
292 
293 static bool check_filter_fma(const SwsImplParams *params)
294 {
295  const SwsOp *op = params->op;
296  SwsContext *ctx = params->ctx;
297  if (!(ctx->flags & SWS_BITEXACT))
298  return true;
299 
300  if (!ff_sws_pixel_type_is_int(op->type))
301  return false;
302 
303  /* Check if maximum/minimum partial sum fits losslessly inside float */
304  AVRational max_range = { 1 << 24, 1 };
305  AVRational min_range = { -(1 << 24), 1 };
307 
308  for (int i = 0; i < op->rw.elems; i++) {
309  const AVRational min = av_mul_q(op->comps.min[i], scale);
310  const AVRational max = av_mul_q(op->comps.max[i], scale);
311  if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0)
312  return false;
313  }
314 
315  return true;
316 }
317 
318 static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
319 {
320  const SwsFilterWeights *filter = params->op->rw.kernel;
321  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
322  ">8 byte pointers not supported");
323 
324  /* Pre-convert weights to float */
325  float *weights = av_calloc(filter->num_weights, sizeof(float));
326  if (!weights)
327  return AVERROR(ENOMEM);
328 
329  for (int i = 0; i < filter->num_weights; i++)
330  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
331 
332  out->priv.ptr = weights;
333  out->priv.uptr[1] = filter->filter_size;
334  out->free = ff_op_priv_free;
335  return 0;
336 }
337 
338 static int hscale_sizeof_weight(const SwsOp *op)
339 {
340  switch (op->type) {
341  case SWS_PIXEL_U8: return sizeof(int16_t);
342  case SWS_PIXEL_U16: return sizeof(int16_t);
343  case SWS_PIXEL_F32: return sizeof(float);
344  default: return 0;
345  }
346 }
347 
/* Setup for horizontal (gather-based) filters: allocates and transposes the
 * filter weights into the memory layout expected by the asm kernels, padding
 * the tap count up to a 32-bit boundary. Stores the weight array pointer and
 * the aligned tap count in priv, and reports the resulting over-read. */
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
{
    const SwsOp *op = params->op;
    const SwsFilterWeights *filter = op->rw.kernel;

    /**
     * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller
     * size, we need to gather 2/4 taps simultaneously and unroll the inner
     * loop over several packed samples.
     */
    const int pixel_size = ff_sws_pixel_type_size(op->type);
    const int taps_align = sizeof(int32_t) / pixel_size;
    const int filter_size = filter->filter_size;
    const int block_size = params->table->block_size;
    const size_t aligned_size = FFALIGN(filter_size, taps_align);
    const size_t line_size = FFALIGN(filter->dst_size, block_size);
    av_assert1(FFALIGN(line_size, taps_align) == line_size);
    if (aligned_size > INT_MAX)
        return AVERROR(EINVAL);

    /* Weight storage viewed as the element type matching op->type */
    union {
        void *ptr;
        int16_t *i16;
        float *f32;
    } weights;

    const int sizeof_weight = hscale_sizeof_weight(op);
    weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size);
    if (!weights.ptr)
        return AVERROR(ENOMEM);

    /**
     * Transpose filter weights to group (aligned) taps by block
     */
    const int mmsize = block_size * 2;
    const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */
    for (size_t x = 0; x < line_size; x += block_size) {
        const int elems = FFMIN(block_size, filter->dst_size - x);
        for (int j = 0; j < filter_size; j++) {
            const int jb = j & ~(taps_align - 1);   /* tap group base */
            const int ji = j - jb;                  /* tap within group */
            const size_t idx_base = x * aligned_size + jb * block_size + ji;
            for (int i = 0; i < elems; i++) {
                const int w = filter->weights[(x + i) * filter_size + j];
                size_t idx = idx_base;
                if (op->type == SWS_PIXEL_U8) {
                    /* Interleave the pixels within each lane, i.e.:
                     * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0)
                     * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1)
                     * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0)
                     * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1)
                     * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0)
                     * ...
                     * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1)
                     * (repeat for taps 4-7, etc.)
                     */
                    const int gather_base = i & ~(gather_size - 1);
                    const int gather_pos = i - gather_base;
                    const int lane_idx = gather_pos >> 2;
                    const int pos_in_lane = gather_pos & 3;
                    idx += gather_base * 4 /* which gather (m0 or m1) */
                         + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */
                         + lane_idx * 8 /* 8 ints per lane */
                         + (pos_in_lane & 1) * 4; /* 4 taps per pair */
                } else {
                    /* Wider pixel types: simple tap-aligned interleave */
                    idx += i * taps_align;
                }

                switch (op->type) {
                case SWS_PIXEL_U8:  weights.i16[idx] = w; break;
                case SWS_PIXEL_U16: weights.i16[idx] = w; break;
                case SWS_PIXEL_F32: weights.f32[idx] = w; break;
                }
            }
        }
    }

    out->priv.ptr = weights.ptr;
    out->priv.uptr[1] = aligned_size;
    out->free = ff_op_priv_free;
    /* Padding taps beyond filter_size are zero (av_calloc) but still read */
    out->over_read = (aligned_size - filter_size) * pixel_size;
    return 0;
}
431 
432 static bool check_filter_4x4_h(const SwsImplParams *params)
433 {
434  SwsContext *ctx = params->ctx;
435  const SwsOp *op = params->op;
436  if ((ctx->flags & SWS_BITEXACT) && op->type == SWS_PIXEL_F32)
437  return false; /* different accumulation order due to 4x4 transpose */
438 
439  const int cpu_flags = av_get_cpu_flags();
441  return true; /* always prefer over gathers if gathers are slow */
442 
443  /**
444  * Otherwise, prefer it above a certain filter size. Empirically, this
445  * kernel seems to be faster whenever the reference/gather kernel crosses
446  * a breakpoint for the number of gathers needed, but this filter doesn't.
447  *
448  * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
449  */
450  const SwsFilterWeights *filter = op->rw.kernel;
451  return op->type == SWS_PIXEL_U8 && filter->filter_size > 12 ||
452  op->type == SWS_PIXEL_U16 && filter->filter_size > 4 ||
453  op->type == SWS_PIXEL_F32 && filter->filter_size > 1;
454 }
455 
457 {
458  const SwsOp *op = params->op;
459  const SwsFilterWeights *filter = op->rw.kernel;
460  const int pixel_size = ff_sws_pixel_type_size(op->type);
461  const int sizeof_weights = hscale_sizeof_weight(op);
462  const int block_size = params->table->block_size;
463  const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
464  const int pixels_align = 4; /* pixels per iteration */
465  const int filter_size = filter->filter_size;
466  const size_t aligned_size = FFALIGN(filter_size, taps_align);
467  const int line_size = FFALIGN(filter->dst_size, block_size);
468  av_assert1(FFALIGN(line_size, pixels_align) == line_size);
469 
470  union {
471  void *ptr;
472  int16_t *i16;
473  float *f32;
474  } weights;
475 
476  weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
477  if (!weights.ptr)
478  return AVERROR(ENOMEM);
479 
480  /**
481  * Desired memory layout: [w][taps][pixels_align][taps_align]
482  *
483  * Example with taps_align=8, pixels_align=4:
484  * [a0, a1, ... a7] weights for pixel 0, taps 0..7
485  * [b0, b1, ... b7] weights for pixel 1, taps 0..7
486  * [c0, c1, ... c7] weights for pixel 2, taps 0..7
487  * [d0, d1, ... d7] weights for pixel 3, taps 0..7
488  * [a8, a9, ... a15] weights for pixel 0, taps 8..15
489  * ...
490  * repeat for all taps, then move on to pixels 4..7, etc.
491  */
492  for (int x = 0; x < filter->dst_size; x++) {
493  for (int j = 0; j < filter_size; j++) {
494  const int xb = x & ~(pixels_align - 1);
495  const int jb = j & ~(taps_align - 1);
496  const int xi = x - xb, ji = j - jb;
497  const int w = filter->weights[x * filter_size + j];
498  const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
499 
500  switch (op->type) {
501  case SWS_PIXEL_U8: weights.i16[idx] = w; break;
502  case SWS_PIXEL_U16: weights.i16[idx] = w; break;
503  case SWS_PIXEL_F32: weights.f32[idx] = w; break;
504  }
505  }
506  }
507 
508  out->priv.ptr = weights.ptr;
509  out->priv.uptr[1] = aligned_size * sizeof_weights;
510  out->free = ff_op_priv_free;
511  out->over_read = (aligned_size - filter_size) * pixel_size;
512  return 0;
513 }
514 
/* Declare a filtered read entry for a given direction (H/V) and element
 * count; filters are modeled as reads with an attached convolution kernel */
#define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \
    DECL_ASM(TYPE, SWS_COMP_ELEMS(ELEMS), NAME##ELEMS##_##TYPE##EXT, \
        .op = SWS_OP_READ, \
        .rw.elems = ELEMS, \
        .rw.filter = SWS_OP_FILTER_##DIR, \
        __VA_ARGS__ \
    );

/* Declare filter entries for 1 through 4 components */
#define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__) \
    DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__)

/* Declare the full set of filter kernels for one pixel type: vertical,
 * FMA vertical (gated by check_filter_fma), gather-based horizontal, and
 * 4x4-transpose horizontal (gated by check_filter_4x4_h) */
#define DECL_FILTERS_GENERIC(EXT, TYPE) \
    DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \
    DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \
                 .check = check_filter_fma) \
    DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) \
    DECL_FILTERS(EXT, TYPE, H, filter_4x4_h, .setup = setup_filter_4x4_h, \
                 .check = check_filter_4x4_h)

/* Reference the four per-element-count entries of one filter family */
#define REF_FILTERS(NAME, SUFFIX) \
    &op_##NAME##1##SUFFIX, \
    &op_##NAME##2##SUFFIX, \
    &op_##NAME##3##SUFFIX, \
    &op_##NAME##4##SUFFIX
542 
543 #define DECL_FUNCS_8(SIZE, EXT, FLAG) \
544  DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \
545  DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \
546  DECL_RW(EXT, U8, read_planar, READ, 3, false, 0) \
547  DECL_RW(EXT, U8, read_planar, READ, 4, false, 0) \
548  DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0) \
549  DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0) \
550  DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0) \
551  DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0) \
552  DECL_RW(EXT, U8, read_nibbles, READ, 1, false, 1) \
553  DECL_RW(EXT, U8, read_bits, READ, 1, false, 3) \
554  DECL_RW(EXT, U8, write_bits, WRITE, 1, false, 3) \
555  DECL_EXPAND_BITS(EXT, 8) \
556  DECL_PACKED_RW(EXT, 8) \
557  DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0) \
558  DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0) \
559  DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0) \
560  void ff_p1000_shuffle##EXT(void); \
561  void ff_p1001_shuffle##EXT(void); \
562  void ff_p1110_shuffle##EXT(void); \
563  void ff_p1111_shuffle##EXT(void); \
564  DECL_SWIZZLE(EXT, 3, 0, 1, 2) \
565  DECL_SWIZZLE(EXT, 3, 0, 2, 1) \
566  DECL_SWIZZLE(EXT, 2, 1, 0, 3) \
567  DECL_SWIZZLE(EXT, 3, 2, 1, 0) \
568  DECL_SWIZZLE(EXT, 3, 1, 0, 2) \
569  DECL_SWIZZLE(EXT, 3, 2, 0, 1) \
570  DECL_SWIZZLE(EXT, 1, 2, 0, 3) \
571  DECL_SWIZZLE(EXT, 1, 0, 2, 3) \
572  DECL_SWIZZLE(EXT, 2, 0, 1, 3) \
573  DECL_SWIZZLE(EXT, 2, 3, 1, 0) \
574  DECL_SWIZZLE(EXT, 2, 1, 3, 0) \
575  DECL_SWIZZLE(EXT, 1, 2, 3, 0) \
576  DECL_SWIZZLE(EXT, 1, 3, 2, 0) \
577  DECL_SWIZZLE(EXT, 0, 2, 1, 3) \
578  DECL_SWIZZLE(EXT, 0, 2, 3, 1) \
579  DECL_SWIZZLE(EXT, 0, 3, 1, 2) \
580  DECL_SWIZZLE(EXT, 3, 1, 2, 0) \
581  DECL_SWIZZLE(EXT, 0, 3, 2, 1) \
582  DECL_SWIZZLE(EXT, 0, 0, 0, 3) \
583  DECL_SWIZZLE(EXT, 3, 0, 0, 0) \
584  DECL_SWIZZLE(EXT, 0, 0, 0, 1) \
585  DECL_SWIZZLE(EXT, 1, 0, 0, 0) \
586  DECL_CLEAR_ALPHA(EXT, 0) \
587  DECL_CLEAR_ALPHA(EXT, 1) \
588  DECL_CLEAR_ALPHA(EXT, 3) \
589  DECL_CLEAR_ZERO(EXT, 0) \
590  DECL_CLEAR_ZERO(EXT, 1) \
591  DECL_CLEAR_ZERO(EXT, 3) \
592  DECL_CLEAR(EXT, 0, 0, 0, 1) \
593  DECL_CLEAR(EXT, 1, 0, 0, 0) \
594  DECL_CLEAR(EXT, 1, 1, 0, 0) \
595  DECL_CLEAR(EXT, 0, 1, 1, 0) \
596  DECL_CLEAR(EXT, 0, 0, 1, 1) \
597  DECL_CLEAR(EXT, 1, 0, 1, 0) \
598  DECL_CLEAR(EXT, 0, 1, 0, 1) \
599  DECL_CLEAR(EXT, 0, 1, 1, 1) \
600  DECL_CLEAR(EXT, 1, 0, 1, 1) \
601  DECL_CLEAR(EXT, 1, 1, 0, 1) \
602  \
603 static const SwsOpTable ops8##EXT = { \
604  .cpu_flags = AV_CPU_FLAG_##FLAG, \
605  .block_size = SIZE, \
606  .entries = { \
607  &op_read_planar1##EXT, \
608  &op_read_planar2##EXT, \
609  &op_read_planar3##EXT, \
610  &op_read_planar4##EXT, \
611  &op_write_planar1##EXT, \
612  &op_write_planar2##EXT, \
613  &op_write_planar3##EXT, \
614  &op_write_planar4##EXT, \
615  &op_read8_packed2##EXT, \
616  &op_read8_packed3##EXT, \
617  &op_read8_packed4##EXT, \
618  &op_write8_packed2##EXT, \
619  &op_write8_packed3##EXT, \
620  &op_write8_packed4##EXT, \
621  &op_read_nibbles1##EXT, \
622  &op_read_bits1##EXT, \
623  &op_write_bits1##EXT, \
624  &op_expand_bits8##EXT, \
625  &op_pack_1210##EXT, \
626  &op_pack_3320##EXT, \
627  &op_pack_2330##EXT, \
628  &op_unpack_1210##EXT, \
629  &op_unpack_3320##EXT, \
630  &op_unpack_2330##EXT, \
631  &op_swizzle_3012##EXT, \
632  &op_swizzle_3021##EXT, \
633  &op_swizzle_2103##EXT, \
634  &op_swizzle_3210##EXT, \
635  &op_swizzle_3102##EXT, \
636  &op_swizzle_3201##EXT, \
637  &op_swizzle_1203##EXT, \
638  &op_swizzle_1023##EXT, \
639  &op_swizzle_2013##EXT, \
640  &op_swizzle_2310##EXT, \
641  &op_swizzle_2130##EXT, \
642  &op_swizzle_1230##EXT, \
643  &op_swizzle_1320##EXT, \
644  &op_swizzle_0213##EXT, \
645  &op_swizzle_0231##EXT, \
646  &op_swizzle_0312##EXT, \
647  &op_swizzle_3120##EXT, \
648  &op_swizzle_0321##EXT, \
649  &op_swizzle_0003##EXT, \
650  &op_swizzle_0001##EXT, \
651  &op_swizzle_3000##EXT, \
652  &op_swizzle_1000##EXT, \
653  &op_clear_alpha0##EXT, \
654  &op_clear_alpha1##EXT, \
655  &op_clear_alpha3##EXT, \
656  &op_clear_zero0##EXT, \
657  &op_clear_zero1##EXT, \
658  &op_clear_zero3##EXT, \
659  REF_PATTERN(clear##EXT, 0, 0, 0, 1), \
660  REF_PATTERN(clear##EXT, 1, 0, 0, 0), \
661  REF_PATTERN(clear##EXT, 1, 1, 0, 0), \
662  REF_PATTERN(clear##EXT, 0, 1, 1, 0), \
663  REF_PATTERN(clear##EXT, 0, 0, 1, 1), \
664  REF_PATTERN(clear##EXT, 1, 0, 1, 0), \
665  REF_PATTERN(clear##EXT, 0, 1, 0, 1), \
666  REF_PATTERN(clear##EXT, 0, 1, 1, 1), \
667  REF_PATTERN(clear##EXT, 1, 0, 1, 1), \
668  REF_PATTERN(clear##EXT, 1, 1, 0, 1), \
669  NULL \
670  }, \
671 };
672 
673 #define DECL_FUNCS_16(SIZE, EXT, FLAG) \
674  DECL_PACKED_RW(EXT, 16) \
675  DECL_EXPAND_BITS(EXT, 16) \
676  DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0) \
677  DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0) \
678  DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0) \
679  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0) \
680  DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1) \
681  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0) \
682  DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1) \
683  DECL_SHIFT16(EXT) \
684  DECL_CONVERT(EXT, U8, U16) \
685  DECL_CONVERT(EXT, U16, U8) \
686  DECL_EXPAND(EXT, U8, U16) \
687  \
688 static const SwsOpTable ops16##EXT = { \
689  .cpu_flags = AV_CPU_FLAG_##FLAG, \
690  .block_size = SIZE, \
691  .entries = { \
692  &op_read16_packed2##EXT, \
693  &op_read16_packed3##EXT, \
694  &op_read16_packed4##EXT, \
695  &op_write16_packed2##EXT, \
696  &op_write16_packed3##EXT, \
697  &op_write16_packed4##EXT, \
698  &op_pack_4440##EXT, \
699  &op_pack_5550##EXT, \
700  &op_pack_5650##EXT, \
701  &op_unpack_4440##EXT, \
702  &op_unpack_5550##EXT, \
703  &op_unpack_5650##EXT, \
704  &op_expand_bits16##EXT, \
705  REF_COMMON_PATTERNS(swap_bytes_U16##EXT), \
706  REF_COMMON_PATTERNS(convert_U8_U16##EXT), \
707  REF_COMMON_PATTERNS(convert_U16_U8##EXT), \
708  REF_COMMON_PATTERNS(expand_U8_U16##EXT), \
709  REF_COMMON_PATTERNS(lshift16##EXT), \
710  REF_COMMON_PATTERNS(rshift16##EXT), \
711  NULL \
712  }, \
713 };
714 
715 #define DECL_FUNCS_32(SIZE, EXT, FLAG) \
716  DECL_PACKED_RW(_m2##EXT, 32) \
717  DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2) \
718  DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10) \
719  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0) \
720  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1) \
721  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0) \
722  DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1) \
723  DECL_CONVERT(EXT, U8, U32) \
724  DECL_CONVERT(EXT, U32, U8) \
725  DECL_CONVERT(EXT, U16, U32) \
726  DECL_CONVERT(EXT, U32, U16) \
727  DECL_CONVERT(EXT, U8, F32) \
728  DECL_CONVERT(EXT, F32, U8) \
729  DECL_CONVERT(EXT, U16, F32) \
730  DECL_CONVERT(EXT, F32, U16) \
731  DECL_EXPAND(EXT, U8, U32) \
732  DECL_MIN_MAX(EXT) \
733  DECL_SCALE(EXT) \
734  DECL_DITHER0(EXT) \
735  DECL_DITHER(EXT, 1) \
736  DECL_DITHER(EXT, 2) \
737  DECL_DITHER(EXT, 3) \
738  DECL_DITHER(EXT, 4) \
739  DECL_DITHER(EXT, 5) \
740  DECL_DITHER(EXT, 6) \
741  DECL_DITHER(EXT, 7) \
742  DECL_DITHER(EXT, 8) \
743  DECL_LINEAR(EXT, luma, SWS_MASK_LUMA) \
744  DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA) \
745  DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA) \
746  DECL_LINEAR(EXT, yalpha, SWS_MASK(1, 1)) \
747  DECL_LINEAR(EXT, dot3, 0x7) \
748  DECL_LINEAR(EXT, dot3a, 0x7 | SWS_MASK_ALPHA) \
749  DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0) ^ SWS_MASK(0, 3)) \
750  DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3) \
751  DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4) \
752  DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3) \
753  DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3) \
754  DECL_LINEAR(EXT, affine3uv, \
755  SWS_MASK_MAT3 | SWS_MASK_OFF(1) | SWS_MASK_OFF(2)) \
756  DECL_LINEAR(EXT, affine3x, \
757  SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
758  DECL_LINEAR(EXT, affine3xa, \
759  SWS_MASK_MAT3 ^ SWS_MASK(0, 1) | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
760  DECL_LINEAR(EXT, affine3xy, \
761  SWS_MASK_MAT3 ^ SWS_MASK(0, 0) ^ SWS_MASK(0, 1) | SWS_MASK_OFF3) \
762  DECL_LINEAR(EXT, affine3a, \
763  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
764  DECL_FILTERS_GENERIC(EXT, U8) \
765  DECL_FILTERS_GENERIC(EXT, U16) \
766  DECL_FILTERS_GENERIC(EXT, F32) \
767  \
768 static const SwsOpTable ops32##EXT = { \
769  .cpu_flags = AV_CPU_FLAG_##FLAG, \
770  .block_size = SIZE, \
771  .entries = { \
772  &op_read32_packed2_m2##EXT, \
773  &op_read32_packed3_m2##EXT, \
774  &op_read32_packed4_m2##EXT, \
775  &op_write32_packed2_m2##EXT, \
776  &op_write32_packed3_m2##EXT, \
777  &op_write32_packed4_m2##EXT, \
778  &op_pack_1010102_m2##EXT, \
779  &op_pack_2101010_m2##EXT, \
780  &op_unpack_1010102_m2##EXT, \
781  &op_unpack_2101010_m2##EXT, \
782  REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT), \
783  REF_COMMON_PATTERNS(convert_U8_U32##EXT), \
784  REF_COMMON_PATTERNS(convert_U32_U8##EXT), \
785  REF_COMMON_PATTERNS(convert_U16_U32##EXT), \
786  REF_COMMON_PATTERNS(convert_U32_U16##EXT), \
787  REF_COMMON_PATTERNS(convert_U8_F32##EXT), \
788  REF_COMMON_PATTERNS(convert_F32_U8##EXT), \
789  REF_COMMON_PATTERNS(convert_U16_F32##EXT), \
790  REF_COMMON_PATTERNS(convert_F32_U16##EXT), \
791  REF_COMMON_PATTERNS(expand_U8_U32##EXT), \
792  REF_COMMON_PATTERNS(min##EXT), \
793  REF_COMMON_PATTERNS(max##EXT), \
794  REF_COMMON_PATTERNS(scale##EXT), \
795  REF_COMMON_PATTERNS(dither0##EXT), \
796  &op_dither1##EXT, \
797  &op_dither2##EXT, \
798  &op_dither3##EXT, \
799  &op_dither4##EXT, \
800  &op_dither5##EXT, \
801  &op_dither6##EXT, \
802  &op_dither7##EXT, \
803  &op_dither8##EXT, \
804  &op_luma##EXT, \
805  &op_alpha##EXT, \
806  &op_lumalpha##EXT, \
807  &op_yalpha##EXT, \
808  &op_dot3##EXT, \
809  &op_dot3a##EXT, \
810  &op_row0##EXT, \
811  &op_diag3##EXT, \
812  &op_diag4##EXT, \
813  &op_diagoff3##EXT, \
814  &op_affine3##EXT, \
815  &op_affine3uv##EXT, \
816  &op_affine3x##EXT, \
817  &op_affine3xa##EXT, \
818  &op_affine3xy##EXT, \
819  &op_affine3a##EXT, \
820  REF_FILTERS(filter_fma_v, _U8##EXT), \
821  REF_FILTERS(filter_fma_v, _U16##EXT), \
822  REF_FILTERS(filter_fma_v, _F32##EXT), \
823  REF_FILTERS(filter_4x4_h, _U8##EXT), \
824  REF_FILTERS(filter_4x4_h, _U16##EXT), \
825  REF_FILTERS(filter_4x4_h, _F32##EXT), \
826  REF_FILTERS(filter_v, _U8##EXT), \
827  REF_FILTERS(filter_v, _U16##EXT), \
828  REF_FILTERS(filter_v, _F32##EXT), \
829  REF_FILTERS(filter_h, _U8##EXT), \
830  REF_FILTERS(filter_h, _U16##EXT), \
831  REF_FILTERS(filter_h, _F32##EXT), \
832  NULL \
833  }, \
834 };
835 
836 DECL_FUNCS_8(16, _m1_sse4, SSE4)
837 DECL_FUNCS_8(32, _m1_avx2, AVX2)
838 DECL_FUNCS_8(32, _m2_sse4, SSE4)
839 DECL_FUNCS_8(64, _m2_avx2, AVX2)
840 
841 DECL_FUNCS_16(16, _m1_avx2, AVX2)
842 DECL_FUNCS_16(32, _m2_avx2, AVX2)
843 
844 DECL_FUNCS_32(16, _avx2, AVX2)
845 
/* All available operation tables for this backend, covering 8-, 16- and
 * 32-bit block variants at different SIMD levels.
 * NOTE(review): presumably probed in declaration order during compilation —
 * confirm against the op chain compiler. */
static const SwsOpTable *const tables[] = {
    &ops8_m1_sse4,
    &ops8_m1_avx2,
    &ops8_m2_sse4,
    &ops8_m2_avx2,
    &ops16_m1_avx2,
    &ops16_m2_avx2,
    &ops32_avx2,
};
855 
856 static av_const int get_mmsize(const int cpu_flags)
857 {
859  return 64;
860  else if (cpu_flags & AV_CPU_FLAG_AVX2)
861  return 32;
862  else if (cpu_flags & AV_CPU_FLAG_SSE4)
863  return 16;
864  else
865  return AVERROR(ENOTSUP);
866 }
867 
868 /**
869  * Returns true if the operation's implementation only depends on the block
870  * size, and not the underlying pixel type
871  */
872 static bool op_is_type_invariant(const SwsOp *op)
873 {
874  switch (op->op) {
875  case SWS_OP_READ:
876  case SWS_OP_WRITE:
877  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac && !op->rw.filter;
878  case SWS_OP_SWIZZLE:
879  case SWS_OP_CLEAR:
880  return true;
881  }
882 
883  return false;
884 }
885 
/* Size of the smallest x86 move that covers `bytes`: movd (4), movq (8),
 * or a full-vector movu of `mmsize` bytes. */
static int movsize(const int bytes, const int mmsize)
{
    if (bytes <= 4)
        return 4;       /* movd */
    if (bytes <= 8)
        return 8;       /* movq */
    return mmsize;      /* movu */
}
892 
893 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
894 {
895  uint8_t shuffle[16];
896  int read_bytes, write_bytes;
897  int pixels;
898 
899  /* Solve the shuffle mask for one 128-bit lane only */
900  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
901  if (pixels < 0)
902  return pixels;
903 
904  /* We can't shuffle acress lanes, so restrict the vector size to XMM
905  * whenever the read/write size would be a subset of the full vector */
906  if (read_bytes < 16 || write_bytes < 16)
907  mmsize = 16;
908 
909  const int num_lanes = mmsize / 16;
910  const int in_total = num_lanes * read_bytes;
911  const int out_total = num_lanes * write_bytes;
912 
913  *out = (SwsCompiledOp) {
914  .priv = av_memdup(shuffle, sizeof(shuffle)),
915  .free = av_free,
916  .slice_align = 1,
917  .block_size = pixels * num_lanes,
918  .over_read = movsize(in_total, mmsize) - in_total,
919  .over_write = movsize(out_total, mmsize) - out_total,
920  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
921  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
923  };
924 
925  if (!out->priv)
926  return AVERROR(ENOMEM);
927 
928 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
929 do { \
930  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
931  if (in_total == IN && out_total == OUT) \
932  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
933 } while (0)
934 
935  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
936  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
937  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
938  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
939  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
940  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
941  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
942  ASSIGN_SHUFFLE_FUNC(15, 5, sse4);
943  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
944  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
945  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
946  ASSIGN_SHUFFLE_FUNC(16, 4, sse4);
947  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
948  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
949  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
950  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
951  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
952  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
953  av_assert1(out->func);
954  return 0;
955 }
956 
/* Normalize clear values into 32-bit integer constants */
static void normalize_clear(SwsOp *op)
{
    static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
    SwsImplResult res;
    /* Union to reinterpret the broadcast uint32 bit pattern as a signed int
     * without invoking implementation-defined conversion */
    union {
        uint32_t u32;
        int i;
    } c;

    /* Let the generic helper compute the per-component clear constants */
    ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, &res);

    for (int i = 0; i < 4; i++) {
        if (!SWS_COMP_TEST(op->clear.mask, i))
            continue;
        /* Broadcast the native-width constant across a full 32-bit word */
        switch (ff_sws_pixel_type_size(op->type)) {
        case 1: c.u32 = 0x1010101U * res.priv.u8[i]; break;
        case 2: c.u32 = (uint32_t) res.priv.u16[i] << 16 | res.priv.u16[i]; break;
        case 4: c.u32 = res.priv.u32[i]; break;
        }

        /* Rewrite the op in place so setup_clear can copy .num verbatim */
        op->clear.value[i].num = c.i;
        op->clear.value[i].den = 1;
    }
}
982 
984 {
985  int ret;
986  const int cpu_flags = av_get_cpu_flags();
987  const int mmsize = get_mmsize(cpu_flags);
988  if (mmsize < 0)
989  return mmsize;
990 
991  /* Special fast path for in-place packed shuffle */
992  ret = solve_shuffle(ops, mmsize, out);
993  if (ret != AVERROR(ENOTSUP))
994  return ret;
995 
997  if (!chain)
998  return AVERROR(ENOMEM);
999 
1000  *out = (SwsCompiledOp) {
1001  .priv = chain,
1002  .slice_align = 1,
1004 
1005  /* Use at most two full YMM regs during the widest precision section */
1006  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
1007  };
1008 
1009  for (int i = 0; i < ops->num_ops; i++) {
1010  int op_block_size = out->block_size;
1011  SwsOp *op = &ops->ops[i];
1012 
1013  if (op_is_type_invariant(op)) {
1014  if (op->op == SWS_OP_CLEAR)
1016  op_block_size *= ff_sws_pixel_type_size(op->type);
1017  op->type = SWS_PIXEL_U8;
1018  }
1019 
1021  ops, i, op_block_size, chain);
1022  if (ret < 0) {
1023  av_log(ctx, AV_LOG_TRACE, "Failed to compile op %d\n", i);
1024  ff_sws_op_chain_free(chain);
1025  return ret;
1026  }
1027  }
1028 
1029 #define ASSIGN_PROCESS_FUNC(NAME) \
1030  do { \
1031  SWS_DECL_FUNC(NAME); \
1032  out->func = NAME; \
1033  } while (0)
1034 
1035  const SwsOp *read = ff_sws_op_list_input(ops);
1036  const SwsOp *write = ff_sws_op_list_output(ops);
1037  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
1038  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
1039  switch (FFMAX(read_planes, write_planes)) {
1040  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
1041  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
1042  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
1043  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
1044  }
1045 
1046  if (ret < 0) {
1047  ff_sws_op_chain_free(chain);
1048  return ret;
1049  }
1050 
1051  out->cpu_flags = chain->cpu_flags;
1052  out->over_read = chain->over_read;
1053  out->over_write = chain->over_write;
1054  return 0;
1055 }
1056 
1058  .name = "x86",
1059  .compile = compile,
1060  .hw_format = AV_PIX_FMT_NONE,
1061 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:50
SwsOpTable
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:159
check_filter_fma
static bool check_filter_fma(const SwsImplParams *params)
Definition: ops.c:293
SWS_PIXEL_U16
@ SWS_PIXEL_U16
Definition: ops.h:36
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:53
AVERROR
The word "frame" indicates either a video frame or a group of audio samples, as stored in an AVFrame structure. During format negotiation, each input and each output holds a list of supported formats (pixel formats for video, channel layouts and sample formats for audio); these are references to shared objects, and when the negotiation mechanism computes the intersection of the formats supported at each end of a link, all references to both lists are replaced with a reference to the intersection. query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later; filters with complex requirements can use this to set the formats supported on one link based on the format negotiated on another.
ff_sws_setup_clear
int ff_sws_setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:282
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:856
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:670
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:62
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:747
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:1057
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:243
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:958
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:64
av_const
#define av_const
Definition: attributes.h:105
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:157
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:715
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:269
SwsOpBackend::name
const char * name
Definition: ops_internal.h:56
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:76
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpTable::block_size
int block_size
Definition: ops_chain.h:161
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:54
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:211
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: ops.h:38
SwsOpList::num_ops
int num_ops
Definition: ops.h:290
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:35
SWS_COMP_TEST
#define SWS_COMP_TEST(mask, X)
Definition: ops.h:89
ff_sws_pixel_type_is_int
bool ff_sws_pixel_type_is_int(SwsPixelType type)
Definition: ops.c:91
AVRational::num
int num
Numerator.
Definition: rational.h:59
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:62
SwsOpChain::over_read
int over_read
Definition: ops_chain.h:90
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
Q
#define Q(q)
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(SwsOpPriv *)
Definition: ops_chain.h:87
avassert.h
AV_LOG_TRACE
#define AV_LOG_TRACE
Extremely verbose debugging, useful for libav* development.
Definition: log.h:236
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
AVFormatContext::flags
int flags
Flags modifying the (de)muxer behaviour.
Definition: avformat.h:1414
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
setup_clear
static int setup_clear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:132
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:679
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:55
SwsReadWriteOp::kernel
SwsFilterWeights * kernel
Definition: ops.h:138
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
SwsImplParams::op
const SwsOp * op
Definition: ops_chain.h:107
tables
static const SwsOpTable *const tables[]
Definition: ops.c:846
check_filter_4x4_h
static bool check_filter_4x4_h(const SwsImplParams *params)
Definition: ops.c:432
setup_rw
static int setup_rw(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:61
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:893
setup_filter_4x4_h
static int setup_filter_4x4_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:456
c
Undefined Behavior: in the C language some operations are undefined, like signed integer overflow, dereferencing freed pointers, or accessing memory outside the allocated space. Undefined behavior must not occur in a C program; it is not safe even if the output of the undefined operations is unused. The unsafety may seem like nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs, and optimizing code based on wrong assumptions can — and in some cases has — led to effects beyond the output of the computation, e.g. the classic signed integer overflow problem in speed-critical code.
Definition: undefined.txt:32
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
SwsImplParams
Definition: ops_chain.h:105
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOp::type
SwsPixelType type
Definition: ops.h:240
movsize
static int movsize(const int bytes, const int mmsize)
Definition: ops.c:886
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:50
size
int size
Definition: twinvq_data.h:10344
setup_swap_bytes
static int setup_swap_bytes(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:102
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(SwsContext *ctx, const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, int ops_index, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:181
SwsShiftOp::amount
uint8_t amount
Definition: ops.h:165
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:51
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:52
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:983
SwsImplParams::ctx
SwsContext * ctx
Definition: ops_chain.h:108
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SwsOpList::ops
SwsOp * ops
Definition: ops.h:289
weights
static const int weights[]
Definition: hevc_pel.c:32
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:872
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
hscale_sizeof_weight
static int hscale_sizeof_weight(const SwsOp *op)
Definition: ops.c:338
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:238
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:149
av_cmp_q
static int av_cmp_q(AVRational a, AVRational b)
Compare two rationals.
Definition: rational.h:89
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_dispatch.h:100
setup_shift
static int setup_shift(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:166
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
SwsImplResult::priv
SwsOpPriv priv
Definition: ops_chain.h:113
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:128
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:786
SwsOp::shift
SwsShiftOp shift
Definition: ops.h:246
av_mul_q
AVRational av_mul_q(AVRational b, AVRational c)
Multiply two rationals.
Definition: rational.c:80
SwsReadWriteOp::elems
uint8_t elems
Examples: rgba = 4x u8 packed yuv444p = 3x u8 rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack monow = ...
Definition: ops.h:126
mem.h
w
uint8_t w
Definition: llvidencdsp.c:39
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:278
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:318
int32_t
int32_t
Definition: audioconvert.c:56
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:673
stride
#define stride
Definition: h264pred_template.c:536
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h264.c:190
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:288
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:543
SwsContext
Main external API structure.
Definition: swscale.h:206
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
SwsImplResult
Definition: ops_chain.h:111
SwsImplParams::table
const SwsOpTable * table
Definition: ops_chain.h:106
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239
SwsOpChain::over_write
int over_write
Definition: ops_chain.h:91
min
float min
Definition: vorbis_enc_data.h:429
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:348