FFmpeg
ops.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2026 Ramiro Polla
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "../ops_chain.h"
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/avstring.h"
25 #include "libavutil/tree.h"
26 
27 #include "ops_lookup.h"
28 
29 #include "ops_impl_conv.c"
30 
31 /*********************************************************************/
32 typedef struct SwsAArch64BackendContext {
36 
37 /*********************************************************************/
39  const SwsOp *op, SwsImplResult *res)
40 {
41  /**
42  * Compute number of full vector registers needed to pack all non-zero
43  * coefficients.
44  */
45  const int num_vregs = linear_num_vregs(p);
46  av_assert0(num_vregs <= 4);
47  float *coeffs = av_malloc(num_vregs * 4 * sizeof(float));
48  if (!coeffs)
49  return AVERROR(ENOMEM);
50 
51  /**
52  * Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask.
53  * The coefficients are packed in sequential order. The same order must
54  * be followed in asmgen_op_linear().
55  */
56  int i_coeff = 0;
57  LOOP_LINEAR_MASK(p, i, j) {
58  const int jj = linear_index_to_sws_op(j);
59  coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den;
60  }
61 
62  res->priv.ptr = coeffs;
63  res->free = ff_op_priv_free;
64 
65  return 0;
66 }
67 
68 /*********************************************************************/
70  const SwsOp *op, SwsImplResult *res)
71 {
72  /**
73  * The input dither matrix is (1 << size_log2)² pixels large. It is
74  * periodic, so the x and y offsets should be masked to fit inside
75  * (1 << size_log2).
76  * The width of the matrix is assumed to be at least 8, which matches
77  * the maximum block_size for aarch64 asmgen when f32 operations
78  * (i.e., dithering) are used. This guarantees that the x offset is
79  * aligned and that reading block_size elements does not extend past
80  * the end of the row. The x offset doesn't change between components,
81  * so it is only required to be masked once.
82  * The y offset, on the other hand, may change per component, and
83  * would therefore need to be masked for every y_offset value. To
84  * simplify the execution, we over-allocate the number of rows of
85  * the output dither matrix by the largest y_offset value. This way,
86  * we only need to mask y offset once, and can safely increment the
87  * dither matrix pointer by fixed offsets for every y_offset change.
88  */
89 
90  /* Find the largest y_offset value. */
91  const int size = 1 << op->dither.size_log2;
92  const int8_t *off = op->dither.y_offset;
93  int max_offset = 0;
94  for (int i = 0; i < 4; i++) {
95  if (off[i] >= 0)
96  max_offset = FFMAX(max_offset, off[i] & (size - 1));
97  }
98 
99  /* Allocate (size + max_offset) rows to allow over-reading the matrix. */
100  const int stride = size * sizeof(float);
101  const int num_rows = size + max_offset;
102  float *matrix = av_malloc(num_rows * stride);
103  if (!matrix)
104  return AVERROR(ENOMEM);
105 
106  for (int i = 0; i < size * size; i++)
107  matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
108 
109  memcpy(&matrix[size * size], matrix, max_offset * stride);
110 
111  res->priv.ptr = matrix;
112  res->free = ff_op_priv_free;
113 
114  return 0;
115 }
116 
117 /*********************************************************************/
118 static int aarch64_setup(SwsOpList *ops, int block_size, int n,
120 {
121  SwsOp *op = &ops->ops[n];
122  switch (op->op) {
123  case SWS_OP_READ:
124  /* Negative shift values to perform right shift using ushl. */
125  if (op->rw.frac == 3) {
126  out->priv = (SwsOpPriv) {
127  .u8 = {
128  -7, -6, -5, -4, -3, -2, -1, 0,
129  -7, -6, -5, -4, -3, -2, -1, 0,
130  }
131  };
132  }
133  break;
134  case SWS_OP_WRITE:
135  /* Shift values for ushl. */
136  if (op->rw.frac == 3) {
137  out->priv = (SwsOpPriv) {
138  .u8 = {
139  7, 6, 5, 4, 3, 2, 1, 0,
140  7, 6, 5, 4, 3, 2, 1, 0,
141  }
142  };
143  }
144  break;
145  case SWS_OP_CLEAR:
146  case SWS_OP_MIN:
147  case SWS_OP_MAX:
148  ff_sws_setup_q4(&(const SwsImplParams) { .op = op }, out);
149  break;
150  case SWS_OP_SCALE:
151  ff_sws_setup_q(&(const SwsImplParams) { .op = op }, out);
152  break;
153  case SWS_OP_LINEAR:
154  return aarch64_setup_linear(p, op, out);
155  case SWS_OP_DITHER:
156  return aarch64_setup_dither(p, op, out);
157  }
158  return 0;
159 }
160 
161 /*********************************************************************/
163 {
164  /* Currently, no optimization is performed. This is just a placeholder. */
165 
166  /* Use at most two full vregs during the widest precision section */
167  bctx->block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16;
168 
169  return 0;
170 }
171 
172 /*********************************************************************/
174 {
176  int ret;
177 
178  const int cpu_flags = av_get_cpu_flags();
179  if (!(cpu_flags & AV_CPU_FLAG_NEON))
180  return AVERROR(ENOTSUP);
181 
182  /* Make on-stack copy of `ops` to iterate over */
183  SwsOpList rest = *ops;
184  bctx.sws = ctx;
185  ret = aarch64_optimize(&bctx, &rest);
186  if (ret < 0)
187  return ret;
188 
190  if (!chain)
191  return AVERROR(ENOMEM);
192  chain->cpu_flags = AV_CPU_FLAG_NEON;
193 
194  *out = (SwsCompiledOp) {
195  .priv = chain,
196  .slice_align = 1,
198  .block_size = bctx.block_size,
199  };
200 
201  /* Look up kernel functions. */
202  for (int i = 0; i < rest.num_ops; i++) {
203  SwsAArch64OpImplParams params = { 0 };
204  ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, &params);
205  if (ret < 0)
206  goto error;
208  if (!func) {
209  ret = AVERROR(ENOTSUP);
210  goto error;
211  }
212  SwsImplResult res = { 0 };
213  ret = aarch64_setup(&rest, bctx.block_size, i, &params, &res);
214  if (ret < 0)
215  goto error;
216  ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv);
217  if (ret < 0)
218  goto error;
219  }
220 
221  /* Look up process/process_return functions. */
222  const SwsOp *read = ff_sws_op_list_input(&rest);
223  const SwsOp *write = ff_sws_op_list_output(&rest);
224  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
225  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
227  for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
228  MASK_SET(mask, i, 1);
229 
230  SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
231  SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
232  SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
233  SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params);
234  if (!process_func || !return_func) {
235  ret = AVERROR(ENOTSUP);
236  goto error;
237  }
238 
239  ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
240  if (ret < 0)
241  goto error;
242 
243  out->func = (SwsOpFunc) process_func;
244  out->cpu_flags = chain->cpu_flags;
245 
246 error:
247  if (ret < 0)
248  ff_sws_op_chain_free(chain);
249  return ret;
250 }
251 
252 /*********************************************************************/
254  .name = "aarch64",
255  .compile = aarch64_compile,
256  .hw_format = AV_PIX_FMT_NONE,
257 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:50
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:32
func
int(* func)(AVBPrint *dst, const char *in, const char *arg)
Definition: jacosubdec.c:66
SwsAArch64BackendContext
Definition: ops.c:32
AVERROR
AVERROR: macro that converts a POSIX error code into a negative FFmpeg error return value.
linear_index_to_sws_op
static int linear_index_to_sws_op(int idx)
Definition: ops_impl.h:146
out
static FILE * out
Definition: movenc.c:55
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:631
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:62
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:708
matrix
Definition: vc1dsp.c:43
ff_sws_setup_q
int ff_sws_setup_q(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:284
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:217
SWS_OP_DITHER
@ SWS_OP_DITHER
Definition: ops.h:70
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
SwsOpBackend::name
const char * name
Definition: ops_internal.h:56
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:89
SwsFuncPtr
void(* SwsFuncPtr)(void)
Per-kernel execution context.
Definition: ops_chain.h:70
SwsOpList::num_ops
int num_ops
Definition: ops.h:256
SwsOpFunc
void(* SwsOpFunc)(const SwsOpExec *exec, const void *priv, int bx_start, int y_start, int bx_end, int y_end)
Process a given range of pixel blocks.
Definition: ops_dispatch.h:94
SWS_OP_SCALE
@ SWS_OP_SCALE
Definition: ops.h:66
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(SwsOpPriv *)
Definition: ops_chain.h:87
avassert.h
backend_aarch64
const SwsOpBackend backend_aarch64
Definition: ops.c:253
ff_sws_aarch64_lookup
SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)
float
float
Definition: af_crystalizer.c:122
SwsAArch64OpMask
uint16_t SwsAArch64OpMask
Definition: ops_impl.h:68
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
SwsAArch64BackendContext::sws
SwsContext * sws
Definition: ops.c:33
av_assert0
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:42
SWS_OP_MIN
@ SWS_OP_MIN
Definition: ops.h:64
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
SWS_OP_LINEAR
@ SWS_OP_LINEAR
Definition: ops.h:69
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:640
AARCH64_SWS_OP_PROCESS
@ AARCH64_SWS_OP_PROCESS
Definition: ops_impl.h:40
SwsOpBackend
Definition: ops_internal.h:55
SwsOpPriv::ptr
void * ptr
Definition: ops_chain.h:49
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:84
NULL
#define NULL
Definition: coverity.c:32
SwsAArch64BackendContext::block_size
int block_size
Definition: ops.c:34
aarch64_optimize
static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops)
Definition: ops.c:162
aarch64_compile
static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:173
SwsImplParams
Definition: ops_chain.h:105
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
AV_CPU_FLAG_NEON
#define AV_CPU_FLAG_NEON
Definition: cpu.h:73
size
int size
Definition: twinvq_data.h:10344
SwsAArch64OpImplParams::op
SwsAArch64OpType op
Definition: ops_impl.h:95
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:51
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:296
tree.h
ops_lookup.h
aarch64_setup
static int aarch64_setup(SwsOpList *ops, int block_size, int n, const SwsAArch64OpImplParams *p, SwsImplResult *out)
Definition: ops.c:118
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
aarch64_setup_dither
static int aarch64_setup_dither(const SwsAArch64OpImplParams *p, const SwsOp *op, SwsImplResult *res)
Definition: ops.c:69
ops_impl_conv.c
av_malloc
#define av_malloc(s)
Definition: ops_asmgen.c:44
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:96
SwsOpList::ops
SwsOp * ops
Definition: ops.h:255
aarch64_setup_linear
static int aarch64_setup_linear(const SwsAArch64OpImplParams *p, const SwsOp *op, SwsImplResult *res)
Definition: ops.c:38
SwsImplResult::free
void(* free)(SwsOpPriv *priv)
Definition: ops_chain.h:114
SwsOp
Definition: ops.h:212
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:149
ret
ret
Definition: filter_design.txt:187
MASK_SET
#define MASK_SET(mask, idx, val)
Definition: ops_impl.h:112
SWS_OP_MAX
@ SWS_OP_MAX
Definition: ops.h:65
SwsCompiledOp
Definition: ops_dispatch.h:100
convert_to_aarch64_impl
static int convert_to_aarch64_impl(SwsContext *ctx, const SwsOpList *ops, int n, int block_size, SwsAArch64OpImplParams *out)
Convert SwsOp to a SwsAArch64OpImplParams.
Definition: ops_impl_conv.c:59
SwsImplResult::priv
SwsOpPriv priv
Definition: ops_chain.h:113
SwsReadWriteOp::packed
bool packed
Definition: ops.h:120
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
Windows::Graphics::DirectX::Direct3D11::p
IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p
Definition: vsrc_gfxcapture_winrt.hpp:53
LOOP_LINEAR_MASK
#define LOOP_LINEAR_MASK(p, idx, jdx)
Definition: ops_impl.h:132
SwsReadWriteOp::elems
uint8_t elems
Examples: rgba = 4x u8 packed yuv444p = 3x u8 rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack monow = ...
Definition: ops.h:118
SwsAArch64OpImplParams
SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType operation.
Definition: ops_impl.h:94
AARCH64_SWS_OP_PROCESS_RETURN
@ AARCH64_SWS_OP_PROCESS_RETURN
Definition: ops_impl.h:41
ff_sws_op_chain_append
int ff_sws_op_chain_append(SwsOpChain *chain, SwsFuncPtr func, void(*free)(SwsOpPriv *), const SwsOpPriv *priv)
Definition: ops_chain.c:48
linear_num_vregs
static int linear_num_vregs(const SwsAArch64OpImplParams *params)
Definition: ops_impl.h:138
stride
#define stride
Definition: h264pred_template.c:536
avstring.h
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:254
SwsContext
Main external API structure.
Definition: swscale.h:206
SwsOpPriv
Private data for each kernel.
Definition: ops_chain.h:45
SwsImplResult
Definition: ops_chain.h:111
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239