h264chroma_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

/* this code assumes that stride % 16 == 0 */

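/*
 * CHROMA_MC8_ALTIVEC_CORE computes one row of the 2x2 bilinear chroma
 * interpolation on 8 pixels at a time:
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + bias) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y.
 * BIAS1 is added before the multiply-accumulate chain and BIAS2 is applied
 * after it, so the rounding constant can be placed on either side (the H.264
 * path uses 32/noop, the VC-1 no-rounding path uses 0/add28).
 */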
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

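/*
 * CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the degenerate cases where x == 0 or
 * y == 0: only two of the four weights are nonzero, so the row reduces to
 * (vA*src0 + vE*src1 + 32) >> 6 with vE = vB + vC supplied by the caller.
 */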
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

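/*
 * noop/add28 are the BIAS2 hooks for CHROMA_MC8_ALTIVEC_CORE: the H.264 MC8
 * path passes +32 as BIAS1 and leaves the sum untouched (noop), while the
 * VC-1 "no rounding" path passes 0 as BIAS1 and adds 28 afterwards (add28).
 */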
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
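    /* src may be unaligned: if it starts in the upper half of a 16-byte line,
     * a second vector load is needed (loadSecond); if it starts at byte 15,
     * vec_lvsl(1, src) wraps around to 0 and the realignment permute would
     * pick the wrong input, so that case uses the second load directly
     * (reallyBadAlign). */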
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

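    /* fperm merges the 8 freshly computed bytes with the 8 bytes of the
     * aligned destination vector that must be preserved; which half is kept
     * depends on whether dst itself is 16-byte aligned. */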
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

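    /* ABCD[3] = x*y: only when both x and y are nonzero is the full 2x2
     * bilinear filter needed; otherwise the weights fold into vE = vB + vC
     * and the cheaper two-tap path is used. */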
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

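/*
 * VC-1 variant: same bilinear interpolation as above, but with the
 * "no rounding" bias of 28 instead of 32, applied after the
 * multiply-accumulate chain via add28().
 */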
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
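    /* v28ss = (1 << 5) - 4 = 28, the reduced rounding constant added by
     * add28() after the multiply-accumulate chain. */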
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {


            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE