h264qpel_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>

#include "libavutil/mem.h"

#ifdef DEBUG
/* a 16-byte aligned pointer has its low four address bits clear */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    /* six-tap filter weights 5 and 20, rounding constant 16, shift amount 5 */
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0; i < 16; i++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* Build the six shifted source views src-2 .. src+3; for align 11..15
         * some views coincide with srcR2 or spill into a third 16-byte load. */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* widen the unsigned bytes to signed 16-bit halves (A = high half, B = low half) */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* 20*(p0+p1) - 5*(m1+p2) + (m2+p3) + 16, then arithmetic shift right by 5 */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif
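
/* For reference, the arithmetic the vector loop above performs per output pixel is the
 * standard H.264 six-tap luma half-pel filter (1, -5, 20, 20, -5, 1) with +16 rounding
 * and a shift by 5.  A minimal scalar sketch of the put variant (illustrative only, not
 * part of this file; the helper name is hypothetical, av_clip_uint8() is from libavutil,
 * and the avg variant of OP_U8_ALTIVEC additionally averages with the existing dst[x]):
 *
 *     static void h_lowpass_scalar(uint8_t *dst, const uint8_t *src,
 *                                  int dstStride, int srcStride)
 *     {
 *         for (int y = 0; y < 16; y++) {
 *             for (int x = 0; x < 16; x++) {
 *                 int v = 20 * (src[x]     + src[x + 1])
 *                       -  5 * (src[x - 1] + src[x + 2])
 *                       +      (src[x - 2] + src[x + 3]);
 *                 dst[x] = av_clip_uint8((v + 16) >> 5);
 *             }
 *             src += srcStride;
 *             dst += dstStride;
 *         }
 *     }
 */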

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    /* preload the five rows above the first new row of the six-row window */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0; i < 16; i++) {
        /* load the new bottom row of the window */
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down by one row */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
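
/* The vertical pass applies the same (1, -5, 20, 20, -5, 1) filter down a column, so the
 * vector loop above keeps a six-row window in registers and rotates it each iteration
 * instead of reloading five rows.  A scalar sketch of one column of the put variant
 * (illustrative only; not part of this file):
 *
 *     for (int y = 0; y < 16; y++) {
 *         int v = 20 * (src[0]              + src[srcStride])
 *               -  5 * (src[-srcStride]     + src[2 * srcStride])
 *               +      (src[-2 * srcStride] + src[3 * srcStride]);
 *         *dst = av_clip_uint8((v + 16) >> 5);
 *         src += srcStride;
 *         dst += dstStride;
 *     }
 */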

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    /* first pass: horizontal filter, un-rounded results stored to tmp
     * (21 rows are needed to feed the six-tap vertical pass below) */
    src -= (2 * srcStride);
    for (i = 0; i < 21; i++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* second pass: vertical filter over tmp in 32-bit precision, +512, shift right by 10 */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0; i < 16; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window of intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* 32-bit products of the 16-bit sums, split into even/odd element lanes */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* sign-extend sum3 to 32 bits: arithmetic shift for the even lanes, multiply by 1 for the odd */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        /* re-interleave the even/odd lanes back into pixel order */
        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
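
/* The hv (centre half-pel) case is the two-pass combination computed above: the first loop
 * stores 21 rows of un-rounded horizontal filter results in tmp (range -2550..10710, which
 * fits in 16 bits), and the second loop runs the same six-tap filter vertically over tmp in
 * 32-bit precision, rounding once at the end with +512 and a shift by 10.  Scalar sketch of
 * one output pixel (illustrative only; the helper name is hypothetical):
 *
 *     static int hfilt(const uint8_t *s)   // first pass, no rounding
 *     {
 *         return 20 * (s[0] + s[1]) - 5 * (s[-1] + s[2]) + (s[-2] + s[3]);
 *     }
 *
 *     // second pass, with tmp pointing at the intermediate row of the current output row:
 *     int v = 20 * (tmp[0]               + tmp[tmpStride])
 *           -  5 * (tmp[-tmpStride]      + tmp[2 * tmpStride])
 *           +      (tmp[-2 * tmpStride]  + tmp[3 * tmpStride]);
 *     dst[x] = av_clip_uint8((v + 512) >> 10);
 */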