#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(((unsigned long)(ptr) & 0x0000000F) == 0);
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
/* Horizontal 6-tap lowpass over a 16x16 block. */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride)
{
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
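    /* Constants for the H.264 6-tap luma filter (1, -5, 20, 20, -5, 1).
     * vec_splat_s16() only accepts immediates in -16..15, so 20 is built as
     * 5 << 2 and the rounding term 16 as 1 << 4; v5us is the final >> 5
     * shift count. */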
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);
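    /* Offset of (src - 2) within its 16-byte line; it is used in the loop
     * below to choose how the six shifted source windows are assembled from
     * aligned quadword loads. */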
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;
    for (i = 0; i < 16; i++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
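        /* vec_ld ignores the low four address bits, so srcR1/srcR2 are the
         * aligned quadwords containing src-2 and src+14; vec_perm with the
         * vec_lvsl permutes then extracts the unaligned 16-byte windows at
         * src-2 .. src+3 (the usual AltiVec unaligned-load idiom).  A third
         * load, srcR3, is needed when a window extends past srcR2. */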
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
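        /* Per pixel this is the standard H.264 half-sample filter
         *   clip((20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5)
         * computed on the two eight-pixel halves (the A/B variables) as
         * 16-bit intermediates; vec_packsu does the final clip to 0..255. */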
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);
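        /* OP_U8_ALTIVEC is supplied by the file that includes this template:
         * presumably a plain move for the put_ variants and an average with
         * the loaded dst pixels for the avg_ variants. */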
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
/* Vertical 6-tap lowpass over a 16x16 block. */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride)
{
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);
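    /* Start two rows above the block so the first output row sees its M2/M1
     * taps; the five rows preceding P3 are preloaded below. */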
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
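    /* The six filter rows form a sliding window: the loop below loads only
     * the new bottom row (P3) each iteration, so every source row is read
     * from memory once. */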
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
    for (i = 0; i < 16; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3  = vec_perm(srcP3a, srcP3b, perm);
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
/* Horizontal then vertical 6-tap lowpass over a 16x16 block; tmp holds the
 * 16-bit intermediate results between the two passes. */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
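    /* Second-pass constants: 512 (1 << 9) is the rounding term and v10ui the
     * final >> 10 shift of the combined two-pass filter; v16ui (1 << 4 = 16)
     * is used as a 16-bit shift count to widen packed 16-bit lanes to 32
     * bits. */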
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
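    /* First pass: horizontal 6-tap filter over 16 + 5 = 21 rows (the
     * vertical filter needs two extra rows above and three below), storing
     * unclamped 16-bit intermediates to tmp. */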
    src -= (2 * srcStride);
    for (i = 0; i < 21; i++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);

        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);

        vec_u8 srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);
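        /* The intermediates are stored without rounding or shifting; the
         * second pass applies the combined (x + 512) >> 10, so no precision
         * is lost between the passes. */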
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0; i < 16; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;
        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
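        /* vec_mule/vec_mulo give 32-bit products of the even/odd 16-bit
         * lanes.  For the coefficient-1 term the even lanes are widened for
         * free: on big-endian AltiVec the even element sits in the high half
         * of each 32-bit pair, so an arithmetic shift right by 16 extracts it
         * sign-extended, while the odd lanes are widened by multiplying with
         * v1ss.  The even/odd halves are re-interleaved at the end with the
         * mperm permute. */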
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
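        /* Per pixel this is (20*sum1 - 5*sum2 + sum3 + 512) >> 10, the
         * standard rounding for the combined horizontal+vertical pass,
         * evaluated separately on the even and odd 32-bit lanes; the packs
         * below saturate the result back to 8 bits. */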
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);