/*
 * NOTE(review): this is an excerpt of a function body. The function
 * signature, the loads that fill r0..r7, and many intermediate statements
 * are not visible here. The integer at the start of each line appears to
 * be an original line number; gaps in that numbering mark missing lines.
 *
 * Visible outline:
 *   1. widen eight vectors of signed 16-bit lanes (r0..r7) into pairs of
 *      32-bit vectors (rN_r / rN_l);
 *   2. compute the terms A..H, Ad, Bd once from the _r vectors and once
 *      from the _l vectors;
 *   3. after two calls whose heads are not shown, compute the same terms
 *      from r0..r3 (and later r4..r7), now pairing _r with _l inputs, and
 *      build a per-lane mask that separates lanes where only r0_r (resp.
 *      r4_r) is non-zero from all other lanes.
 */
28 v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
29 v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
30 r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
31 v4i32
A,
B,
C,
D, Ad, Bd, Cd, Dd,
E,
F,
G,
H;
32 v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
34 v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
35 v4i32 c0,
c1,
c2, c3, c4, c5, c6, c7;
36 v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
/*
 * Byte indices 0, 4, 8, ... 28 in the first eight positions.
 * NOTE(review): presumably a shuffle control that picks the low byte of
 * each 32-bit lane out of two vectors; its use is not visible here.
 */
39 v16i8
mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
/*
 * Splatted 32-bit multipliers. Every product below is followed by >> 16,
 * so they act as 16-bit fractions. Numerically:
 *   64277 ~ 65536*cos(1*pi/16)   60547 ~ 65536*cos(2*pi/16)
 *   54491 ~ 65536*cos(3*pi/16)   46341 ~ 65536*cos(4*pi/16)
 *   36410 ~ 65536*cos(5*pi/16)   25080 ~ 65536*cos(6*pi/16)
 *   12785 ~ 65536*cos(7*pi/16)
 */
40 v4i32 cnst64277w = {64277, 64277, 64277, 64277};
41 v4i32 cnst60547w = {60547, 60547, 60547, 60547};
42 v4i32 cnst54491w = {54491, 54491, 54491, 54491};
43 v4i32 cnst46341w = {46341, 46341, 46341, 46341};
44 v4i32 cnst36410w = {36410, 36410, 36410, 36410};
45 v4i32 cnst25080w = {25080, 25080, 25080, 25080};
46 v4i32 cnst12785w = {12785, 12785, 12785, 12785};
/* NOTE(review): cnst8w and cnst2048w are not used in the visible lines. */
47 v4i32 cnst8w = {8, 8, 8, 8};
48 v4i32 cnst2048w = {2048, 2048, 2048, 2048};
49 v4i32 cnst128w = {128, 128, 128, 128};
/*
 * Widen each rN from 16-bit to 32-bit lanes. `sign` is the per-lane
 * result of the signed "rN < 0" compare; interleaving it with rN (right
 * half -> rN_r, left half -> rN_l) places it in the upper 16 bits of each
 * 32-bit lane.
 * NOTE(review): this yields sign extension given the usual MSA semantics
 * of clti_s/ilvr/ilvl (all-ones for true lanes) - confirm against the
 * intrinsic reference.
 */
53 sign = __msa_clti_s_h(r0, 0);
54 r0_r = (v4i32) __msa_ilvr_h(sign, r0);
55 r0_l = (v4i32) __msa_ilvl_h(sign, r0);
56 sign = __msa_clti_s_h(r1, 0);
57 r1_r = (v4i32) __msa_ilvr_h(sign, r1);
58 r1_l = (v4i32) __msa_ilvl_h(sign, r1);
59 sign = __msa_clti_s_h(r2, 0);
60 r2_r = (v4i32) __msa_ilvr_h(sign, r2);
61 r2_l = (v4i32) __msa_ilvl_h(sign, r2);
62 sign = __msa_clti_s_h(r3, 0);
63 r3_r = (v4i32) __msa_ilvr_h(sign, r3);
64 r3_l = (v4i32) __msa_ilvl_h(sign, r3);
65 sign = __msa_clti_s_h(r4, 0);
66 r4_r = (v4i32) __msa_ilvr_h(sign, r4);
67 r4_l = (v4i32) __msa_ilvl_h(sign, r4);
68 sign = __msa_clti_s_h(r5, 0);
69 r5_r = (v4i32) __msa_ilvr_h(sign, r5);
70 r5_l = (v4i32) __msa_ilvl_h(sign, r5);
71 sign = __msa_clti_s_h(r6, 0);
72 r6_r = (v4i32) __msa_ilvr_h(sign, r6);
73 r6_l = (v4i32) __msa_ilvl_h(sign, r6);
74 sign = __msa_clti_s_h(r7, 0);
75 r7_r = (v4i32) __msa_ilvr_h(sign, r7);
76 r7_l = (v4i32) __msa_ilvl_h(sign, r7);
/*
 * Terms from the _r halves. Odd inputs (1, 7, 3, 5) feed A..D, with Ad/Bd
 * scaling (A - C) and (B - D) by 46341 >> 16. Even inputs (0, 4, 2, 6)
 * feed E..H. The statements that combine these terms are not visible.
 */
79 A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
80 B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
81 C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
82 D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
83 Ad = ((
A -
C) * cnst46341w) >> 16;
84 Bd = ((
B -
D) * cnst46341w) >> 16;
87 E = ((r0_r + r4_r) * cnst46341w) >> 16;
88 F = ((r0_r - r4_r) * cnst46341w) >> 16;
89 G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
90 H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
/* Same formulas again, now reading the _l halves. */
107 A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
108 B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
109 C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
110 D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
111 Ad = ((
A -
C) * cnst46341w) >> 16;
112 Bd = ((
B -
D) * cnst46341w) >> 16;
115 E = ((r0_l + r4_l) * cnst46341w) >> 16;
116 F = ((r0_l - r4_l) * cnst46341w) >> 16;
117 G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
118 H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
/*
 * Tails of two multi-line calls whose first lines are missing; they take
 * r0_r..r3_r and r0_l..r3_l as their last arguments.
 * NOTE(review): presumably transposes that set up the second pass -
 * confirm against the full file.
 */
136 r0_r, r1_r, r2_r, r3_r);
138 r0_l, r1_l, r2_l, r3_l);
/*
 * Same A..H formulas, but each pair now mixes an _r and an _l vector:
 * (r1_r, r3_l), (r3_r, r1_l), (r0_r, r0_l), (r2_r, r2_l) stand where
 * inputs (1, 7), (3, 5), (0, 4), (2, 6) stood above.
 */
139 A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
140 B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
141 C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
142 D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
143 Ad = ((
A -
C) * cnst46341w) >> 16;
144 Bd = ((
B -
D) * cnst46341w) >> 16;
147 E = ((r0_r + r0_l) * cnst46341w) >> 16;
149 F = ((r0_r - r0_l) * cnst46341w) >> 16;
155 G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
156 H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
/*
 * OR together every input of this group except r0_r, then compare the
 * result with 0 per 32-bit lane: sign_t marks lanes where all seven of
 * those inputs are zero. sign_l and sign_t are declared in lines not
 * shown.
 */
191 sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
192 sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
193 sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
194 sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
195 sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
196 sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
197 sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
/*
 * Values derived from r0_r alone: multiply by 46341, add 8 << 16, shift
 * right by 20; Bdd is the same value plus 128.
 * NOTE(review): presumably the shortcut result for lanes flagged by
 * sign_t - confirm.
 */
198 Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
200 Bdd = Add + cnst128w;
/*
 * Keep Ad..Hd only in lanes selected by sign_t (other lanes become 0).
 * Cd..Hd, and presumably new values of Ad/Bd, are assigned in lines not
 * shown.
 */
221 Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
222 Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
223 Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
224 Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
225 Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
226 Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
227 Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
228 Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
/* Flip the mask: compare sign_t itself with 0, lane by lane. */
229 sign_t = __msa_ceqi_w(sign_t, 0);
/* Keep A..H only in the complementary lanes. */
230 A = (v4i32)__msa_and_v((v16u8)
A, (v16u8)sign_t);
231 B = (v4i32)__msa_and_v((v16u8)
B, (v16u8)sign_t);
232 C = (v4i32)__msa_and_v((v16u8)
C, (v16u8)sign_t);
233 D = (v4i32)__msa_and_v((v16u8)
D, (v16u8)sign_t);
234 E = (v4i32)__msa_and_v((v16u8)
E, (v16u8)sign_t);
235 F = (v4i32)__msa_and_v((v16u8)
F, (v16u8)sign_t);
236 G = (v4i32)__msa_and_v((v16u8)
G, (v16u8)sign_t);
237 H = (v4i32)__msa_and_v((v16u8)
H, (v16u8)sign_t);
/*
 * From here the sequence above repeats for r4..r7: two call tails, the
 * A..H terms with mixed _r/_l inputs, the zero test on everything except
 * r4_r, the r4_r-only values, and the two complementary maskings.
 */
249 r4_r, r5_r, r6_r, r7_r);
251 r4_l, r5_l, r6_l, r7_l);
252 A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
253 B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
254 C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
255 D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
256 Ad = ((
A -
C) * cnst46341w) >> 16;
257 Bd = ((
B -
D) * cnst46341w) >> 16;
260 E = ((r4_r + r4_l) * cnst46341w) >> 16;
262 F = ((r4_r - r4_l) * cnst46341w) >> 16;
268 G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
269 H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
/* Zero test over r5_r, r6_r, r7_r and r4_l..r7_l (r4_r is left out). */
299 sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
300 sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
301 sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
302 sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
303 sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
304 sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
305 sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
/* r4_r-only values, same arithmetic as for r0_r above. */
306 Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
308 Bdd = Add + cnst128w;
/* Ad..Hd survive only in lanes selected by sign_t. */
329 Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
330 Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
331 Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
332 Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
333 Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
334 Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
335 Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
336 Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
/* Flip the mask, then keep A..H only in the complementary lanes. */
337 sign_t = __msa_ceqi_w(sign_t, 0);
338 A = (v4i32)__msa_and_v((v16u8)
A, (v16u8)sign_t);
339 B = (v4i32)__msa_and_v((v16u8)
B, (v16u8)sign_t);
340 C = (v4i32)__msa_and_v((v16u8)
C, (v16u8)sign_t);
341 D = (v4i32)__msa_and_v((v16u8)
D, (v16u8)sign_t);
342 E = (v4i32)__msa_and_v((v16u8)
E, (v16u8)sign_t);
343 F = (v4i32)__msa_and_v((v16u8)
F, (v16u8)sign_t);
344 G = (v4i32)__msa_and_v((v16u8)
G, (v16u8)sign_t);
345 H = (v4i32)__msa_and_v((v16u8)
H, (v16u8)sign_t);
/*
 * NOTE(review): excerpt of a function body; its signature and the
 * arithmetic between the load and the stores are not visible. The integer
 * at the start of each line appears to be an original line number.
 *
 * i is block[0] biased by 15 and shifted right by 5 bits. How i is
 * applied to the loaded rows happens in lines not shown.
 */
384 int i = (
block[0] + 15) >> 5;
386 v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
387 v4i32 c0,
c1,
c2, c3, c4, c5, c6, c7;
388 v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
389 v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
/* Byte indices 0, 4, 8, ... 28; its use is not visible in this excerpt. */
390 v16i8
mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
/*
 * NOTE(review): LD_SB8 is a macro defined elsewhere; presumably it loads
 * eight vectors d0..d7 starting at dest, line_size bytes apart - confirm.
 */
393 LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
/*
 * Write d1..d7 back to dest + k * line_size for k = 1..7.
 * NOTE(review): ST_D1 presumably stores the 64-bit element with the given
 * index (0 here). The matching store of d0 to dest is not among the
 * visible lines.
 */
434 ST_D1(d1, 0, dest + line_size);
435 ST_D1(d2, 0, dest + 2 * line_size);
436 ST_D1(d3, 0, dest + 3 * line_size);
437 ST_D1(d4, 0, dest + 4 * line_size);
438 ST_D1(d5, 0, dest + 5 * line_size);
439 ST_D1(d6, 0, dest + 6 * line_size);
440 ST_D1(d7, 0, dest + 7 * line_size);
446 int *bounding_values)
/*
 * NOTE(review): excerpt of a function; only the end of its parameter list
 * and parts of its body are visible. r0, temp_16, temp_32, nstride and
 * the statements that fill c0..c3 come from lines not shown. The integer
 * at the start of each line appears to be an original line number.
 *
 * Visible outline: load four vectors starting at first_pixel + 2*nstride,
 * form (c0 - c3) + 3*(c2 - c1) per 16-bit lane, map eight values through
 * the bounding_values table, and store d1 and d2 back at
 * first_pixel + nstride and first_pixel.
 */
449 v4i32 e0, e1, f0, f1, g0, g1;
451 v16i8 d0, d1, d2, d3;
452 v8i16 c0,
c1,
c2, c3;
/* NOTE(review): cnst4h is not used in the visible lines. */
454 v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
455 cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
/* Byte indices 0, 4, 8, ... 28; its use is not visible in this excerpt. */
456 v16i8
mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
/*
 * NOTE(review): LD_SB4 is a macro defined elsewhere; presumably it loads
 * d0..d3 from four addresses `stride` apart - confirm.
 */
460 LD_SB4(first_pixel + nstride * 2,
stride, d0, d1, d2, d3);
/* Filter input: outer difference plus three times the inner difference. */
463 r0 = (c0 - c3) + (
c2 -
c1) * cnst3h;
/*
 * Scalar table lookup for eight entries: each temp_16[i] indexes
 * bounding_values and the result is written to temp_32[i].
 * NOTE(review): temp_16 is presumably r0 spilled to memory, in which case
 * negative indices are possible; the caller's table layout must allow
 * that - confirm.
 */
468 for (
int i = 0;
i < 8;
i++)
469 temp_32[
i] = bounding_values[temp_16[
i]];
/* Bring the looked-up values back into vectors e0 and e1. */
470 LD_SW2(temp_32, 4, e0, e1);
/* Only these two stores are visible: d1 and d2, element index 0. */
481 ST_D1(d1, 0, first_pixel + nstride);
482 ST_D1(d2, 0, first_pixel);
486 int *bounding_values)
/*
 * NOTE(review): excerpt of a function; only the end of its parameter list
 * and parts of its body are visible. r0, temp_16, temp_32 and the
 * statements that fill c0..c7 come from lines not shown, and no store is
 * visible. The integer at the start of each line appears to be an
 * original line number.
 *
 * Visible outline: load eight vectors starting two bytes before
 * first_pixel, form (c0 - c3) + 3*(c2 - c1) per 16-bit lane, and map
 * eight values through the bounding_values table.
 */
488 v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
489 v8i16 c0,
c1,
c2, c3, c4, c5, c6, c7;
491 v4i32 e0, e1, f0, f1, g0, g1;
/* NOTE(review): cnst4h is not used in the visible lines. */
493 v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
494 cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
/*
 * Byte indices alternating between n and n + 16 (0, 16, 4, 20, ...).
 * NOTE(review): presumably interleaves the low bytes of 32-bit lanes from
 * two vectors; its use is not visible here.
 */
495 v16i8
mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
/*
 * NOTE(review): LD_SB8 is a macro defined elsewhere; presumably it loads
 * d0..d7 from eight addresses `stride` apart - confirm.
 */
499 LD_SB8(first_pixel - 2,
stride, d0, d1, d2, d3, d4, d5, d6, d7);
/* Tail of a multi-line call whose first line is missing. */
505 c0,
c1,
c2, c3, c4, c5, c6, c7);
/* Filter input: outer difference plus three times the inner difference. */
506 r0 = (c0 - c3) + (
c2 -
c1) * cnst3h;
/*
 * Scalar table lookup for eight entries: each temp_16[i] indexes
 * bounding_values and the result is written to temp_32[i].
 * NOTE(review): temp_16 is presumably r0 spilled to memory, in which case
 * negative indices are possible; the caller's table layout must allow
 * that - confirm.
 */
512 for (
int i = 0;
i < 8;
i++)
513 temp_32[
i] = bounding_values[temp_16[
i]];
/* Bring the looked-up values back into vectors e0 and e1. */
514 LD_SW2(temp_32, 4, e0, e1);
/*
 * NOTE(review): excerpt of a function body; its signature, the loads, and
 * the declarations of a0..a3, b0..b3, e0..e2, f0..f2, t0..t3 and fmask
 * are not visible. The integer at the start of each line appears to be an
 * original line number; gaps in that numbering mark missing lines.
 *
 * Visible outline: regroup 64-bit halves of c0..c3 into a0..a3 and then
 * b0..b3, and for each (aN, bN) pair compute ((aN ^ bN) & fmask) >> 1 and
 * (aN & bN). These are the two terms of the identity
 *   (a & b) + ((a ^ b) >> 1) == floor((a + b) / 2),
 * but the addition that would combine them is not among the visible
 * lines.
 */
532 v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
533 v16i8 c0,
c1,
c2, c3;
/*
 * Byte indices that alternate 4-byte groups from the first vector (0..3,
 * 4..7) and the second (16..19, 20..23). Its use is not visible here.
 */
538 v16i8
mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
/*
 * a0/a1 take the even 64-bit elements of (c0, c1) and (c2, c3); a2/a3
 * take the odd ones.
 */
545 a0 = (v4i32) __msa_pckev_d((v2i64)
c1, (v2i64)c0);
546 a2 = (v4i32) __msa_pckod_d((v2i64)
c1, (v2i64)c0);
547 a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)
c2);
548 a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)
c2);
/*
 * Same regrouping into b0..b3.
 * NOTE(review): c0..c3 are presumably reloaded from the second source in
 * the lines missing between these two groups - confirm.
 */
553 b0 = (v4i32) __msa_pckev_d((v2i64)
c1, (v2i64)c0);
554 b2 = (v4i32) __msa_pckod_d((v2i64)
c1, (v2i64)c0);
555 b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)
c2);
556 b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)
c2);
/*
 * Pair 0: t0 = ((a0 ^ b0) & fmask) >> 1 on unsigned 32-bit lanes.
 * NOTE(review): fmask presumably clears the low bit of every byte so the
 * 32-bit shift cannot leak a bit into the neighbouring byte - confirm.
 */
558 e0 = (v4i32) __msa_xor_v((v16u8)
a0, (v16u8)
b0);
559 e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
560 t0 = ((v4u32)e0) >> 1;
/*
 * e2 = a0 & b0. NOTE(review): e2 is overwritten for pair 1 below with no
 * visible use in between; the statement that consumes it is presumably
 * on a missing line.
 */
561 e2 = (v4i32) __msa_and_v((v16u8)
a0, (v16u8)
b0);
/* Pair 1: same steps into e1 / t1 / e2. */
564 e1 = (v4i32) __msa_xor_v((v16u8)
a1, (v16u8)
b1);
565 e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
566 t1 = ((v4u32)e1) >> 1;
567 e2 = (v4i32) __msa_and_v((v16u8)
a1, (v16u8)
b1);
/* Pair 2: same steps into f0 / t2 / f2. */
570 f0 = (v4i32) __msa_xor_v((v16u8)
a2, (v16u8)
b2);
571 f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
572 t2 = ((v4u32)f0) >> 1;
573 f2 = (v4i32) __msa_and_v((v16u8)
a2, (v16u8)
b2);
/* Pair 3: same steps into f1 / t3 / f2 (f2 is reused like e2 above). */
576 f1 = (v4i32) __msa_xor_v((v16u8)
a3, (v16u8)
b3);
577 f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
578 t3 = ((v4u32)f1) >> 1;
579 f2 = (v4i32) __msa_and_v((v16u8)
a3, (v16u8)
b3);
/*
 * NOTE(review): ST_W8 is a macro defined elsewhere; presumably it stores
 * words 0..3 of its first vector and words 0..3 of its second to eight
 * rows `stride` apart. The first call writes at dst, the second at
 * dst + 4.
 */
582 ST_W8(
t0,
t1, 0, 1, 2, 3, 0, 1, 2, 3, dst,
stride);
583 ST_W8(
t2,
t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4,
stride);
587 for (
i = 0;
i <
h;
i++) {