 28 static inline uint64_t avg2(uint64_t a, uint64_t b)
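/*
 * avg2() averages two groups of eight pixels packed one byte per lane
 * into 64-bit words.  A minimal sketch of how such a per-byte rounding
 * average is commonly written without SIMD instructions follows; the
 * name avg2_sketch and the BYTE_VEC definition are assumptions made for
 * illustration, not this file's own body.
 */
#include <stdint.h>

/* Assumed helper: replicate one byte value into all eight byte lanes. */
#define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

static inline uint64_t avg2_sketch(uint64_t a, uint64_t b)
{
    /* (a | b) - (((a ^ b) & 0xfe..) >> 1) computes (a + b + 1) / 2 in
       every byte lane; masking with 0xfe keeps the shifted difference
       bits from leaking into the neighbouring byte.                  */
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}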
 33 static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
 35     uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
 39     uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
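/*
 * Lines 35 and 39 start the usual split of every byte into its high six
 * bits (summed in r1) and its low two bits (summed with a rounding bias
 * in r2).  A sketch of how the two halves are typically combined into a
 * per-byte average of four values; avg4_sketch is an illustrative name,
 * the continuation lines are assumed, and the BYTE_VEC macro sketched
 * above is reused.
 */
static inline uint64_t avg4_sketch(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    /* High six bits of each byte, pre-shifted so that four summands
       cannot carry into the neighbouring byte (4 * 63 <= 255).      */
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    /* Low two bits of each byte, summed with a bias of 2 for rounding,
       divided by four and masked back into the low two bits.         */
    uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
                   + (l2 & BYTE_VEC(0x03))
                   + (l3 & BYTE_VEC(0x03))
                   + (l4 & BYTE_VEC(0x03))
                   + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;   /* per byte: (l1 + l2 + l3 + l4 + 2) / 4 */
}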
 51     if ((size_t) pix2 & 0x7) {   /* pix2 not 8-byte aligned */
 85     if ((size_t) pix2 & 0x7) {
 88         uint64_t p1_l, p1_r, p2_l, p2_r;
104         uint64_t p1_l, p1_r, p2_l, p2_r;
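/*
 * The branches on lines 51 and 85 test the low three address bits of
 * pix2 to choose between aligned and unaligned 64-bit loads of the
 * reference block.  A portable sketch of the unaligned case follows;
 * the name uldq_sketch and the memcpy formulation are illustrative
 * assumptions, not this file's ldq()/uldq() helpers.
 */
#include <string.h>

static inline uint64_t uldq_sketch(const void *p)
{
    uint64_t v;
    memcpy(&v, p, sizeof v);   /* the compiler lowers this to whatever
                                  unaligned-safe loads the CPU offers */
    return v;
}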
107         p1_r = ldq(pix1 + 8);
109         p2_r = ldq(pix2 + 8);
125     uint64_t disalign = (size_t) pix2 & 0x7;
130         uint64_t p1_l, p1_r, p2_l, p2_r;
134         p1_r = ldq(pix1 + 8);
            /* average every byte with its right-hand neighbour (half-pel in x) */
137         p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
138         p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
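/*
 * Lines 137 and 138 average each byte with its right-hand neighbour:
 * shifting the little-endian word right by one byte moves pixel i+1
 * into lane i, and the vacated top lane is refilled with the first
 * pixel of the following word (or with pix2[16] for the last lane).
 * A sketch of the same step for one full 16-pixel row, reusing the
 * avg2_sketch/uldq_sketch helpers above (names are assumptions):
 */
static inline void halfpel_x_row_sketch(const uint8_t *row,
                                        uint64_t *out_l, uint64_t *out_r)
{
    uint64_t l = uldq_sketch(row);        /* pixels 0..7  */
    uint64_t r = uldq_sketch(row + 8);    /* pixels 8..15 */

    *out_l = avg2_sketch(l, (l >> 8) | ((uint64_t) r       << 56));
    *out_r = avg2_sketch(r, (r >> 8) | ((uint64_t) row[16] << 56));
}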
152         uint64_t p1_l, p1_r, p2_l, p2_r;
156         p1_r = ldq(pix1 + 8);
171         uint64_t disalign1 = disalign + 1;
172         uint64_t p1_l, p1_r, p2_l, p2_r;
176         p1_r = ldq(pix1 + 8);
199     if ((size_t) pix2 & 0x7) {
200         uint64_t t, p2_l, p2_r;
206             uint64_t p1_l, p1_r, np2_l, np2_r;
210             p1_r = ldq(pix1 + 8);
227         p2_r = ldq(pix2 + 8);
229             uint64_t p1_l, p1_r, np2_l, np2_r;
232             p1_r = ldq(pix1 + 8);
235             np2_r = ldq(pix2 + 8);
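/*
 * In the fragments above, p2_l/p2_r hold one reference row and
 * np2_l/np2_r ("next p2") evidently the row below it; vertical half-pel
 * interpolation is then just a byte-wise average of the two rows.  A
 * sketch of one such step, reusing avg2_sketch from above (the function
 * and parameter names are illustrative assumptions):
 */
static inline void halfpel_y_step_sketch(uint64_t p2_l, uint64_t p2_r,
                                         uint64_t np2_l, uint64_t np2_r,
                                         uint64_t *out_l, uint64_t *out_r)
{
    *out_l = avg2_sketch(p2_l, np2_l);   /* interpolated pixels 0..7  */
    *out_r = avg2_sketch(p2_r, np2_r);   /* interpolated pixels 8..15 */
}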
253         uint64_t p2_l, p2_r, p2_x;
256             p1_r = ldq(pix1 + 8);
258             if ((size_t) pix2 & 0x7) {
260                 p2_r = uldq(pix2 + 8);
261                 p2_x = (uint64_t) pix2[16] << 56;   /* 17th pixel, right neighbour of the last lane */
264                 p2_r = ldq(pix2 + 8);
265                 p2_x = ldq(pix2 + 16) << 56;
269             uint64_t np1_l, np1_r;
270             uint64_t np2_l, np2_r, np2_x;
276             np1_r = ldq(pix1 + 8);
278             if ((size_t) pix2 & 0x7) {
280                 np2_r = uldq(pix2 + 8);
281                 np2_x = (uint64_t) pix2[16] << 56;
284                 np2_r = ldq(pix2 + 8);
285                 np2_x = ldq(pix2 + 16) << 56;
289             avg4( p2_l, ( p2_l >> 8) | ((uint64_t) p2_r << 56),
290                  np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
292             avg4( p2_r, ( p2_r >> 8) | ((uint64_t) p2_x),
293                  np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
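/*
 * The avg4() calls on lines 289-293 implement the two-dimensional
 * half-pel case: every output byte is the rounded mean of a 2x2
 * neighbourhood (current/next row, current/next column), with the
 * p2_x/np2_x words supplying the 17th pixel for the last lane.  A
 * sketch of one such row step, reusing avg4_sketch from above (the
 * function and parameter names are illustrative assumptions):
 */
static inline void halfpel_xy_step_sketch(uint64_t p2_l, uint64_t p2_r, uint64_t p2_x,
                                          uint64_t np2_l, uint64_t np2_r, uint64_t np2_x,
                                          uint64_t *out_l, uint64_t *out_r)
{
    /* Left half: the right-hand neighbour of lane 7 comes from p2_r/np2_r. */
    *out_l = avg4_sketch( p2_l, ( p2_l >> 8) | ((uint64_t) p2_r << 56),
                         np2_l, (np2_l >> 8) | ((uint64_t)np2_r << 56));
    /* Right half: p2_x/np2_x already carry the 17th pixel in the top lane. */
    *out_r = avg4_sketch( p2_r, ( p2_r >> 8) | p2_x,
                         np2_r, (np2_r >> 8) | np2_x);
}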