static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
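/* Illustrative reference for avg2() (the avg2_ref name is ours, not part
 * of the original file): per byte it computes the average of a and b,
 * rounded up.  The branch-free form above relies on the identity
 * a + b == 2*(a | b) - (a ^ b); masking the xor with 0xfe before the
 * shift keeps the per-byte halving from borrowing across byte lanes. */
static inline uint64_t avg2_ref(uint64_t a, uint64_t b)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t x = (a >> (8 * i)) & 0xff;
        uint64_t y = (b >> (8 * i)) & 0xff;

        r |= ((x + y + 1) >> 1) << (8 * i); /* rounded-up byte average */
    }
    return r;
}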
 
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);

    return r1 + r2;
}
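/* Illustrative reference for avg4() (avg4_ref is our name): per byte it
 * computes (l1 + l2 + l3 + l4 + 2) >> 2.  avg4() reaches the same result
 * without cross-byte carries by summing the high six bits of every byte
 * (r1) separately from the low two bits plus the rounding constant 0x02
 * (r2). */
static inline uint64_t avg4_ref(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t s = ((l1 >> (8 * i)) & 0xff)
                   + ((l2 >> (8 * i)) & 0xff)
                   + ((l3 >> (8 * i)) & 0xff)
                   + ((l4 >> (8 * i)) & 0xff);

        r |= ((s + 2) >> 2) << (8 * i); /* rounded four-way average */
    }
    return r;
}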
 
int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 8;

    if ((size_t) pix2 & 0x7) {
        /* pix2 is unaligned: fetch it with an unaligned load */
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
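/* perr() is the MVI PERR operation: the sum of the absolute differences of
 * the eight byte pairs in its two operands, i.e. an 8-pixel SAD in one
 * instruction.  A scalar sketch of the same computation (perr_ref is our
 * name, for illustration only): */
static inline uint64_t perr_ref(uint64_t a, uint64_t b)
{
    uint64_t sum = 0;
    int i;

    for (i = 0; i < 8; i++) {
        int x = (int) ((a >> (8 * i)) & 0xff);
        int y = (int) ((b >> (8 * i)) & 0xff);

        sum += x > y ? x - y : y - x; /* |x - y| */
    }
    return sum;
}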
 
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* pix2 is unaligned */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = uldq(pix2);
            p2_r = uldq(pix2 + 8);

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
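/* pix_abs16x16_mvi() is the 8x8 routine widened to two quadwords per row:
 * every iteration loads 16 bytes from each block and accumulates two PERR
 * results, again testing the alignment of pix2 once, outside the loop. */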
 
int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :)  */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign)  | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign)  | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
        break;
    }

    return result;
}
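/* Notes on the x2 (horizontal half-pel) case above: each predicted byte is
 * the rounded average of a reference pixel and its right-hand neighbour.
 * extql/extqh wrap the Alpha EXTQL/EXTQH byte-extract instructions:
 * extql(l, disalign) | extqh(m, disalign) assembles the unaligned quadword
 * that starts disalign bytes into l, and the same expression with
 * disalign + 1 yields the quadword one pixel further right, so both avg2()
 * operands come from just three aligned loads per row. */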
 
int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;

        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;

        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }

    return result;
}
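/* In the y2 (vertical half-pel) case each reference row is loaded exactly
 * once: the previous row is kept in p2_l/p2_r, averaged against the newly
 * loaded np2_l/np2_r, and the new row then becomes the previous row of the
 * next iteration. */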
 
int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
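/* The xy2 case averages a 2x2 neighbourhood per output byte with avg4():
 * each row contributes its quadword plus the same quadword advanced one
 * pixel ((q >> 8) with the following byte shifted in at the top), and
 * p2_x/np2_x hold the 17th pixel of a row, pre-shifted into position, so
 * the right half has a source for its last byte.  As in the y2 case, each
 * row's loads are reused as the previous row of the next iteration. */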