00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavcodec/dsputil.h"
00023 #include "dsputil_alpha.h"
00024 #include "asm.h"
00025
00026 void get_pixels_mvi(DCTELEM *restrict block,
00027 const uint8_t *restrict pixels, int line_size)
00028 {
00029 int h = 8;
00030
00031 do {
00032 uint64_t p;
00033
00034 p = ldq(pixels);
00035 stq(unpkbw(p), block);
00036 stq(unpkbw(p >> 32), block + 4);
00037
00038 pixels += line_size;
00039 block += 8;
00040 } while (--h);
00041 }
00042
00043 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
00044 int stride) {
00045 int h = 8;
00046 uint64_t mask = 0x4040;
00047
00048 mask |= mask << 16;
00049 mask |= mask << 32;
00050 do {
00051 uint64_t x, y, c, d, a;
00052 uint64_t signs;
00053
00054 x = ldq(s1);
00055 y = ldq(s2);
00056 c = cmpbge(x, y);
00057 d = x - y;
00058 a = zap(mask, c);
00059 d += 4 * a;
00060 signs = zap(-1, c);
00061
00062 stq(unpkbw(d) | (unpkbw(signs) << 8), block);
00063 stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
00064
00065 s1 += stride;
00066 s2 += stride;
00067 block += 8;
00068 } while (--h);
00069 }
00070
/**
 * Average two packed byte vectors lane by lane, rounding up.
 *
 * Relies on the identity (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1).
 * The 0xfe mask drops the low bit of each lane before the shift so that
 * nothing leaks into the neighbouring lane.  BYTE_VEC (asm.h) is assumed
 * to replicate its argument into all eight bytes -- confirm.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t any_bits  = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return any_bits - half_diff;
}
00075
/**
 * Average four packed byte vectors lane by lane: (l1 + l2 + l3 + l4 + 2) >> 2.
 *
 * The upper six bits and the lower two bits of every lane are handled
 * separately so that no partial sum can overflow out of its lane.
 * BYTE_VEC (asm.h) is assumed to replicate its argument into all eight
 * bytes -- confirm.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low_bits  = BYTE_VEC(0x03);
    const uint64_t high_bits = ~low_bits;
    uint64_t quarters;
    uint64_t remainder;

    /* Sum of the four values divided by four, ignoring the low two bits. */
    quarters  = (l1 & high_bits) >> 2;
    quarters += (l2 & high_bits) >> 2;
    quarters += (l3 & high_bits) >> 2;
    quarters += (l4 & high_bits) >> 2;

    /* Rounded contribution of the low two bits (at most 3*4 + 2 per lane). */
    remainder  = l1 & low_bits;
    remainder += l2 & low_bits;
    remainder += l3 & low_bits;
    remainder += l4 & low_bits;
    remainder += BYTE_VEC(0x02);
    remainder  = (remainder >> 2) & low_bits;

    return quarters + remainder;
}
00089
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * pix1 is always read with ldq.  pix2 is read with ldq when its initial
 * address is a multiple of 8 and with uldq otherwise; the choice is made
 * once, before the loop, so every row is read the same way.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, any alignment
 * @param line_size distance in bytes between two rows, for both blocks
 * @param h         number of rows; must be at least 1 because the loops
 *                  test the counter only after the first row
 * @return accumulated perr() over all rows
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if (!((size_t) pix2 & 0x7)) {
        /* pix2 starts on an 8-byte boundary: plain quadword loads. */
        do {
            const uint64_t cur = ldq(pix1);
            const uint64_t ref = ldq(pix2);

            sad  += perr(cur, ref);
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);

        return sad;
    }

    /* NOTE(review): the uldq path is presumably only valid for a really
       misaligned pix2, which is why it is not used unconditionally. */
    do {
        const uint64_t cur = ldq(pix1);
        const uint64_t ref = uldq(pix2);

        sad  += perr(cur, ref);
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);

    return sad;
}
00121
#if 0
/*
 * Compiled out by the surrounding #if 0.
 * NOTE(review): presumably superseded by another implementation; confirm
 * before re-enabling or deleting.
 *
 * Sum of absolute differences over a 16x16 block.  pix1 is read with ldq;
 * pix2 may have any alignment.
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sad = 0;
    int rows_left = 16;

    if (!((size_t) pix2 & 0x7)) {
        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            const uint64_t ref_lo = ldq(pix2);
            const uint64_t ref_hi = ldq(pix2 + 8);

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_lo, ref_lo)
                 + perr(cur_hi, ref_hi);
        } while (--rows_left);
    } else {
        /* Misaligned pix2: assemble each 8-byte half from the two aligned
           quadwords that straddle it. */
        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            const uint64_t mid    = ldq_u(pix2 + 8);
            const uint64_t ref_lo = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            const uint64_t ref_hi = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_lo, ref_lo)
                 + perr(cur_hi, ref_hi);
        } while (--rows_left);
    }

    return sad;
}
#endif
00164
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * horizontal half-pel interpolation of a reference block.
 *
 * Each reference pixel is avg2() of pix2[x] and pix2[x + 1], so 17 bytes
 * of every pix2 row contribute.  The way those bytes are gathered depends
 * on the alignment of the initial pix2, which is examined once.
 *
 * @param v         unused
 * @param pix1      current block, read with ldq
 * @param pix2      reference block, any alignment
 * @param line_size distance in bytes between two rows, for both blocks
 * @param h         number of rows; must be at least 1
 * @return accumulated perr() over all rows
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint64_t shift = (size_t) pix2 & 0x7;
    int sad = 0;

    if (shift == 0) {
        /* Aligned: shift each quadword down by one byte and pull the
           missing top byte in from the right-hand neighbour. */
        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            const uint64_t ref_lo = ldq(pix2);
            const uint64_t ref_hi = ldq(pix2 + 8);
            const uint64_t avg_lo = avg2(ref_lo,
                                         (ref_lo >> 8) | (ref_hi << 56));
            const uint64_t avg_hi = avg2(ref_hi,
                                         (ref_hi >> 8) | ((uint64_t) pix2[16] << 56));

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_lo, avg_lo)
                 + perr(cur_hi, avg_hi);
        } while (--h);
    } else if (shift == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           The pixels one to the right start exactly on the next aligned
           quadword, so they are used as loaded.  shift + 1 would be 8
           here; the extract instructions presumably honour only the low
           three bits of the count, so the generic path cannot be used. */
        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            const uint64_t q0     = ldq_u(pix2);
            const uint64_t q1     = ldq_u(pix2 + 8);
            const uint64_t q2     = ldq_u(pix2 + 16);
            const uint64_t avg_lo = avg2(extql(q0, shift) | extqh(q1, shift), q1);
            const uint64_t avg_hi = avg2(extql(q1, shift) | extqh(q2, shift), q2);

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_lo, avg_lo)
                 + perr(cur_hi, avg_hi);
        } while (--h);
    } else {
        /* Misalignment 1..6: extract both the pixels and their right-hand
           neighbours from the same three aligned quadwords. */
        const uint64_t shift1 = shift + 1;

        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            const uint64_t q0     = ldq_u(pix2);
            const uint64_t q1     = ldq_u(pix2 + 8);
            const uint64_t q2     = ldq_u(pix2 + 16);
            const uint64_t avg_lo = avg2(extql(q0, shift)  | extqh(q1, shift),
                                         extql(q0, shift1) | extqh(q1, shift1));
            const uint64_t avg_hi = avg2(extql(q1, shift)  | extqh(q2, shift),
                                         extql(q1, shift1) | extqh(q2, shift1));

            pix1 += line_size;
            pix2 += line_size;

            sad += perr(cur_lo, avg_lo)
                 + perr(cur_hi, avg_hi);
        } while (--h);
    }

    return sad;
}
00237
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * vertical half-pel interpolation of a reference block.
 *
 * Each reference pixel is avg2() of a pix2 row and the row below it, so
 * h + 1 rows of pix2 are read.  The lower row of one iteration is kept
 * and reused as the upper row of the next.
 *
 * @param v         unused
 * @param pix1      current block, read with ldq
 * @param pix2      reference block, any alignment
 * @param line_size distance in bytes between two rows, for both blocks
 * @param h         number of rows; must be at least 1
 * @return accumulated perr() over all rows
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint64_t above_lo, above_hi;
    int sad = 0;

    if (!((size_t) pix2 & 0x7)) {
        above_lo = ldq(pix2);
        above_hi = ldq(pix2 + 8);

        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            uint64_t below_lo, below_hi;

            pix2    += line_size;
            below_lo = ldq(pix2);
            below_hi = ldq(pix2 + 8);

            sad += perr(cur_lo, avg2(above_lo, below_lo))
                 + perr(cur_hi, avg2(above_hi, below_hi));

            pix1    += line_size;
            above_lo = below_lo;
            above_hi = below_hi;
        } while (--h);
    } else {
        /* Misaligned pix2: every 8-byte half is assembled from the two
           aligned quadwords that straddle it. */
        uint64_t mid = ldq_u(pix2 + 8);

        above_lo = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        above_hi = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            const uint64_t cur_lo = ldq(pix1);
            const uint64_t cur_hi = ldq(pix1 + 8);
            uint64_t below_lo, below_hi;

            pix2    += line_size;
            mid      = ldq_u(pix2 + 8);
            below_lo = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
            below_hi = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            sad += perr(cur_lo, avg2(above_lo, below_lo))
                 + perr(cur_hi, avg2(above_hi, below_hi));

            pix1    += line_size;
            above_lo = below_lo;
            above_hi = below_hi;
        } while (--h);
    }

    return sad;
}
00290
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * diagonal (horizontal + vertical) half-pel interpolation of a reference
 * block.
 *
 * Each reference pixel is avg4() of pix2[x], pix2[x + 1] and the same two
 * pixels one row below, so 17 bytes from each of h + 1 rows of pix2
 * contribute.  The lower row of one iteration is kept and reused as the
 * upper row of the next.
 *
 * Exactly h rows of pix1 are read.  The row is loaded at the top of each
 * iteration instead of being prefetched one iteration ahead, so nothing
 * is read at pix1 + h * line_size, which may lie outside the caller's
 * block.
 *
 * @param v         unused
 * @param pix1      current block, read with ldq
 * @param pix2      reference block, any alignment; the alignment is
 *                  re-checked on every row
 * @param line_size distance in bytes between two rows, for both blocks
 * @param h         number of rows; must be at least 1 because the loop
 *                  tests the counter only after the first row
 * @return accumulated perr() over all rows
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    /* Upper reference row: left half, right half, and the 17th pixel
       already positioned in the top byte. */
    uint64_t p2_l, p2_r, p2_x;

    if ((size_t) pix2 & 0x7) {
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        /* Only the lowest byte survives the shift; the rest of the
           quadword is loaded but discarded. */
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Current row of pix1, loaded only when it is about to be used. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Lower reference row. */
        if ((size_t) pix2 & 0x7) {
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        /* The "one pixel to the right" vectors are built by shifting a
           row down one byte and inserting its right-hand neighbour. */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ( p2_r << 56),
                            np2_l, (np2_l >> 8) | (np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) |  p2_x,
                            np2_r, (np2_r >> 8) | np2_x));

        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}