#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
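        /* zero-extend the two new unsigned source pixel vectors to signed 16-bit */ \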
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
        vec_st(fsum, 0, dst);\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
        dst += stride;\
        src += stride;
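/* Two-tap variant of the core: used when x == 0 or y == 0, so the bilinear
 * filter collapses to A*src0 + E*src1 with E = B + C (see vE below). */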
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
        vec_st(fsum, 0, dst);\
        dst += stride;\
        src += stride;
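/* Bias helper meant to be passed as the BIAS2 hook of CHROMA_MC8_ALTIVEC_CORE:
 * adds the 28 rounding constant used by the no-rounding VC-1 variant
 * (the H.264 variant uses a plain pass-through there). */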
#define add28(a) vec_add(v28ss, a)
#ifdef PREFIX_h264_chroma_mc8_altivec
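/* H.264 8x8 chroma motion compensation: bilinear interpolation with weights
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy and rounding term 32,
 * i.e. dst = (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6. */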
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
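    /* Classify source alignment: loadSecond is set when the 8+1 bytes needed
     * per row can spill into the next 16-byte block, reallyBadAlign when src
     * sits on the last byte of a block, where the vec_lvsl(1, src) permute no
     * longer selects the second block and the shifted-by-one vector has to be
     * taken from the second load directly. */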
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
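    /* fperm routes the 8 packed result bytes into whichever half of the
     * aligned 16-byte destination vector dst falls in, keeping the other
     * half of the existing dst bytes unchanged. */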
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
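    /* AltiVec vec_ld ignores the low address bits, so unaligned rows are read
     * by loading the enclosing 16-byte block(s) and extracting the bytes at
     * src and src + 1 with vec_lvsl/vec_perm. */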
    vsrcAuc = vec_ld(0, src);
    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
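    /* Full bilinear path, source row contained in a single 16-byte block:
     * one load per row, permuted into the x and x + 1 pixel vectors. */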
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
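    /* Full bilinear path, source row straddling two 16-byte blocks: load
     * both blocks and merge them. */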
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
            vsrc3uc = vsrcDuc;
        else
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
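    /* Degenerate case: x == 0 or y == 0, so D = xy vanishes along with one of
     * B or C; the two remaining taps are folded into vE = vB + vC and the
     * two-tap core is used instead. */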
    const vec_s16 vE = vec_add(vB, vC);
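    /* x == 0: vertical-only filtering, the second tap is the pixel one row
     * below the current one. */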
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 15, src);
        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
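    /* y == 0: horizontal-only filtering, both taps come from the current row
     * (src and src + 1). */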
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(0, src);
        vsrcDuc = vec_ld(15, src);
        vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
            vsrc1uc = vsrcDuc;
        else
            vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
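/* VC-1 "no rounding" variant of the same 8x8 bilinear chroma filter: same
 * weights (8-x)(8-y), x(8-y), (8-x)y, xy, but a rounding bias of 28
 * (v28ss / add28) instead of 32. */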
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                                 int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);
    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
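    /* Same source-alignment split as in the H.264 function: one load per row
     * when the row fits in a single 16-byte block, two loads merged with
     * vec_perm when it straddles a block boundary. */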
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
    for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
            vsrc3uc = vsrcDuc;
        else
            vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
#undef CHROMA_MC8_ALTIVEC_CORE