/*
 * Excerpt of an inline-asm template followed by part of its operand lists.
 * The enclosing function header, the asm opener and some operand lines are
 * not present in this excerpt (the embedded numbering skips 73 -> 75).
 *
 * Visible data flow: four 8-byte groups are read from src+0x00..0x18, a
 * word is taken from each of the row pointers dst0..dst3, the row bytes are
 * interleaved with zero (widening them to halfwords), the src halfwords are
 * added, and the sums are packed back to bytes and written with a 4-byte
 * unaligned store pair (gsswlc1/gsswrc1) per row.
 */
/* ftmp0 = 0: zero operand for the unpack and pack steps below. */
36 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
37 "ldc1 %[ftmp1], 0x00(%[src]) \n\t"
38 "ldc1 %[ftmp2], 0x08(%[src]) \n\t"
39 "ldc1 %[ftmp3], 0x10(%[src]) \n\t"
40 "ldc1 %[ftmp4], 0x18(%[src]) \n\t"
/* uld reads through a GPR so the row address need not be aligned; only the
 * low 32 bits are then moved into the FPR by mtc1. */
41 "uld %[low32], 0x00(%[dst0]) \n\t"
42 "mtc1 %[low32], %[ftmp5] \n\t"
43 "uld %[low32], 0x00(%[dst1]) \n\t"
44 "mtc1 %[low32], %[ftmp6] \n\t"
45 "uld %[low32], 0x00(%[dst2]) \n\t"
46 "mtc1 %[low32], %[ftmp7] \n\t"
47 "uld %[low32], 0x00(%[dst3]) \n\t"
48 "mtc1 %[low32], %[ftmp8] \n\t"
49 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
50 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
51 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
52 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
53 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
54 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
55 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
56 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
57 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
58 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
59 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
60 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
61 "gsswlc1 %[ftmp1], 0x03(%[dst0]) \n\t"
62 "gsswrc1 %[ftmp1], 0x00(%[dst0]) \n\t"
63 "gsswlc1 %[ftmp2], 0x03(%[dst1]) \n\t"
64 "gsswrc1 %[ftmp2], 0x00(%[dst1]) \n\t"
65 "gsswlc1 %[ftmp3], 0x03(%[dst2]) \n\t"
66 "gsswrc1 %[ftmp3], 0x00(%[dst2]) \n\t"
67 "gsswlc1 %[ftmp4], 0x03(%[dst3]) \n\t"
68 "gsswrc1 %[ftmp4], 0x00(%[dst3]) \n\t"
/* Outputs: nine early-clobber FP scratch registers. */
69 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
70 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
71 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
72 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
73 [ftmp8]
"=&f"(ftmp[8]),
/* Inputs: the four destination rows, each one stride apart.
 * NOTE(review): the low32 and src operands used by the template are not
 * declared in the visible lines; presumably they are on the lines missing
 * from this excerpt -- confirm against the full file. */
75 : [dst0]
"r"(dst), [dst1]
"r"(dst+stride),
76 [dst2]
"r"(dst+2*
stride), [dst3]
"r"(dst+3*stride),
/*
 * Excerpt of an inline-asm template, part of its operand lists, and the
 * statement that follows it.  The function header and asm opener are not in
 * this excerpt, and the embedded numbering skips (168 -> 171, 171 -> 176).
 *
 * Visible data flow: 32 bytes are read from block (four 8-byte groups), run
 * through two rounds of packed halfword add/sub/shift steps separated by a
 * halfword/word interleave, shifted right by 6, added to four 4-byte rows
 * of dst, and stored back with unsigned saturation.  block[0..31] is
 * zeroed inside the template.
 */
/* ftmp8 = shift count 1, ftmp9 = shift count 6 (set via tmp0). */
91 "dli %[tmp0], 0x01 \n\t"
92 "ldc1 %[ftmp0], 0x00(%[block]) \n\t"
93 "mtc1 %[tmp0], %[ftmp8] \n\t"
94 "ldc1 %[ftmp1], 0x08(%[block]) \n\t"
95 "dli %[tmp0], 0x06 \n\t"
96 "ldc1 %[ftmp2], 0x10(%[block]) \n\t"
97 "mtc1 %[tmp0], %[ftmp9] \n\t"
/* First round of add/sub steps on the four loaded groups. */
98 "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t"
99 "ldc1 %[ftmp3], 0x18(%[block]) \n\t"
100 "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t"
101 "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
102 "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
103 "paddh %[ftmp10], %[ftmp2], %[ftmp0] \n\t"
104 "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
105 "paddh %[ftmp11], %[ftmp5], %[ftmp10] \n\t"
106 "psubh %[ftmp2], %[ftmp10], %[ftmp5] \n\t"
107 "paddh %[ftmp10], %[ftmp4], %[ftmp0] \n\t"
108 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
/* Interleave halfwords, then words, across the four result registers
 * (looks like a 4x4 halfword transpose -- TODO confirm). */
109 "punpckhhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t"
110 "punpcklhw %[ftmp5], %[ftmp11], %[ftmp10] \n\t"
111 "punpckhhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t"
112 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
113 "punpckhwd %[ftmp2], %[ftmp5], %[ftmp0] \n\t"
114 "punpcklwd %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
115 "punpcklwd %[ftmp10], %[ftmp1], %[ftmp4] \n\t"
116 "punpckhwd %[ftmp0], %[ftmp1], %[ftmp4] \n\t"
/* ff_pw_32 is an operand declared outside this excerpt; presumably the
 * rounding bias for the >>6 applied further down -- confirm. */
117 "paddh %[ftmp5], %[ftmp5], %[ff_pw_32] \n\t"
/* Second round of add/sub steps. */
118 "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
119 "psrah %[ftmp3], %[ftmp0], %[ftmp8] \n\t"
120 "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
121 "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
122 "paddh %[ftmp1], %[ftmp10], %[ftmp5] \n\t"
123 "psubh %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
124 "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t"
125 "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
126 "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t"
/* ftmp7 = 0: used both to clear block and as the unpack/pack zero. */
127 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
128 "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
/* Clear the 32 bytes of block that were consumed above. */
129 "sdc1 %[ftmp7], 0x00(%[block]) \n\t"
130 "sdc1 %[ftmp7], 0x08(%[block]) \n\t"
131 "sdc1 %[ftmp7], 0x10(%[block]) \n\t"
132 "sdc1 %[ftmp7], 0x18(%[block]) \n\t"
/* Rows 0 and 1: row at dst via uld/mtc1, row at dst+stride via the indexed
 * load gslwxc1; add the >>6 results and store back saturated. */
133 "uld %[low32], 0x00(%[dst]) \n\t"
134 "mtc1 %[low32], %[ftmp2] \n\t"
135 "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t"
136 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
137 "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t"
138 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
139 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
140 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
141 "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
142 "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
143 "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
144 "gsswlc1 %[ftmp2], 0x03(%[dst]) \n\t"
145 "gsswrc1 %[ftmp2], 0x00(%[dst]) \n\t"
146 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/* dst += 2 * stride, then the same sequence for rows 2 and 3. */
147 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
148 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
149 "uld %[low32], 0x00(%[dst]) \n\t"
150 "mtc1 %[low32], %[ftmp2] \n\t"
151 "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
152 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
153 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
154 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
155 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
156 "paddh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
157 "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
158 "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
159 "gsswlc1 %[ftmp2], 0x03(%[dst]) \n\t"
160 "gsswrc1 %[ftmp2], 0x00(%[dst]) \n\t"
161 "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
162 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/* Outputs: twelve early-clobber FP scratch registers. */
163 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
164 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
165 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
166 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
167 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
168 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
/* NOTE(review): the line below starts with ':' after the output list, so
 * dst appears to be declared as a plain input ("r"), yet the template
 * modifies %[dst] with PTR_ADDU.  GCC requires operands that the asm writes
 * to be outputs (e.g. "+r"/"+&r"); confirm against the full operand list,
 * part of which is missing from this excerpt. */
171 : [dst]
"r"(dst), [block]
"r"(block),
/* Zeroes the same 32 bytes the four sdc1 stores in the template already
 * cleared. */
176 memset(block, 0, 32);
/*
 * First part of a long inline-asm template (function header and asm opener
 * are not in this excerpt; the embedded numbering skips 187 -> 190).
 *
 * Visible data flow: the 8-byte groups at block+0x00, 0x10, ..., 0x70 are
 * combined with packed halfword add/sub/shift steps, the eight results are
 * rearranged with halfword/word interleaves, and the rearranged values are
 * parked in memory at offsets of $29 and in the integer registers
 * tmp1..tmp4 for use later in the template.
 *
 * NOTE(review): $29 is the stack pointer in the MIPS ABIs; no adjustment of
 * $29 is visible in this excerpt, so whether the 0x00..0x18($29) area is
 * reserved cannot be told from here -- check the missing lines.
 */
/* As excerpted, the halfword read here is written back unchanged by the
 * "sh" two lines below; whatever modified tmp0 in between is presumably on
 * the lines missing from the excerpt. */
187 "lhu %[tmp0], 0x00(%[block]) \n\t"
190 "ldc1 %[ftmp1], 0x10(%[block]) \n\t"
191 "sh %[tmp0], 0x00(%[block]) \n\t"
192 "ldc1 %[ftmp2], 0x20(%[block]) \n\t"
/* ftmp8 = shift count 1. */
193 "dli %[tmp0], 0x01 \n\t"
194 "ldc1 %[ftmp3], 0x30(%[block]) \n\t"
195 "mtc1 %[tmp0], %[ftmp8] \n\t"
196 "ldc1 %[ftmp5], 0x50(%[block]) \n\t"
197 "ldc1 %[ftmp6], 0x60(%[block]) \n\t"
198 "ldc1 %[ftmp7], 0x70(%[block]) \n\t"
/* Combine the groups loaded from 0x10, 0x30, 0x50 and 0x70. */
199 "mov.d %[ftmp0], %[ftmp1] \n\t"
200 "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
201 "psrah %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
202 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
203 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
204 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
205 "paddh %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
206 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
207 "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
208 "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
209 "psubh %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
210 "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
211 "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
212 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
213 "psrah %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
214 "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
/* ftmp9 = shift count 2. */
215 "dli %[tmp0], 0x02 \n\t"
216 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
217 "mtc1 %[tmp0], %[ftmp9] \n\t"
218 "mov.d %[ftmp7], %[ftmp1] \n\t"
219 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
220 "psrah %[ftmp3], %[ftmp4], %[ftmp9] \n\t"
221 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
222 "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
223 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
224 "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
225 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
226 "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
/* Combine the groups loaded from 0x20 and 0x60, then 0x00 and 0x40. */
227 "mov.d %[ftmp5], %[ftmp6] \n\t"
228 "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
229 "psrah %[ftmp4], %[ftmp2], %[ftmp8] \n\t"
230 "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
231 "psubh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
232 "ldc1 %[ftmp2], 0x00(%[block]) \n\t"
233 "ldc1 %[ftmp5], 0x40(%[block]) \n\t"
234 "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
235 "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
236 "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
237 "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
238 "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
239 "paddh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
240 "psubh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
/* Each "x = x + y; y = 2*y - x" pair below yields the sum in x and the
 * difference (old y - old x) in y without needing an extra register. */
241 "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
242 "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
243 "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
244 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
245 "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
246 "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
247 "paddh %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
248 "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
249 "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
250 "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
251 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
252 "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
253 "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
/* All eight FP temporaries used so far are live: spill ftmp6 to block+0x00
 * and reload it (into ftmp0) a few lines below. */
254 "sdc1 %[ftmp6], 0x00(%[block]) \n\t"
255 "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
/* Halfword then word interleave of ftmp7/ftmp0/ftmp3/ftmp1. */
256 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
257 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
258 "punpckhhw %[ftmp0], %[ftmp3], %[ftmp1] \n\t"
259 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
260 "punpckhwd %[ftmp1], %[ftmp7], %[ftmp3] \n\t"
261 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
262 "punpckhwd %[ftmp3], %[ftmp6], %[ftmp0] \n\t"
263 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
264 "ldc1 %[ftmp0], 0x00(%[block]) \n\t"
/* Park the interleaved values: two in memory off $29, two in GPRs. */
265 "sdc1 %[ftmp7], 0x00($29) \n\t"
266 "sdc1 %[ftmp1], 0x10($29) \n\t"
267 "dmfc1 %[tmp1], %[ftmp6] \n\t"
268 "dmfc1 %[tmp3], %[ftmp3] \n\t"
/* Same interleave for ftmp5/ftmp2/ftmp4/ftmp0. */
269 "punpckhhw %[ftmp3], %[ftmp5], %[ftmp2] \n\t"
270 "punpcklhw %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
271 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t"
272 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
273 "punpckhwd %[ftmp0], %[ftmp5], %[ftmp4] \n\t"
274 "punpcklwd %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
275 "punpckhwd %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
276 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
277 "sdc1 %[ftmp5], 0x08($29) \n\t"
278 "sdc1 %[ftmp0], 0x18($29) \n\t"
279 "dmfc1 %[tmp2], %[ftmp3] \n\t"
280 "dmfc1 %[tmp4], %[ftmp4] \n\t"
/*
 * Continuation of the same inline-asm template.  This part repeats the
 * packed add/sub/shift sequence on the 8-byte groups at block+0x08, 0x18,
 * ..., 0x78 (the upper 8 bytes of each 16-byte step), interleaves the
 * results, and parks them in tmp5..tmp7 and ftmp11..ftmp15.
 * ftmp8 and ftmp9 are used as shift counts; their setup is not within
 * these lines.
 */
281 "ldc1 %[ftmp1], 0x18(%[block]) \n\t"
282 "ldc1 %[ftmp6], 0x28(%[block]) \n\t"
283 "ldc1 %[ftmp2], 0x38(%[block]) \n\t"
284 "ldc1 %[ftmp0], 0x58(%[block]) \n\t"
285 "ldc1 %[ftmp3], 0x68(%[block]) \n\t"
286 "ldc1 %[ftmp4], 0x78(%[block]) \n\t"
/* Combine the groups loaded from 0x18, 0x38, 0x58 and 0x78. */
287 "mov.d %[ftmp7], %[ftmp1] \n\t"
288 "psrah %[ftmp5], %[ftmp0], %[ftmp8] \n\t"
289 "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
290 "paddh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
291 "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
292 "paddh %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
293 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
294 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
295 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
296 "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
297 "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
298 "psrah %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
299 "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
300 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
301 "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
302 "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
303 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
304 "mov.d %[ftmp4], %[ftmp1] \n\t"
305 "psrah %[ftmp2], %[ftmp5], %[ftmp9] \n\t"
306 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
307 "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
308 "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
309 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
310 "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
311 "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
312 "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* Combine the groups loaded from 0x28 and 0x68, then 0x08 and 0x48. */
313 "mov.d %[ftmp0], %[ftmp3] \n\t"
314 "psrah %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
315 "psrah %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
316 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
317 "psubh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
318 "ldc1 %[ftmp6], 0x08(%[block]) \n\t"
319 "ldc1 %[ftmp0], 0x48(%[block]) \n\t"
320 "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
321 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
322 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
323 "psubh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
324 "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
325 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
326 "psubh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
327 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
328 "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
329 "psubh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
330 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
331 "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
332 "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
333 "paddh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
334 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
335 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
336 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
337 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
338 "psubh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
339 "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
/* Spill ftmp3 to block+0x08; it is reloaded into ftmp7 after the first
 * group of interleaves. */
340 "sdc1 %[ftmp3], 0x08(%[block]) \n\t"
341 "psubh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* Halfword then word interleave of ftmp4/ftmp7/ftmp2/ftmp1. */
342 "punpckhhw %[ftmp3], %[ftmp4], %[ftmp7] \n\t"
343 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
344 "punpckhhw %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
345 "punpcklhw %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
346 "punpckhwd %[ftmp1], %[ftmp4], %[ftmp2] \n\t"
347 "punpcklwd %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
348 "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t"
349 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
350 "ldc1 %[ftmp7], 0x08(%[block]) \n\t"
/* Park the four interleaved values in tmp5, tmp7, ftmp12 and ftmp14. */
351 "dmfc1 %[tmp5], %[ftmp4] \n\t"
352 "dmfc1 %[tmp7], %[ftmp1] \n\t"
353 "mov.d %[ftmp12], %[ftmp3] \n\t"
354 "mov.d %[ftmp14], %[ftmp2] \n\t"
/* Same interleave for ftmp0/ftmp6/ftmp5/ftmp7. */
355 "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t"
356 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
357 "punpckhhw %[ftmp6], %[ftmp5], %[ftmp7] \n\t"
358 "punpcklhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
359 "punpckhwd %[ftmp7], %[ftmp0], %[ftmp5] \n\t"
360 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
361 "punpckhwd %[ftmp5], %[ftmp2], %[ftmp6] \n\t"
362 "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Park these in tmp6, ftmp11, ftmp13 and ftmp15. */
363 "dmfc1 %[tmp6], %[ftmp0] \n\t"
364 "mov.d %[ftmp11], %[ftmp7] \n\t"
365 "mov.d %[ftmp13], %[ftmp2] \n\t"
366 "mov.d %[ftmp15], %[ftmp5] \n\t"
/*
 * Continuation of the same inline-asm template.  This part restores values
 * parked earlier (tmp1, tmp3, tmp5, tmp7, ftmp12, ftmp14 and the slots at
 * 0x00/0x10 off $29), applies the packed add/sub/shift sequence to them,
 * zeroes 128 bytes of block, and adds the results (shifted right by 6) to
 * eight 4-byte rows starting at dst, advancing dst by 2*stride after each
 * pair of rows.  ftmp8 and ftmp9 are shift counts set earlier in the
 * template.
 */
368 "dmtc1 %[tmp7], %[ftmp7] \n\t"
369 "dmtc1 %[tmp3], %[ftmp6] \n\t"
370 "ldc1 %[ftmp1], 0x10($29) \n\t"
371 "dmtc1 %[tmp1], %[ftmp3] \n\t"
372 "mov.d %[ftmp4], %[ftmp1] \n\t"
373 "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
374 "psrah %[ftmp0], %[ftmp7], %[ftmp8] \n\t"
375 "paddh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
376 "paddh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
377 "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
378 "paddh %[ftmp0], %[ftmp0], %[ftmp14] \n\t"
379 "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
380 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
381 "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
382 "psubh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
383 "psrah %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
384 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
385 "psubh %[ftmp7], %[ftmp7], %[ftmp14] \n\t"
386 "psrah %[ftmp5], %[ftmp14], %[ftmp8] \n\t"
387 "psubh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
388 "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
389 "mov.d %[ftmp5], %[ftmp1] \n\t"
390 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
391 "psrah %[ftmp6], %[ftmp0], %[ftmp9] \n\t"
392 "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
393 "paddh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
394 "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
395 "psrah %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
396 "psubh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
397 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
398 "mov.d %[ftmp7], %[ftmp12] \n\t"
399 "psrah %[ftmp2], %[ftmp12], %[ftmp8] \n\t"
400 "psrah %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
401 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
402 "psubh %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
403 "ldc1 %[ftmp3], 0x00($29) \n\t"
404 "dmtc1 %[tmp5], %[ftmp7] \n\t"
405 "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
406 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
407 "paddh %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
408 "psubh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
409 "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
410 "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
411 "psubh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
412 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
413 "paddh %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
414 "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
415 "paddh %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
416 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
417 "psubh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
418 "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
419 "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
420 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
421 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
422 "paddh %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
423 "psubh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
424 "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
/* Three results are parked (two slots off $29, one in tmp1) so that ftmp2,
 * ftmp3 and ftmp0 can be reused for the row updates below. */
425 "sdc1 %[ftmp3], 0x00($29) \n\t"
426 "psubh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
427 "sdc1 %[ftmp0], 0x10($29) \n\t"
428 "dmfc1 %[tmp1], %[ftmp2] \n\t"
/* ftmp2 = 0, then clear block+0x00..0x7f (16 stores of 8 bytes). */
429 "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t"
430 "sdc1 %[ftmp2], 0x00(%[block]) \n\t"
431 "sdc1 %[ftmp2], 0x08(%[block]) \n\t"
432 "sdc1 %[ftmp2], 0x10(%[block]) \n\t"
433 "sdc1 %[ftmp2], 0x18(%[block]) \n\t"
434 "sdc1 %[ftmp2], 0x20(%[block]) \n\t"
435 "sdc1 %[ftmp2], 0x28(%[block]) \n\t"
436 "sdc1 %[ftmp2], 0x30(%[block]) \n\t"
437 "sdc1 %[ftmp2], 0x38(%[block]) \n\t"
438 "sdc1 %[ftmp2], 0x40(%[block]) \n\t"
439 "sdc1 %[ftmp2], 0x48(%[block]) \n\t"
440 "sdc1 %[ftmp2], 0x50(%[block]) \n\t"
441 "sdc1 %[ftmp2], 0x58(%[block]) \n\t"
442 "sdc1 %[ftmp2], 0x60(%[block]) \n\t"
443 "sdc1 %[ftmp2], 0x68(%[block]) \n\t"
444 "sdc1 %[ftmp2], 0x70(%[block]) \n\t"
445 "sdc1 %[ftmp2], 0x78(%[block]) \n\t"
/* ftmp10 = shift count 6 (set via tmp3).  First pair of rows: dst and
 * dst+stride. */
446 "dli %[tmp3], 0x06 \n\t"
447 "uld %[low32], 0x00(%[dst]) \n\t"
448 "mtc1 %[low32], %[ftmp3] \n\t"
449 "mtc1 %[tmp3], %[ftmp10] \n\t"
450 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
451 "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
452 "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
453 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
454 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
455 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
456 "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
457 "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
458 "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
459 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t"
460 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t"
461 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/* dst += 2 * stride; second pair of rows. */
462 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
463 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
464 "uld %[low32], 0x00(%[dst]) \n\t"
465 "mtc1 %[low32], %[ftmp3] \n\t"
466 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
467 "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
468 "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
469 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
470 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
471 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
472 "paddh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
473 "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
474 "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
475 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t"
476 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t"
477 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/* Bring back the three parked results for the remaining four rows. */
478 "ldc1 %[ftmp5], 0x00($29) \n\t"
479 "ldc1 %[ftmp4], 0x10($29) \n\t"
480 "dmtc1 %[tmp1], %[ftmp6] \n\t"
/* dst += 2 * stride; third pair of rows. */
481 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
482 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
483 "uld %[low32], 0x00(%[dst]) \n\t"
484 "mtc1 %[low32], %[ftmp3] \n\t"
485 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
486 "psrah %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
487 "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
488 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
489 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
490 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
491 "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t"
492 "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
493 "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
494 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t"
495 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t"
496 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/* dst += 2 * stride; fourth pair of rows. */
497 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
498 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
499 "uld %[low32], 0x00(%[dst]) \n\t"
500 "mtc1 %[low32], %[ftmp3] \n\t"
501 "gslwxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
502 "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
503 "psrah %[ftmp6], %[ftmp6], %[ftmp10] \n\t"
504 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
505 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
506 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
507 "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
508 "packushb %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
509 "packushb %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
510 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t"
511 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t"
512 "gsswxc1 %[ftmp0], 0x00(%[dst], %[stride]) \n\t"
/*
 * Final part of the same inline-asm template, its visible operand lists,
 * and the statement that follows.  This part restores the remaining parked
 * values (tmp2, tmp4, tmp6, ftmp11, ftmp13, ftmp15 and the slots at
 * 0x08/0x18 off $29), applies the packed add/sub/shift sequence, and adds
 * the results (shifted right by the count in ftmp10) to eight 4-byte rows
 * addressed through addr0, advancing addr0 by 2*stride after each pair.
 *
 * NOTE(review): addr0 is declared below as an early-clobber output
 * ("=&r") and is read by the first uld in this part, but no visible line
 * assigns it; the assignment is presumably on a line missing from this
 * excerpt (the embedded numbering has gaps) -- confirm.
 */
513 "dmtc1 %[tmp4], %[ftmp1] \n\t"
514 "dmtc1 %[tmp2], %[ftmp6] \n\t"
515 "ldc1 %[ftmp4], 0x18($29) \n\t"
516 "mov.d %[ftmp5], %[ftmp4] \n\t"
517 "psrah %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
518 "psrah %[ftmp7], %[ftmp11], %[ftmp8] \n\t"
519 "paddh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
520 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
521 "paddh %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
522 "paddh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
523 "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
524 "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
525 "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
526 "psubh %[ftmp3], %[ftmp11], %[ftmp1] \n\t"
527 "psrah %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
528 "paddh %[ftmp5], %[ftmp5], %[ftmp15] \n\t"
529 "psubh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
530 "psrah %[ftmp2], %[ftmp15], %[ftmp8] \n\t"
531 "psubh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
532 "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
533 "mov.d %[ftmp2], %[ftmp4] \n\t"
534 "psrah %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
535 "psrah %[ftmp1], %[ftmp7], %[ftmp9] \n\t"
536 "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
537 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
538 "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
539 "psrah %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
540 "psubh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
541 "psubh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
542 "mov.d %[ftmp3], %[ftmp13] \n\t"
543 "psrah %[ftmp0], %[ftmp13], %[ftmp8] \n\t"
544 "psrah %[ftmp7], %[ftmp6], %[ftmp8] \n\t"
545 "paddh %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
546 "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
547 "ldc1 %[ftmp6], 0x08($29) \n\t"
548 "dmtc1 %[tmp6], %[ftmp3] \n\t"
549 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
550 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
551 "paddh %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
552 "psubh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
553 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
554 "paddh %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
555 "psubh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
556 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
557 "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
558 "psubh %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
559 "paddh %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
560 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
561 "psubh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
562 "paddh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
563 "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
564 "psubh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
565 "paddh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
566 "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
567 "psubh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
568 "paddh %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
/* Park three results (two slots off $29, one in tmp2) so ftmp0, ftmp6 and
 * ftmp7 can be reused for the row updates below. */
569 "sdc1 %[ftmp6], 0x08($29) \n\t"
570 "psubh %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
571 "sdc1 %[ftmp7], 0x18($29) \n\t"
572 "dmfc1 %[tmp2], %[ftmp0] \n\t"
/* ftmp0 = 0: zero operand for unpack/pack.  First pair of rows. */
573 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
574 "uld %[low32], 0x00(%[addr0]) \n\t"
575 "mtc1 %[low32], %[ftmp6] \n\t"
576 "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
577 "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
578 "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
579 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
580 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
581 "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
582 "paddh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
583 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
584 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
585 "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t"
586 "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
587 "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
/* addr0 += 2 * stride; second pair of rows. */
588 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
589 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
590 "uld %[low32], 0x00(%[addr0]) \n\t"
591 "mtc1 %[low32], %[ftmp6] \n\t"
592 "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
593 "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
594 "psrah %[ftmp4], %[ftmp4], %[ftmp10] \n\t"
595 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
596 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
597 "paddh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
598 "paddh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
599 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
600 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
601 "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t"
602 "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
603 "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
/* Bring back the three parked results; addr0 += 2 * stride; third pair. */
604 "ldc1 %[ftmp2], 0x08($29) \n\t"
605 "ldc1 %[ftmp5], 0x18($29) \n\t"
606 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
607 "dmtc1 %[tmp2], %[ftmp1] \n\t"
608 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
609 "uld %[low32], 0x00(%[addr0]) \n\t"
610 "mtc1 %[low32], %[ftmp6] \n\t"
611 "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
612 "psrah %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
613 "psrah %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
614 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
615 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
616 "paddh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
617 "paddh %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
618 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
619 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
620 "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t"
621 "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
622 "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
/* addr0 += 2 * stride; fourth pair of rows. */
623 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
624 PTR_ADDU "%[addr0], %[addr0], %[stride] \n\t"
625 "uld %[low32], 0x00(%[addr0]) \n\t"
626 "mtc1 %[low32], %[ftmp6] \n\t"
627 "gslwxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
628 "psrah %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
629 "psrah %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
630 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
631 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
632 "paddh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
633 "paddh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
634 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
635 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
636 "gsswlc1 %[ftmp6], 0x03(%[addr0]) \n\t"
637 "gsswrc1 %[ftmp6], 0x00(%[addr0]) \n\t"
638 "gsswxc1 %[ftmp7], 0x00(%[addr0], %[stride]) \n\t"
/* Outputs: sixteen FP scratch registers, eight integer scratch registers
 * and the addr0 pointer, all early-clobber. */
640 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
641 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
642 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
643 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
644 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
645 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
646 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
647 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
648 [tmp0]
"=&r"(tmp[0]), [tmp1]
"=&r"(tmp[1]),
649 [tmp2]
"=&r"(tmp[2]), [tmp3]
"=&r"(tmp[3]),
650 [tmp4]
"=&r"(tmp[4]), [tmp5]
"=&r"(tmp[5]),
651 [tmp6]
"=&r"(tmp[6]), [tmp7]
"=&r"(tmp[7]),
652 [addr0]
"=&r"(addr[0]),
/* NOTE(review): the line below starts with ':' after the output list, so
 * dst appears to be a plain input ("r"), yet the template modifies %[dst]
 * with PTR_ADDU.  GCC requires operands the asm writes to be outputs
 * (e.g. "+r"/"+&r"); confirm against the full operand list, part of which
 * is missing from this excerpt. */
654 : [dst]
"r"(dst), [block]
"r"(block),
/* Zeroes the same 128 bytes that the sixteen sdc1 stores to block inside
 * the template already cleared. */
659 memset(block, 0, 128);
/*
 * Excerpt: a DC value computation followed by an inline-asm template and
 * part of its operand lists (function header and asm opener are not in
 * this excerpt; the embedded numbering skips 704 -> 706).
 *
 * dc is block[0] rounded and scaled: (block[0] + 32) >> 6.  The template
 * replicates dc into every halfword of ftmp5 and adds it, with signed
 * saturation, to four 4-byte rows (dst0..dst3), storing each row back with
 * unsigned byte saturation.
 */
664 int dc = (block[0] + 32) >> 6;
671 "mtc1 %[dc], %[ftmp5] \n\t"
/* ftmp0 = 0: serves as the pshufh selector (every lane picks halfword 0)
 * and as the zero operand for unpack/pack. */
672 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
673 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* Load one word per row through a GPR (no alignment requirement on the
 * row address), then move the low 32 bits into an FPR. */
674 "uld %[low32], 0x00(%[dst0]) \n\t"
675 "mtc1 %[low32], %[ftmp1] \n\t"
676 "uld %[low32], 0x00(%[dst1]) \n\t"
677 "mtc1 %[low32], %[ftmp2] \n\t"
678 "uld %[low32], 0x00(%[dst2]) \n\t"
679 "mtc1 %[low32], %[ftmp3] \n\t"
680 "uld %[low32], 0x00(%[dst3]) \n\t"
681 "mtc1 %[low32], %[ftmp4] \n\t"
/* Widen bytes to halfwords, add dc, pack back to bytes. */
682 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
683 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
684 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
685 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
686 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
687 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
688 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
689 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
690 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
691 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
692 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
693 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* 4-byte unaligned store pair per row. */
694 "gsswlc1 %[ftmp1], 0x03(%[dst0]) \n\t"
695 "gsswrc1 %[ftmp1], 0x00(%[dst0]) \n\t"
696 "gsswlc1 %[ftmp2], 0x03(%[dst1]) \n\t"
697 "gsswrc1 %[ftmp2], 0x00(%[dst1]) \n\t"
698 "gsswlc1 %[ftmp3], 0x03(%[dst2]) \n\t"
699 "gsswrc1 %[ftmp3], 0x00(%[dst2]) \n\t"
700 "gsswlc1 %[ftmp4], 0x03(%[dst3]) \n\t"
701 "gsswrc1 %[ftmp4], 0x00(%[dst3]) \n\t"
/* Outputs: six early-clobber FP scratch registers. */
702 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
703 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
704 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
/* Inputs: the four destination rows, one stride apart.
 * NOTE(review): the dc and low32 operands used by the template are not
 * declared in the visible lines; presumably they are on lines missing from
 * this excerpt -- confirm. */
706 : [dst0]
"r"(dst), [dst1]
"r"(dst+
stride),
707 [dst2]
"r"(dst+2*stride), [dst3]
"r"(dst+3*
stride),
/*
 * Excerpt: a DC value computation followed by an inline-asm template and
 * part of its operand lists (function header and asm opener are not in
 * this excerpt).
 *
 * dc is (block[0] + 32) >> 6.  The template replicates dc into every
 * halfword of ftmp5 and adds it, with signed saturation, to eight rows of
 * eight bytes (dst0..dst7), handled as two batches of four rows.  Rows are
 * read and written with ldc1/sdc1 (64-bit FPR load/store).
 */
715 int dc = (block[0] + 32) >> 6;
721 "mtc1 %[dc], %[ftmp5] \n\t"
/* ftmp0 = 0: pshufh selector (every lane picks halfword 0) and zero
 * operand for the unpacks. */
722 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
723 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
/* Batch 1: rows dst0..dst3. */
724 "ldc1 %[ftmp1], 0x00(%[dst0]) \n\t"
725 "ldc1 %[ftmp2], 0x00(%[dst1]) \n\t"
726 "ldc1 %[ftmp3], 0x00(%[dst2]) \n\t"
727 "ldc1 %[ftmp4], 0x00(%[dst3]) \n\t"
/* Split each row into high (ftmp6..9) and low (ftmp1..4) halves, widened
 * to halfwords. */
728 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
729 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
730 "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
731 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
732 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
733 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
734 "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
735 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
736 "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
737 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
738 "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
739 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
740 "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
741 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
742 "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
743 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/* Re-join low and high halves into eight saturated bytes per row. */
744 "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
745 "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
746 "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
747 "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
748 "sdc1 %[ftmp1], 0x00(%[dst0]) \n\t"
749 "sdc1 %[ftmp2], 0x00(%[dst1]) \n\t"
750 "sdc1 %[ftmp3], 0x00(%[dst2]) \n\t"
751 "sdc1 %[ftmp4], 0x00(%[dst3]) \n\t"
/* Batch 2: identical sequence for rows dst4..dst7. */
753 "ldc1 %[ftmp1], 0x00(%[dst4]) \n\t"
754 "ldc1 %[ftmp2], 0x00(%[dst5]) \n\t"
755 "ldc1 %[ftmp3], 0x00(%[dst6]) \n\t"
756 "ldc1 %[ftmp4], 0x00(%[dst7]) \n\t"
757 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
758 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
759 "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
760 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
761 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t"
762 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
763 "punpckhbh %[ftmp9], %[ftmp4], %[ftmp0] \n\t"
764 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
765 "paddsh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
766 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
767 "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
768 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
769 "paddsh %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
770 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
771 "paddsh %[ftmp9], %[ftmp9], %[ftmp5] \n\t"
772 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
773 "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
774 "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
775 "packushb %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
776 "packushb %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
777 "sdc1 %[ftmp1], 0x00(%[dst4]) \n\t"
778 "sdc1 %[ftmp2], 0x00(%[dst5]) \n\t"
779 "sdc1 %[ftmp3], 0x00(%[dst6]) \n\t"
780 "sdc1 %[ftmp4], 0x00(%[dst7]) \n\t"
/* Outputs: ten early-clobber FP scratch registers. */
781 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
782 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
783 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
784 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
785 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9])
/* Inputs: the eight destination rows, one stride apart.  The dc operand
 * used by the template is not declared in the visible lines. */
786 : [dst0]
"r"(dst), [dst1]
"r"(dst+stride),
787 [dst2]
"r"(dst+2*
stride), [dst3]
"r"(dst+3*stride),
788 [dst4]
"r"(dst+4*
stride), [dst5]
"r"(dst+5*stride),
789 [dst6]
"r"(dst+6*
stride), [dst7]
"r"(dst+7*stride),
/*
 * Scattered lines from several dispatch loops; the enclosing function
 * headers, the called function names and the closing braces are not part of
 * this excerpt (the embedded numbering has large gaps), so only the visible
 * conditions and argument tails are described.
 *
 * Common pattern: a per-index count is looked up as nnzc[scan8[i]], and the
 * first coefficient of sub-block i is read as ((int16_t*)block)[i*16].  The
 * visible call tails pass block + i*16 and stride.
 */
800 int nnz = nnzc[
scan8[i] ];
/* Taken when exactly one coefficient is flagged and the coefficient at
 * index i*16 is non-zero. */
802 if(nnz==1 && ((int16_t*)block)[i*16])
/* Alternative branch keyed only on the coefficient at index i*16. */
819 else if(((int16_t*)block)[i*16])
/* Loop visiting i = 0, 4, 8, 12. */
829 for(i=0; i<16; i+=4){
830 int nnz = nnzc[
scan8[i] ];
832 if(nnz==1 && ((int16_t*)block)[i*16])
834 block + i*16, stride);
/* Loops over four consecutive indices starting at j*16; j comes from an
 * enclosing loop that is not visible here. */
847 for(i=j*16; i<j*16+4; i++){
850 block + i*16, stride);
851 else if(((int16_t*)block)[i*16])
853 block + i*16, stride);
864 for(i=j*16; i<j*16+4; i++){
867 block + i*16, stride);
868 else if(((int16_t*)block)[i*16])
870 block + i*16, stride);
/* Next four indices (j*16+4 .. j*16+7); note the count lookup uses
 * scan8[i+4] rather than scan8[i]. */
875 for(i=j*16+4; i<j*16+8; i++){
876 if(nnzc[
scan8[i+4] ])
878 block + i*16, stride);
879 else if(((int16_t*)block)[i*16])
881 block + i*16, stride);
893 ".set noreorder \n\t"
894 "dli %[tmp0], 0x08 \n\t"
895 "ldc1 %[ftmp3], 0x18(%[input]) \n\t"
896 "mtc1 %[tmp0], %[ftmp8] \n\t"
897 "ldc1 %[ftmp2], 0x10(%[input]) \n\t"
898 "dli %[tmp0], 0x20 \n\t"
899 "ldc1 %[ftmp1], 0x08(%[input]) \n\t"
900 "mtc1 %[tmp0], %[ftmp9] \n\t"
901 "ldc1 %[ftmp0], 0x00(%[input]) \n\t"
902 "mov.d %[ftmp4], %[ftmp3] \n\t"
903 "paddh %[ftmp3], %[ftmp3], %[ftmp2] \n\t"
904 "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
905 "mov.d %[ftmp4], %[ftmp1] \n\t"
906 "paddh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
907 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
908 "mov.d %[ftmp4], %[ftmp3] \n\t"
909 "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
910 "psubh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
911 "mov.d %[ftmp4], %[ftmp2] \n\t"
912 "paddh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
913 "psubh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
914 "mov.d %[ftmp4], %[ftmp3] \n\t"
915 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
916 "punpckhhw %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
917 "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
918 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
919 "punpckhwd %[ftmp2], %[ftmp3], %[ftmp0] \n\t"
920 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
921 "mov.d %[ftmp0], %[ftmp4] \n\t"
922 "punpcklwd %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
923 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
924 "mov.d %[ftmp1], %[ftmp0] \n\t"
925 "paddh %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
926 "psubh %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
927 "mov.d %[ftmp1], %[ftmp2] \n\t"
928 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
929 "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
930 "mov.d %[ftmp1], %[ftmp0] \n\t"
931 "paddh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
932 "psubh %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
933 "mov.d %[ftmp1], %[ftmp4] \n\t"
934 "daddi %[tmp0], %[qmul], -0x7fff \n\t"
935 "paddh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
936 "bgtz %[tmp0], 1f \n\t"
937 "psubh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
938 "ori %[tmp0], $0, 0x80 \n\t"
939 "dsll %[tmp0], %[tmp0], 0x10 \n\t"
940 "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
941 "daddu %[qmul], %[qmul], %[tmp0] \n\t"
942 "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
943 "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
944 "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
945 "mtc1 %[qmul], %[ftmp7] \n\t"
946 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
947 "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
948 "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
949 "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
950 "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
951 "psraw %[ftmp0], %[ftmp0], %[ftmp8] \n\t"
952 "psraw %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
953 "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
954 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
955 "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
956 "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
957 "dmfc1 %[tmp1], %[ftmp0] \n\t"
958 "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
959 "mfc1 %[input], %[ftmp0] \n\t"
960 "sh %[tmp1], 0x00(%[output]) \n\t"
961 "sh %[input], 0x80(%[output]) \n\t"
962 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
963 PTR_SRL "%[input], %[input], 0x10 \n\t"
964 "sh %[tmp1], 0x20(%[output]) \n\t"
965 "sh %[input], 0xa0(%[output]) \n\t"
966 "dmfc1 %[tmp1], %[ftmp2] \n\t"
967 "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
968 "mfc1 %[input], %[ftmp2] \n\t"
969 "sh %[tmp1], 0x40(%[output]) \n\t"
970 "sh %[input], 0xc0(%[output]) \n\t"
971 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
972 PTR_SRL "%[input], %[input], 0x10 \n\t"
973 "sh %[tmp1], 0x60(%[output]) \n\t"
974 "sh %[input], 0xe0(%[output]) \n\t"
975 "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
976 "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
977 "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
978 "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
979 "mtc1 %[qmul], %[ftmp7] \n\t"
980 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
981 "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
982 "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
983 "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
984 "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
985 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
986 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
987 "psraw %[ftmp1], %[ftmp1], %[ftmp8] \n\t"
988 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
989 "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
990 "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
991 "dmfc1 %[tmp1], %[ftmp3] \n\t"
992 "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
993 "mfc1 %[input], %[ftmp3] \n\t"
994 "sh %[tmp1], 0x100(%[output]) \n\t"
995 "sh %[input], 0x180(%[output]) \n\t"
996 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
997 PTR_SRL "%[input], %[input], 0x10 \n\t"
998 "sh %[tmp1], 0x120(%[output]) \n\t"
999 "sh %[input], 0x1a0(%[output]) \n\t"
1000 "dmfc1 %[tmp1], %[ftmp4] \n\t"
1001 "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1002 "mfc1 %[input], %[ftmp4] \n\t"
1003 "sh %[tmp1], 0x140(%[output]) \n\t"
1004 "sh %[input], 0x1c0(%[output]) \n\t"
1005 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1006 PTR_SRL "%[input], %[input], 0x10 \n\t"
1007 "sh %[tmp1], 0x160(%[output]) \n\t"
1009 "sh %[input], 0x1e0(%[output]) \n\t"
1011 "ori %[tmp0], $0, 0x1f \n\t"
1012 "clz %[tmp1], %[qmul] \n\t"
1013 "ori %[input], $0, 0x07 \n\t"
1014 "dsubu %[tmp1], %[tmp0], %[tmp1] \n\t"
1015 "ori %[tmp0], $0, 0x80 \n\t"
1016 "dsll %[tmp0], %[tmp0], 0x10 \n\t"
1017 "daddu %[qmul], %[qmul], %[tmp0] \n\t"
1018 "dsubu %[tmp0], %[tmp1], %[input] \n\t"
1019 "movn %[tmp1], %[input], %[tmp0] \n\t"
1020 PTR_ADDIU "%[input], %[input], 0x01 \n\t"
1021 "andi %[tmp0], %[tmp1], 0xff \n\t"
1022 "srlv %[qmul], %[qmul], %[tmp0] \n\t"
1023 PTR_SUBU "%[input], %[input], %[tmp1] \n\t"
1024 "mtc1 %[input], %[ftmp6] \n\t"
1025 "punpckhhw %[ftmp1], %[ftmp0], %[ff_pw_1] \n\t"
1026 "punpcklhw %[ftmp0], %[ftmp0], %[ff_pw_1] \n\t"
1027 "punpckhhw %[ftmp5], %[ftmp2], %[ff_pw_1] \n\t"
1028 "punpcklhw %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
1029 "mtc1 %[qmul], %[ftmp7] \n\t"
1030 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1031 "pmaddhw %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
1032 "pmaddhw %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1033 "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1034 "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1035 "psraw %[ftmp0], %[ftmp0], %[ftmp6] \n\t"
1036 "psraw %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1037 "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1038 "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1039 "packsswh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
1040 "packsswh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1041 "dmfc1 %[tmp1], %[ftmp0] \n\t"
1042 "dsrl %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
1043 "sh %[tmp1], 0x00(%[output]) \n\t"
1044 "mfc1 %[input], %[ftmp0] \n\t"
1045 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1046 "sh %[input], 0x80(%[output]) \n\t"
1047 "sh %[tmp1], 0x20(%[output]) \n\t"
1048 PTR_SRL "%[input], %[input], 0x10 \n\t"
1049 "dmfc1 %[tmp1], %[ftmp2] \n\t"
1050 "sh %[input], 0xa0(%[output]) \n\t"
1051 "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
1052 "sh %[tmp1], 0x40(%[output]) \n\t"
1053 "mfc1 %[input], %[ftmp2] \n\t"
1054 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1055 "sh %[input], 0xc0(%[output]) \n\t"
1056 "sh %[tmp1], 0x60(%[output]) \n\t"
1057 PTR_SRL "%[input], %[input], 0x10 \n\t"
1058 "sh %[input], 0xe0(%[output]) \n\t"
1059 "punpckhhw %[ftmp1], %[ftmp3], %[ff_pw_1] \n\t"
1060 "punpcklhw %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
1061 "punpckhhw %[ftmp5], %[ftmp4], %[ff_pw_1] \n\t"
1062 "punpcklhw %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
1063 "mtc1 %[qmul], %[ftmp7] \n\t"
1064 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1065 "pmaddhw %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1066 "pmaddhw %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
1067 "pmaddhw %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1068 "pmaddhw %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1069 "psraw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
1070 "psraw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1071 "psraw %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1072 "psraw %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1073 "packsswh %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
1074 "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1075 "dmfc1 %[tmp1], %[ftmp3] \n\t"
1076 "dsrl %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
1077 "mfc1 %[input], %[ftmp3] \n\t"
1078 "sh %[tmp1], 0x100(%[output]) \n\t"
1079 "sh %[input], 0x180(%[output]) \n\t"
1080 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1081 PTR_SRL "%[input], %[input], 0x10 \n\t"
1082 "sh %[tmp1], 0x120(%[output]) \n\t"
1083 "sh %[input], 0x1a0(%[output]) \n\t"
1084 "dmfc1 %[tmp1], %[ftmp4] \n\t"
1085 "dsrl %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
1086 "mfc1 %[input], %[ftmp4] \n\t"
1087 "sh %[tmp1], 0x140(%[output]) \n\t"
1088 "sh %[input], 0x1c0(%[output]) \n\t"
1089 "dsrl %[tmp1], %[tmp1], 0x10 \n\t"
1090 PTR_SRL "%[input], %[input], 0x10 \n\t"
1091 "sh %[tmp1], 0x160(%[output]) \n\t"
1092 "sh %[input], 0x1e0(%[output]) \n\t"
1095 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1096 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1097 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1098 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1099 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1100 [tmp0]
"=&r"(tmp[0]), [tmp1]
"=&r"(tmp[1]),
1101 [output]
"+&r"(output), [input]
"+&r"(input),
1113 temp[0] = block[0] + block[16];
1114 temp[1] = block[0] - block[16];
1115 temp[2] = block[32] + block[48];
1116 temp[3] = block[32] - block[48];
1117 temp[4] = block[64] + block[80];
1118 temp[5] = block[64] - block[80];
1119 temp[6] = block[96] + block[112];
1120 temp[7] = block[96] - block[112];
1122 t[0] = temp[0] + temp[4] + temp[2] + temp[6];
1123 t[1] = temp[0] - temp[4] + temp[2] - temp[6];
1124 t[2] = temp[0] - temp[4] - temp[2] + temp[6];
1125 t[3] = temp[0] + temp[4] - temp[2] - temp[6];
1126 t[4] = temp[1] + temp[5] + temp[3] + temp[7];
1127 t[5] = temp[1] - temp[5] + temp[3] - temp[7];
1128 t[6] = temp[1] - temp[5] - temp[3] + temp[7];
1129 t[7] = temp[1] + temp[5] - temp[3] - temp[7];
1131 block[ 0]= (t[0]*qmul + 128) >> 8;
1132 block[ 32]= (t[1]*qmul + 128) >> 8;
1133 block[ 64]= (t[2]*qmul + 128) >> 8;
1134 block[ 96]= (t[3]*qmul + 128) >> 8;
1135 block[ 16]= (t[4]*qmul + 128) >> 8;
1136 block[ 48]= (t[5]*qmul + 128) >> 8;
1137 block[ 80]= (t[6]*qmul + 128) >> 8;
1138 block[112]= (t[7]*qmul + 128) >> 8;
1145 d = block[0] - block[16];
1146 a = block[0] + block[16];
1147 b = block[32] - block[48];
1148 c = block[32] + block[48];
1149 block[0] = ((a+
c)*qmul) >> 7;
1150 block[16]= ((d+
b)*qmul) >> 7;
1151 block[32]= ((a-
c)*qmul) >> 7;
1152 block[48]= ((d-
b)*qmul) >> 7;
1161 offset <<= log2_denom;
1164 offset += 1 << (log2_denom - 1);
1168 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1169 "ldc1 %[ftmp1], 0x00(%[block0]) \n\t"
1170 "ldc1 %[ftmp2], 0x00(%[block1]) \n\t"
1171 "mtc1 %[weight], %[ftmp3] \n\t"
1172 "mtc1 %[offset], %[ftmp4] \n\t"
1173 "mtc1 %[log2_denom], %[ftmp5] \n\t"
1174 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1175 "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1176 "punpckhbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
1177 "punpckhbh %[ftmp7], %[ftmp2], %[ftmp0] \n\t"
1178 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1179 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1180 "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
1181 "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1182 "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1183 "pmullh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
1184 "paddsh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1185 "paddsh %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
1186 "paddsh %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1187 "paddsh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1188 "psrah %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1189 "psrah %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1190 "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1191 "psrah %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1192 "packushb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1193 "packushb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1194 "sdc1 %[ftmp1], 0x00(%[block0]) \n\t"
1195 "sdc1 %[ftmp2], 0x00(%[block1]) \n\t"
1196 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1197 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1198 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1199 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7])
1202 [log2_denom]
"r"(log2_denom)
1209 int height,
int log2_denom,
int weightd,
int weights,
int offset)
1214 offset = ((offset + 1) | 1) << log2_denom;
1218 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1219 "ldc1 %[ftmp1], 0x00(%[src0]) \n\t"
1220 "ldc1 %[ftmp2], 0x00(%[dst0]) \n\t"
1221 "mtc1 %[weights], %[ftmp3] \n\t"
1222 "mtc1 %[weightd], %[ftmp4] \n\t"
1223 "mtc1 %[offset], %[ftmp5] \n\t"
1224 "mtc1 %[log2_denom], %[ftmp6] \n\t"
1225 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1226 "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1227 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1228 "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1229 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1230 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1231 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1232 "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1233 "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1234 "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1235 "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1236 "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1237 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1238 "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1239 "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1240 "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1241 "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1242 "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1243 "sdc1 %[ftmp1], 0x00(%[dst0]) \n\t"
1244 "ldc1 %[ftmp1], 0x00(%[src1]) \n\t"
1245 "ldc1 %[ftmp2], 0x00(%[dst1]) \n\t"
1246 "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1247 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1248 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1249 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1250 "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1251 "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1252 "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1253 "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1254 "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1255 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1256 "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1257 "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1258 "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1259 "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1260 "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1261 "sdc1 %[ftmp1], 0x00(%[dst1]) \n\t"
1262 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1263 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1264 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1265 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1266 [ftmp8]
"=&f"(ftmp[8])
1267 : [dst0]
"r"(dst), [dst1]
"r"(dst+8),
1269 [weights]
"r"(weights), [weightd]
"r"(weightd),
1270 [offset]
"r"(offset), [log2_denom]
"r"(log2_denom+1)
1282 offset <<= log2_denom;
1285 offset += 1 << (log2_denom - 1);
1289 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1290 "ldc1 %[ftmp1], 0x00(%[block]) \n\t"
1291 "mtc1 %[weight], %[ftmp2] \n\t"
1292 "mtc1 %[offset], %[ftmp3] \n\t"
1293 "mtc1 %[log2_denom], %[ftmp5] \n\t"
1294 "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1295 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1296 "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t"
1297 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1298 "pmullh %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
1299 "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1300 "paddsh %[ftmp4], %[ftmp4], %[ftmp3] \n\t"
1301 "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1302 "psrah %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1303 "psrah %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1304 "packushb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1305 "sdc1 %[ftmp1], 0x00(%[block]) \n\t"
1306 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1307 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1308 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5])
1317 int height,
int log2_denom,
int weightd,
int weights,
int offset)
1322 offset = ((offset + 1) | 1) << log2_denom;
1326 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1327 "ldc1 %[ftmp1], 0x00(%[src]) \n\t"
1328 "ldc1 %[ftmp2], 0x00(%[dst]) \n\t"
1329 "mtc1 %[weights], %[ftmp3] \n\t"
1330 "mtc1 %[weightd], %[ftmp4] \n\t"
1331 "mtc1 %[offset], %[ftmp5] \n\t"
1332 "mtc1 %[log2_denom], %[ftmp6] \n\t"
1333 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1334 "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1335 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1336 "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t"
1337 "punpckhbh %[ftmp8], %[ftmp2], %[ftmp0] \n\t"
1338 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1339 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1340 "pmullh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1341 "pmullh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
1342 "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1343 "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1344 "paddsh %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1345 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1346 "paddsh %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1347 "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1348 "psrah %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1349 "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1350 "packushb %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
1351 "sdc1 %[ftmp1], 0x00(%[dst]) \n\t"
1352 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1353 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1354 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1355 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1356 [ftmp8]
"=&f"(ftmp[8])
1357 : [dst]
"r"(dst), [
src]
"r"(
src),
1358 [weights]
"r"(weights), [weightd]
"r"(weightd),
1359 [offset]
"r"(offset), [log2_denom]
"r"(log2_denom+1)
1372 offset <<= log2_denom;
1375 offset += 1 << (log2_denom - 1);
1379 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1380 "uld %[low32], 0x00(%[block]) \n\t"
1381 "mtc1 %[low32], %[ftmp1] \n\t"
1382 "mtc1 %[weight], %[ftmp2] \n\t"
1383 "mtc1 %[offset], %[ftmp3] \n\t"
1384 "mtc1 %[log2_denom], %[ftmp4] \n\t"
1385 "pshufh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1386 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1387 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1388 "pmullh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1389 "paddsh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1390 "psrah %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
1391 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1392 "gsswlc1 %[ftmp1], 0x03(%[block]) \n\t"
1393 "gsswrc1 %[ftmp1], 0x00(%[block]) \n\t"
1394 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1395 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1396 [ftmp4]
"=&f"(ftmp[4]),
1406 int height,
int log2_denom,
int weightd,
int weights,
int offset)
1412 offset = ((offset + 1) | 1) << log2_denom;
1416 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1417 "uld %[low32], 0x00(%[src]) \n\t"
1418 "mtc1 %[low32], %[ftmp1] \n\t"
1419 "uld %[low32], 0x00(%[dst]) \n\t"
1420 "mtc1 %[low32], %[ftmp2] \n\t"
1421 "mtc1 %[weight], %[ftmp3] \n\t"
1422 "mtc1 %[weightd], %[ftmp4] \n\t"
1423 "mtc1 %[offset], %[ftmp5] \n\t"
1424 "mtc1 %[log2_denom], %[ftmp6] \n\t"
1425 "pshufh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1426 "pshufh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1427 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1428 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1429 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1430 "pmullh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1431 "pmullh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1432 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1433 "paddsh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
1434 "psrah %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
1435 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1436 "gsswlc1 %[ftmp1], 0x03(%[dst]) \n\t"
1437 "gsswrc1 %[ftmp1], 0x00(%[dst]) \n\t"
1438 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1439 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1440 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1441 [ftmp6]
"=&f"(ftmp[6]),
1443 : [dst]
"r"(dst), [src]
"r"(src),
1444 [
weight]
"r"(weights), [weightd]
"r"(weightd),
1459 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
1460 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1461 PTR_ADDU "%[addr1], %[stride], %[addr0] \n\t"
1462 "addi %[alpha], %[alpha], -0x01 \n\t"
1463 PTR_SUBU "%[addr1], $0, %[addr1] \n\t"
1464 "addi %[beta], %[beta], -0x01 \n\t"
1465 PTR_ADDU "%[addr1], %[addr1], %[pix] \n\t"
1466 "ldc1 %[ftmp3], 0x00(%[pix]) \n\t"
1467 "gsldxc1 %[ftmp1], 0x00(%[addr1], %[stride]) \n\t"
1468 "gsldxc1 %[ftmp2], 0x00(%[addr1], %[addr0]) \n\t"
1469 "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t"
1470 "mtc1 %[alpha], %[ftmp5] \n\t"
1471 "mtc1 %[beta], %[ftmp6] \n\t"
1472 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1473 "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1474 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1475 "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
1476 "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1477 "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1478 "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1479 "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1480 "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1481 "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1482 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1483 "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1484 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1485 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1486 "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1487 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1488 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1489 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1490 "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1491 "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
1492 "uld %[low32], 0x00(%[tc0]) \n\t"
1493 "mtc1 %[low32], %[ftmp5] \n\t"
1494 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1495 "punpcklbh %[ftmp9], %[ftmp5], %[ftmp5] \n\t"
1496 "pcmpgtb %[ftmp5], %[ftmp9], %[ftmp4] \n\t"
1497 "ldc1 %[ftmp4], 0x00(%[addr1]) \n\t"
1498 "and %[ftmp10], %[ftmp5], %[ftmp8] \n\t"
1499 "psubusb %[ftmp8], %[ftmp4], %[ftmp2] \n\t"
1500 "psubusb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
1501 "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1502 "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1503 "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1504 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1505 "and %[ftmp5], %[ftmp10], %[ftmp9] \n\t"
1506 "psubb %[ftmp8], %[ftmp5], %[ftmp7] \n\t"
1507 "and %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1508 "pavgb %[ftmp5], %[ftmp2], %[ftmp3] \n\t"
1509 "ldc1 %[ftmp11], 0x00(%[addr1]) \n\t"
1510 "pavgb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1511 "xor %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
1512 "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1513 "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1514 "psubusb %[ftmp5], %[ftmp1], %[ftmp7] \n\t"
1515 "paddusb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
1516 "pmaxub %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1517 "pminub %[ftmp4], %[ftmp4], %[ftmp7] \n\t"
1518 "gssdxc1 %[ftmp4], 0x00(%[addr1], %[stride]) \n\t"
1519 "gsldxc1 %[ftmp5], 0x00(%[pix], %[addr0]) \n\t"
1520 "psubusb %[ftmp4], %[ftmp5], %[ftmp3] \n\t"
1521 "psubusb %[ftmp7], %[ftmp3], %[ftmp5] \n\t"
1522 "psubusb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1523 "psubusb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1524 "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
1525 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1526 "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1527 "and %[ftmp6], %[ftmp9], %[ftmp7] \n\t"
1528 "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t"
1529 "pavgb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
1530 "gsldxc1 %[ftmp11], 0x00(%[pix], %[addr0]) \n\t"
1531 "pavgb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1532 "xor %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1533 "and %[ftmp7], %[ftmp7], %[ff_pb_1] \n\t"
1534 "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1535 "psubusb %[ftmp7], %[ftmp4], %[ftmp6] \n\t"
1536 "paddusb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1537 "pmaxub %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1538 "pminub %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1539 "gssdxc1 %[ftmp5], 0x00(%[pix], %[stride]) \n\t"
1540 "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1541 "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1542 "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
1543 "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1544 "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
1545 "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
1546 "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
1547 "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
1548 "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1549 "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1550 "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
1551 "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
1552 "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1553 "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1554 "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1555 "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
1556 "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1557 "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1558 "gssdxc1 %[ftmp2], 0x00(%[addr1], %[addr0]) \n\t"
1559 "sdc1 %[ftmp3], 0x00(%[pix]) \n\t"
1560 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1561 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1562 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1563 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1564 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1565 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1566 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
1585 "ori %[tmp0], $0, 0x01 \n\t"
1586 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1587 "mtc1 %[tmp0], %[ftmp9] \n\t"
1588 PTR_SLL "%[addr0], %[stride], 0x02 \n\t"
1589 PTR_ADDU "%[addr2], %[stride], %[stride] \n\t"
1590 PTR_ADDIU "%[alpha], %[alpha], -0x01 \n\t"
1591 PTR_SLL "%[ftmp11], %[ftmp9], %[ftmp9] \n\t"
1592 "bltz %[alpha], 1f \n\t"
1593 PTR_ADDU "%[addr1], %[addr2], %[stride] \n\t"
1594 PTR_ADDIU "%[beta], %[beta], -0x01 \n\t"
1595 "bltz %[beta], 1f \n\t"
1596 PTR_SUBU "%[addr0], $0, %[addr0] \n\t"
1597 PTR_ADDU "%[addr0], %[addr0], %[pix] \n\t"
1598 "ldc1 %[ftmp3], 0x00(%[pix]) \n\t"
1599 "gsldxc1 %[ftmp1], 0x00(%[addr0], %[addr2]) \n\t"
1600 "gsldxc1 %[ftmp2], 0x00(%[addr0], %[addr1]) \n\t"
1601 "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t"
1602 "mtc1 %[alpha], %[ftmp5] \n\t"
1603 "mtc1 %[beta], %[ftmp6] \n\t"
1604 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1605 "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1606 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1607 "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1608 "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1609 "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
1610 "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1611 "sdc1 %[ftmp5], 0x10+%[stack] \n\t"
1612 "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1613 "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1614 "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1615 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1616 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1617 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1618 "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1619 "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1620 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1621 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1622 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1623 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1624 "ldc1 %[ftmp5], 0x10+%[stack] \n\t"
1625 "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1626 "ldc1 %[ftmp10], %[ff_pb_1] \n\t"
1627 "sdc1 %[ftmp8], 0x20+%[stack] \n\t"
1628 "pavgb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1629 "psubusb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
1630 "pavgb %[ftmp5], %[ftmp5], %[ftmp10] \n\t"
1631 "psubusb %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
1632 "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1633 "psubusb %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1634 "ldc1 %[ftmp15], 0x20+%[stack] \n\t"
1635 "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1636 "and %[ftmp7], %[ftmp7], %[ftmp15] \n\t"
1637 "gsldxc1 %[ftmp15], 0x00(%[addr0], %[stride]) \n\t"
1638 "psubusb %[ftmp8], %[ftmp15], %[ftmp2] \n\t"
1639 "psubusb %[ftmp5], %[ftmp2], %[ftmp15] \n\t"
1640 "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1641 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1642 "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
1643 "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1644 "gsldxc1 %[ftmp14], 0x00(%[pix], %[addr2]) \n\t"
1645 "sdc1 %[ftmp5], 0x30+%[stack] \n\t"
1646 "psubusb %[ftmp8], %[ftmp14], %[ftmp3] \n\t"
1647 "psubusb %[ftmp5], %[ftmp3], %[ftmp14] \n\t"
1648 "psubusb %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1649 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1650 "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
1651 "and %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1652 "sdc1 %[ftmp5], 0x40+%[stack] \n\t"
1653 "pavgb %[ftmp5], %[ftmp15], %[ftmp1] \n\t"
1654 "pavgb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1655 "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1656 "sdc1 %[ftmp6], 0x10+%[stack] \n\t"
1657 "paddb %[ftmp7], %[ftmp15], %[ftmp1] \n\t"
1658 "paddb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1659 "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1660 "mov.d %[ftmp8], %[ftmp7] \n\t"
1661 "sdc1 %[ftmp7], 0x00+%[stack] \n\t"
1662 "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
1663 "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1664 "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1665 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1666 "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1667 "pavgb %[ftmp6], %[ftmp15], %[ftmp4] \n\t"
1668 "psubb %[ftmp7], %[ftmp15], %[ftmp4] \n\t"
1669 "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
1670 "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1671 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1672 "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1673 "ldc1 %[ftmp13], 0x10+%[stack] \n\t"
1674 "pavgb %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
1675 "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
1676 "pavgb %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
1677 "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1678 "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1679 "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1680 "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
1681 "xor %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
1682 "pavgb %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
1683 "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1684 "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1685 "ldc1 %[ftmp13], 0x30+%[stack] \n\t"
1686 "pavgb %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
1687 "ldc1 %[ftmp12], 0x20+%[stack] \n\t"
1688 "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1689 "xor %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
1690 "and %[ftmp6], %[ftmp6], %[ftmp13] \n\t"
1691 "and %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1692 "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1693 "xor %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
1694 "gssdxc1 %[ftmp6], 0x00(%[addr0], %[addr1]) \n\t"
1695 "ldc1 %[ftmp6], 0x00(%[addr0]) \n\t"
1696 "paddb %[ftmp7], %[ftmp15], %[ftmp6] \n\t"
1697 "pavgb %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1698 "ldc1 %[ftmp12], 0x00+%[stack] \n\t"
1699 "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1700 "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1701 "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1702 "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1703 "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1704 "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1705 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1706 "ldc1 %[ftmp12], 0x30+%[stack] \n\t"
1707 "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1708 "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
1709 "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1710 "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
1711 "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1712 "xor %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
1713 "xor %[ftmp6], %[ftmp6], %[ftmp15] \n\t"
1714 "gssdxc1 %[ftmp5], 0x00(%[addr0], %[addr2]) \n\t"
1715 "gssdxc1 %[ftmp6], 0x00(%[addr0], %[stride]) \n\t"
1716 "pavgb %[ftmp5], %[ftmp14], %[ftmp4] \n\t"
1717 "pavgb %[ftmp6], %[ftmp3], %[ftmp2] \n\t"
1718 "pavgb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1719 "sdc1 %[ftmp6], 0x10+%[stack] \n\t"
1720 "paddb %[ftmp7], %[ftmp14], %[ftmp4] \n\t"
1721 "paddb %[ftmp8], %[ftmp3], %[ftmp2] \n\t"
1722 "paddb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1723 "mov.d %[ftmp8], %[ftmp7] \n\t"
1724 "sdc1 %[ftmp7], 0x00+%[stack] \n\t"
1725 "psrlh %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
1726 "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1727 "xor %[ftmp7], %[ftmp7], %[ftmp5] \n\t"
1728 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1729 "psubb %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1730 "pavgb %[ftmp6], %[ftmp14], %[ftmp1] \n\t"
1731 "paddb %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
1732 "psubb %[ftmp7], %[ftmp14], %[ftmp1] \n\t"
1733 "psubb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1734 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1735 "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1736 "ldc1 %[ftmp12], 0x10+%[stack] \n\t"
1737 "pavgb %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
1738 "pavgb %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1739 "psrlh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
1740 "pavgb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1741 "xor %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
1742 "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1743 "psubb %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
1744 "xor %[ftmp8], %[ftmp3], %[ftmp1] \n\t"
1745 "pavgb %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
1746 "and %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
1747 "ldc1 %[ftmp12], 0x40+%[stack] \n\t"
1748 "psubb %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1749 "ldc1 %[ftmp13], 0x20+%[stack] \n\t"
1750 "pavgb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
1751 "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1752 "xor %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
1753 "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1754 "and %[ftmp7], %[ftmp7], %[ftmp13] \n\t"
1755 "xor %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1756 "xor %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
1757 "sdc1 %[ftmp6], 0x00(%[pix]) \n\t"
1758 "gsldxc1 %[ftmp6], 0x00(%[pix], %[addr1]) \n\t"
1759 "paddb %[ftmp7], %[ftmp14], %[ftmp6] \n\t"
1760 "pavgb %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1761 "ldc1 %[ftmp12], 0x00+%[stack] \n\t"
1762 "pavgb %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
1763 "paddb %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1764 "paddb %[ftmp7], %[ftmp7], %[ftmp12] \n\t"
1765 "psrlh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
1766 "pavgb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1767 "xor %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
1768 "and %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1769 "ldc1 %[ftmp12], 0x40+%[stack] \n\t"
1770 "psubb %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
1771 "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
1772 "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1773 "and %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
1774 "and %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
1775 "xor %[ftmp5], %[ftmp5], %[ftmp4] \n\t"
1776 "xor %[ftmp6], %[ftmp6], %[ftmp14] \n\t"
1777 "gssdxc1 %[ftmp5], 0x00(%[pix], %[stride]) \n\t"
1778 "gssdxc1 %[ftmp6], 0x00(%[pix], %[addr2]) \n\t"
1780 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1781 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1782 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1783 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1784 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1785 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1786 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1787 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1788 [tmp0]
"=&r"(tmp[0]),
1789 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
1790 [addr2]
"=&r"(addr[2]),
/*
 * Inline-asm fragment.  The enclosing function header, the opening of the asm
 * statement and its input/clobber operand lists are not part of this excerpt.
 *
 * Reads four 8-byte rows (pix-2*stride, pix-stride, pix, pix+stride), builds a
 * per-byte enable mask from alpha/beta, limits the correction with tc0 and
 * writes back the two middle rows (pix-stride and pix).
 * In H.264 deblocking terms the rows are presumably p1, p0, q0, q1 -- confirm.
 */
/* Thresholds are decremented so the unsigned saturating tests below act as
 * a strict less-than against the original alpha/beta.
 * NOTE(review): alpha and beta are written here; confirm they are declared as
 * read-write operands -- their constraints are not visible in this excerpt. */
1806 "addi %[alpha], %[alpha], -0x01 \n\t"
1807 "addi %[beta], %[beta], -0x01 \n\t"
/* addr0 = pix - 2*stride (PTR_SUBU is a macro defined outside this excerpt). */
1808 "or %[addr0], $0, %[pix] \n\t"
1809 PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
1810 PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
/* ftmp1..ftmp4 = rows at pix-2*stride, pix-stride, pix, pix+stride. */
1811 "ldc1 %[ftmp1], 0x00(%[addr0]) \n\t"
1812 "gsldxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t"
1813 "ldc1 %[ftmp3], 0x00(%[pix]) \n\t"
1814 "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t"
/* Broadcast alpha-1 / beta-1 to every byte lane of ftmp5 / ftmp6:
 * pshufh with an all-zero selector replicates halfword 0, packushb narrows
 * the halfwords to unsigned-saturated bytes. */
1816 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1817 "mtc1 %[alpha], %[ftmp5] \n\t"
1818 "mtc1 %[beta], %[ftmp6] \n\t"
1819 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1820 "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1821 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1822 "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
/* Enable mask in ftmp8.  Each absolute difference is formed as
 * (a -sat b) | (b -sat a); subtracting the threshold with saturation leaves 0
 * exactly when the difference is within the threshold.  After OR-ing the
 * three tests and comparing against zero, a byte of ftmp8 is 0xff when
 *   |ftmp3-ftmp2| <= alpha-1, |ftmp2-ftmp1| <= beta-1, |ftmp3-ftmp4| <= beta-1
 * and 0x00 otherwise. */
1823 "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1824 "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1825 "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1826 "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1827 "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1828 "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1829 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1830 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1831 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1832 "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1833 "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1834 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1835 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1836 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1837 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1838 "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* Load the tc0 bytes, duplicate each byte into two adjacent lanes
 * (punpcklbh of a register with itself) and AND with the enable mask, so
 * ftmp8 becomes the per-byte correction limit (0 where filtering is off).
 * NOTE(review): uld is a 64-bit unaligned load while mtc1 only transfers the
 * low 32 bits; confirm that reading 8 bytes at tc0 is within bounds. */
1839 "uld %[low32], 0x00(%[tc0]) \n\t"
1840 "mtc1 %[low32], %[ftmp7] \n\t"
1841 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1842 "and %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* Correction term.  ftmp5 is set to all ones so the XORs below form bitwise
 * complements of ftmp4 and ftmp2.  A biased delta is accumulated in ftmp4
 * with rounding averages (pavgb) and one saturating add; ff_pb_1, ff_pb_3 and
 * ff_pb_A1 are constants defined outside this excerpt (presumably byte
 * splats of 0x01, 0x03 and 0xA1 -- confirm). */
1843 "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1844 "xor %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
1845 "xor %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1846 "and %[ftmp6], %[ftmp6], %[ff_pb_1] \n\t"
1847 "pavgb %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
1848 "xor %[ftmp5], %[ftmp5], %[ftmp2] \n\t"
1849 "pavgb %[ftmp4], %[ftmp4], %[ff_pb_3] \n\t"
1850 "pavgb %[ftmp5], %[ftmp5], %[ftmp3] \n\t"
1851 "pavgb %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1852 "paddusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/* Split the biased delta around ff_pb_A1 into two non-negative magnitudes
 * (ftmp7 = amount below the bias, ftmp4 = amount above it) and clamp each to
 * the per-byte limit in ftmp8. */
1853 "psubusb %[ftmp7], %[ff_pb_A1], %[ftmp4] \n\t"
1854 "psubusb %[ftmp4], %[ftmp4], %[ff_pb_A1] \n\t"
1855 "pminub %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
1856 "pminub %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
/* Apply with unsigned saturation: ftmp2 = ftmp2 - ftmp7 + ftmp4 and
 * ftmp3 = ftmp3 - ftmp4 + ftmp7 (opposite signs on the two rows). */
1857 "psubusb %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
1858 "psubusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
1859 "paddusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1860 "paddusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* Write back rows pix-stride (addr0 + stride) and pix. */
1862 "gssdxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t"
1863 "sdc1 %[ftmp3], 0x00(%[pix]) \n\t"
/* Output operands: early-clobber FP temporaries and the address scratch. */
1864 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1865 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1866 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1867 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1868 [ftmp8]
"=&f"(ftmp[8]),
1869 [addr0]
"=&r"(addr[0]),
/*
 * Inline-asm fragment.  The enclosing function header, the opening of the asm
 * statement and parts of its operand lists are not part of this excerpt.
 *
 * Reads four 8-byte rows (pix-2*stride, pix-stride, pix, pix+stride), builds a
 * per-byte enable mask from alpha/beta and replaces the two middle rows with
 * a 3-tap smoothed value wherever the mask is set.  No tc0 limit is used.
 */
/* NOTE(review): alpha and beta are decremented in place, but at the end of
 * this fragment they are bound with the plain r constraint (no = or +), i.e.
 * as input-only operands.  GCC requires that input-only operands are not
 * modified by the asm; they should be read-write operands or copied to a
 * scratch register first. */
1886 "addi %[alpha], %[alpha], -0x01 \n\t"
1887 "addi %[beta], %[beta], -0x01 \n\t"
/* addr0 = pix - 2*stride (PTR_SUBU is a macro defined outside this excerpt). */
1888 "or %[addr0], $0, %[pix] \n\t"
1889 PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
1890 PTR_SUBU "%[addr0], %[addr0], %[stride] \n\t"
/* ftmp1..ftmp4 = rows at pix-2*stride, pix-stride, pix, pix+stride. */
1891 "ldc1 %[ftmp1], 0x00(%[addr0]) \n\t"
1892 "gsldxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t"
1893 "ldc1 %[ftmp3], 0x00(%[pix]) \n\t"
1894 "gsldxc1 %[ftmp4], 0x00(%[pix], %[stride]) \n\t"
/* Broadcast alpha-1 / beta-1 to every byte lane of ftmp5 / ftmp6 (pshufh
 * with a zero selector replicates halfword 0; packushb narrows to bytes). */
1896 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1897 "mtc1 %[alpha], %[ftmp5] \n\t"
1898 "mtc1 %[beta], %[ftmp6] \n\t"
1899 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1900 "pshufh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1901 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
1902 "packushb %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
/* Enable mask in ftmp8: a byte becomes 0xff when
 *   |ftmp3-ftmp2| <= alpha-1, |ftmp2-ftmp1| <= beta-1, |ftmp3-ftmp4| <= beta-1
 * (absolute differences via two saturating subtractions OR-ed together,
 * thresholds removed with a further saturating subtraction). */
1903 "psubusb %[ftmp7], %[ftmp3], %[ftmp2] \n\t"
1904 "psubusb %[ftmp8], %[ftmp2], %[ftmp3] \n\t"
1905 "or %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
1906 "psubusb %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1907 "psubusb %[ftmp7], %[ftmp2], %[ftmp1] \n\t"
1908 "psubusb %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1909 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1910 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1911 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1912 "psubusb %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1913 "psubusb %[ftmp5], %[ftmp4], %[ftmp3] \n\t"
1914 "or %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1915 "psubusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t"
1916 "or %[ftmp8], %[ftmp8], %[ftmp5] \n\t"
1917 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
1918 "pcmpeqb %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* Keep the unfiltered middle rows for the masked select below. */
1919 "mov.d %[ftmp6], %[ftmp2] \n\t"
1920 "mov.d %[ftmp7], %[ftmp3] \n\t"
/* ftmp2 = (2*ftmp1 + ftmp2 + ftmp4 + 2) >> 2.  pavgb rounds up, so the low
 * bit of the XOR is subtracted to get a round-down average of ftmp2 and
 * ftmp4 before the final rounding average with ftmp1.  ff_pb_1 is defined
 * outside this excerpt (presumably 0x01 in every byte -- confirm). */
1921 "xor %[ftmp5], %[ftmp2], %[ftmp4] \n\t"
1922 "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1923 "pavgb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
1924 "psubusb %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1925 "pavgb %[ftmp2], %[ftmp2], %[ftmp1] \n\t"
/* ftmp3 = (2*ftmp4 + ftmp3 + ftmp1 + 2) >> 2, same construction. */
1926 "xor %[ftmp5], %[ftmp3], %[ftmp1] \n\t"
1927 "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
1928 "pavgb %[ftmp3], %[ftmp3], %[ftmp1] \n\t"
1929 "psubusb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1930 "pavgb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* Masked select: result = original + ((filtered - original) & mask), using
 * wrapping byte arithmetic, so unmasked bytes keep their original value. */
1931 "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1932 "psubb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1933 "and %[ftmp2], %[ftmp2], %[ftmp8] \n\t"
1934 "and %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
1935 "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1936 "paddb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
/* Write back rows pix-stride (addr0 + stride) and pix. */
1938 "gssdxc1 %[ftmp2], 0x00(%[addr0], %[stride]) \n\t"
1939 "sdc1 %[ftmp3], 0x00(%[pix]) \n\t"
/* Output operands: early-clobber FP temporaries and the address scratch. */
1940 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1941 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1942 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1943 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1944 [ftmp8]
"=&f"(ftmp[8]),
1945 [addr0]
"=&r"(addr[0])
1947 [alpha]
"r"(alpha), [beta]
"r"(beta),
/*
 * Inline-asm fragment.  The enclosing function header, the opening of the asm
 * statement and parts of its operand lists are not part of this excerpt.
 *
 * Works on the 4 bytes starting at pix-2 in each of 8 consecutive rows:
 * the 8x4 block is transposed into four 8-byte column registers, filtered
 * with the alpha/beta mask and the tc0 limit, transposed back and stored.
 */
/* NOTE(review): alpha and beta are decremented in place although the operand
 * list at the end of this fragment binds them with the plain r constraint in
 * the input section; GCC requires input-only operands to stay unmodified.
 * pix is rewritten below as well; its constraint is not visible here. */
1961 "addi %[alpha], %[alpha], -0x01 \n\t"
1962 "addi %[beta], %[beta], -0x01 \n\t"
/* addr0 = 2*stride, addr1 = 3*stride, addr2 = 4*stride.
 * addr5 = original pix - 2 (row 0); pix is moved to row 3 (addr5 + 3*stride).
 * PTR_ADDU / PTR_ADDI are macros defined outside this excerpt. */
1963 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
1964 PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
1965 PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
1966 PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
1967 "or %[addr5], $0, %[pix] \n\t"
1968 PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
/* Rows 0..3 -> ftmp0, ftmp2, ftmp1, ftmp3 (low 32 bits of each).
 * NOTE(review): uld is a 64-bit unaligned load while mtc1 only transfers the
 * low 32 bits; confirm that reading 8 bytes per row is within bounds. */
1969 "uld %[low32], 0x00(%[addr5]) \n\t"
1970 "mtc1 %[low32], %[ftmp0] \n\t"
1971 PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
1972 "uld %[low32], 0x00(%[addr3]) \n\t"
1973 "mtc1 %[low32], %[ftmp2] \n\t"
1974 PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
1975 "uld %[low32], 0x00(%[addr4]) \n\t"
1976 "mtc1 %[low32], %[ftmp1] \n\t"
1977 "uld %[low32], 0x00(%[pix]) \n\t"
1978 "mtc1 %[low32], %[ftmp3] \n\t"
/* Interleave bytes then halfwords: ftmp0 = columns 0,1 of rows 0..3,
 * ftmp2 = columns 2,3 of rows 0..3. */
1979 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
1980 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
1981 PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
1982 "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
1983 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* Rows 4..7 (pix+stride .. pix+4*stride) -> ftmp4, ftmp6, ftmp5, ftmp7. */
1984 "uld %[low32], 0x00(%[addr3]) \n\t"
1985 "mtc1 %[low32], %[ftmp4] \n\t"
1986 PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
1987 "uld %[low32], 0x00(%[addr4]) \n\t"
1988 "mtc1 %[low32], %[ftmp6] \n\t"
1989 PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
1990 "uld %[low32], 0x00(%[addr3]) \n\t"
1991 "mtc1 %[low32], %[ftmp5] \n\t"
1992 PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
1993 "uld %[low32], 0x00(%[addr4]) \n\t"
1994 "mtc1 %[low32], %[ftmp7] \n\t"
/* Same interleave for rows 4..7, then merge words so that ftmp0..ftmp3 each
 * hold one full column (8 rows) at byte offsets 0..3 from addr5. */
1995 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
1996 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
1997 "mov.d %[ftmp6], %[ftmp4] \n\t"
1998 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1999 "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
2000 "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
2001 "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
2002 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2003 "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Save the outer columns; they are needed unchanged for the transpose back
 * (ftmp3 is overwritten by the filter below). */
2004 "mov.d %[ftmp9], %[ftmp0] \n\t"
2005 "mov.d %[ftmp10], %[ftmp3] \n\t"
/* Broadcast alpha-1 / beta-1 to every byte lane of ftmp4 / ftmp5. */
2007 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
2008 "mtc1 %[alpha], %[ftmp4] \n\t"
2009 "mtc1 %[beta], %[ftmp5] \n\t"
2010 "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
2011 "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
2012 "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2013 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
/* Enable mask in ftmp7: a byte becomes 0xff when
 *   |ftmp2-ftmp1| <= alpha-1, |ftmp1-ftmp0| <= beta-1, |ftmp2-ftmp3| <= beta-1. */
2014 "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
2015 "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
2016 "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
2017 "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2018 "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
2019 "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
2020 "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2021 "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2022 "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2023 "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
2024 "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
2025 "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2026 "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2027 "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2028 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2029 "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
/* tc0 bytes, each duplicated into two adjacent lanes, AND-ed with the mask:
 * ftmp7 becomes the per-byte correction limit.
 * NOTE(review): 8 bytes are read from tc0, only the low 4 are used. */
2030 "uld %[low32], 0x00(%[tc0]) \n\t"
2031 "mtc1 %[low32], %[ftmp6] \n\t"
2032 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2033 "and %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
/* Biased delta accumulated in ftmp3 with rounding averages; ftmp4 starts as
 * all ones so the XORs form bitwise complements.  ff_pb_1, ff_pb_3 and
 * ff_pb_A1 are constants defined outside this excerpt. */
2034 "pcmpeqb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2035 "xor %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
2036 "xor %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
2037 "and %[ftmp5], %[ftmp5], %[ff_pb_1] \n\t"
2038 "pavgb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
2039 "xor %[ftmp4], %[ftmp4], %[ftmp1] \n\t"
2040 "pavgb %[ftmp3], %[ftmp3], %[ff_pb_3] \n\t"
2041 "pavgb %[ftmp4], %[ftmp4], %[ftmp2] \n\t"
2042 "pavgb %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
2043 "paddusb %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
/* Split around the bias into two magnitudes, clamp both to ftmp7, then apply
 * with opposite signs to ftmp1 and ftmp2 using unsigned saturation. */
2044 "psubusb %[ftmp6], %[ff_pb_A1], %[ftmp3] \n\t"
2045 "psubusb %[ftmp3], %[ftmp3], %[ff_pb_A1] \n\t"
2046 "pminub %[ftmp6], %[ftmp6], %[ftmp7] \n\t"
2047 "pminub %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
2048 "psubusb %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
2049 "psubusb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2050 "paddusb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2051 "paddusb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Transpose back.  Columns are ftmp9 (saved col 0), ftmp1, ftmp2 and ftmp10
 * (saved col 3).  ftmp4/ftmp5/ftmp6 take the upper halves (rows 4..7) of
 * columns 0..2 for the second group of stores. */
2053 "punpckhwd %[ftmp4], %[ftmp9], %[ftmp9] \n\t"
2054 "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2055 "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2056 "punpcklbh %[ftmp0], %[ftmp9], %[ftmp1] \n\t"
2057 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
2058 "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2059 "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
/* Rows 0..3: each gsswlc1/gsswrc1 pair is one unaligned 32-bit store; the
 * punpckhwd of a register with itself moves the next row into the low word. */
2060 "gsswlc1 %[ftmp1], 0x03(%[addr5]) \n\t"
2061 "gsswrc1 %[ftmp1], 0x00(%[addr5]) \n\t"
2062 PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2063 "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2064 "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t"
2065 PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2066 "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2067 "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t"
2068 "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
2069 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2070 "punpckhwd %[ftmp3], %[ftmp10], %[ftmp10] \n\t"
2071 "gsswlc1 %[ftmp0], 0x03(%[pix]) \n\t"
2072 "gsswrc1 %[ftmp0], 0x00(%[pix]) \n\t"
/* Rows 4..7 at pix+stride, pix+2*stride, pix+3*stride, pix+4*stride. */
2073 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2074 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2075 PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2076 "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2077 "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2078 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2079 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2080 "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2081 PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
2082 PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
2083 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2084 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2085 "gsswlc1 %[ftmp4], 0x03(%[addr4]) \n\t"
2086 PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
2087 "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
2088 "gsswrc1 %[ftmp4], 0x00(%[addr4]) \n\t"
2089 "gsswlc1 %[ftmp9], 0x03(%[addr3]) \n\t"
2090 "gsswrc1 %[ftmp9], 0x00(%[addr3]) \n\t"
/* Output operands (early-clobber temporaries), then the visible start of the
 * input list. */
2091 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2092 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2093 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2094 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2095 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2096 [ftmp10]
"=&f"(ftmp[10]),
2097 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2098 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2099 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2102 : [alpha]
"r"(alpha), [beta]
"r"(beta),
/*
 * Inline-asm fragment.  The enclosing function header, the opening of the asm
 * statement and parts of its operand lists are not part of this excerpt.
 *
 * Works on the 4 bytes starting at pix-2 in each of 8 consecutive rows:
 * the 8x4 block is transposed into four 8-byte column registers, the two
 * middle columns are replaced by a 3-tap smoothed value wherever the
 * alpha/beta mask is set, and the block is transposed back and stored.
 */
/* NOTE(review): alpha and beta are decremented in place although the operand
 * list at the end of this fragment binds them with the plain r constraint in
 * the input section; GCC requires input-only operands to stay unmodified.
 * pix is rewritten below as well; its constraint is not visible here. */
2118 "addi %[alpha], %[alpha], -0x01 \n\t"
2119 "addi %[beta], %[beta], -0x01 \n\t"
/* addr0 = 2*stride, addr1 = 3*stride, addr2 = 4*stride.
 * addr5 = original pix - 2 (row 0); pix is moved to row 3 (addr5 + 3*stride). */
2120 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2121 PTR_ADDI "%[pix], %[pix], -0x02 \n\t"
2122 PTR_ADDU "%[addr1], %[addr0], %[stride] \n\t"
2123 PTR_ADDU "%[addr2], %[addr0], %[addr0] \n\t"
2124 "or %[addr5], $0, %[pix] \n\t"
2125 PTR_ADDU "%[pix], %[pix], %[addr1] \n\t"
/* Rows 0..3 -> ftmp0, ftmp2, ftmp1, ftmp3 (low 32 bits of each).
 * NOTE(review): uld is a 64-bit unaligned load while mtc1 only transfers the
 * low 32 bits; confirm that reading 8 bytes per row is within bounds. */
2126 "uld %[low32], 0x00(%[addr5]) \n\t"
2127 "mtc1 %[low32], %[ftmp0] \n\t"
2128 PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2129 "uld %[low32], 0x00(%[addr3]) \n\t"
2130 "mtc1 %[low32], %[ftmp2] \n\t"
2131 PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2132 "uld %[low32], 0x00(%[addr4]) \n\t"
2133 "mtc1 %[low32], %[ftmp1] \n\t"
2134 "uld %[low32], 0x00(%[pix]) \n\t"
2135 "mtc1 %[low32], %[ftmp3] \n\t"
/* Interleave bytes then halfwords: ftmp0 = columns 0,1 of rows 0..3,
 * ftmp2 = columns 2,3 of rows 0..3. */
2136 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2137 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2138 PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2139 "punpckhhw %[ftmp2], %[ftmp0], %[ftmp1] \n\t"
2140 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
/* Rows 4..7 (pix+stride .. pix+4*stride) -> ftmp4, ftmp6, ftmp5, ftmp7. */
2141 "uld %[low32], 0x00(%[addr3]) \n\t"
2142 "mtc1 %[low32], %[ftmp4] \n\t"
2143 PTR_ADDU "%[addr4], %[pix], %[addr0] \n\t"
2144 "uld %[low32], 0x00(%[addr4]) \n\t"
2145 "mtc1 %[low32], %[ftmp6] \n\t"
2146 PTR_ADDU "%[addr3], %[pix], %[addr1] \n\t"
2147 "uld %[low32], 0x00(%[addr3]) \n\t"
2148 "mtc1 %[low32], %[ftmp5] \n\t"
2149 PTR_ADDU "%[addr4], %[pix], %[addr2] \n\t"
2150 "uld %[low32], 0x00(%[addr4]) \n\t"
2151 "mtc1 %[low32], %[ftmp7] \n\t"
/* Same interleave for rows 4..7, then merge words so that ftmp0..ftmp3 each
 * hold one full column (8 rows) at byte offsets 0..3 from addr5. */
2152 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2153 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
2154 "mov.d %[ftmp6], %[ftmp4] \n\t"
2155 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2156 "punpckhhw %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
2157 "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t"
2158 "punpckhwd %[ftmp3], %[ftmp2], %[ftmp6] \n\t"
2159 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2160 "punpcklwd %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Broadcast alpha-1 / beta-1 to every byte lane of ftmp4 / ftmp5. */
2162 "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
2163 "mtc1 %[alpha], %[ftmp4] \n\t"
2164 "mtc1 %[beta], %[ftmp5] \n\t"
2165 "pshufh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
2166 "pshufh %[ftmp5], %[ftmp5], %[ftmp8] \n\t"
2167 "packushb %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2168 "packushb %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
/* Enable mask in ftmp7: a byte becomes 0xff when
 *   |ftmp2-ftmp1| <= alpha-1, |ftmp1-ftmp0| <= beta-1, |ftmp2-ftmp3| <= beta-1. */
2169 "psubusb %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
2170 "psubusb %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
2171 "or %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
2172 "psubusb %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2173 "psubusb %[ftmp6], %[ftmp1], %[ftmp0] \n\t"
2174 "psubusb %[ftmp4], %[ftmp0], %[ftmp1] \n\t"
2175 "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2176 "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2177 "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2178 "psubusb %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
2179 "psubusb %[ftmp4], %[ftmp3], %[ftmp2] \n\t"
2180 "or %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2181 "psubusb %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2182 "or %[ftmp7], %[ftmp7], %[ftmp4] \n\t"
2183 "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
2184 "pcmpeqb %[ftmp7], %[ftmp7], %[ftmp6] \n\t"
/* Keep the unfiltered middle columns for the masked select below. */
2185 "mov.d %[ftmp5], %[ftmp1] \n\t"
2186 "mov.d %[ftmp6], %[ftmp2] \n\t"
/* ftmp1 = (2*ftmp0 + ftmp1 + ftmp3 + 2) >> 2: round-down average of ftmp1
 * and ftmp3 (pavgb minus the XOR low bit), then rounding average with ftmp0.
 * ff_pb_1 is defined outside this excerpt. */
2187 "xor %[ftmp4], %[ftmp1], %[ftmp3] \n\t"
2188 "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
2189 "pavgb %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
2190 "psubusb %[ftmp1], %[ftmp1], %[ftmp4] \n\t"
2191 "pavgb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
/* ftmp2 = (2*ftmp3 + ftmp2 + ftmp0 + 2) >> 2, same construction. */
2192 "xor %[ftmp4], %[ftmp2], %[ftmp0] \n\t"
2193 "and %[ftmp4], %[ftmp4], %[ff_pb_1] \n\t"
2194 "pavgb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
2195 "psubusb %[ftmp2], %[ftmp2], %[ftmp4] \n\t"
2196 "pavgb %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
/* Masked select: result = original + ((filtered - original) & mask). */
2197 "psubb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2198 "psubb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
2199 "and %[ftmp1], %[ftmp1], %[ftmp7] \n\t"
2200 "and %[ftmp2], %[ftmp2], %[ftmp7] \n\t"
2201 "paddb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2202 "paddb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Transpose back from columns ftmp0..ftmp3.  ftmp4/ftmp5/ftmp6 take the
 * upper halves (rows 4..7) of columns 0..2 for the second group of stores. */
2204 "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2205 "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2206 "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2207 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2208 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2209 "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2210 "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
/* Rows 0..3: each gsswlc1/gsswrc1 pair is one unaligned 32-bit store. */
2211 "gsswlc1 %[ftmp1], 0x03(%[addr5]) \n\t"
2212 "gsswrc1 %[ftmp1], 0x00(%[addr5]) \n\t"
2213 PTR_ADDU "%[addr3], %[addr5], %[stride] \n\t"
2214 "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2215 "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t"
2216 PTR_ADDU "%[addr4], %[addr5], %[addr0] \n\t"
2217 "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2218 "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t"
2219 "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
2220 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2221 "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2222 "gsswlc1 %[ftmp0], 0x03(%[pix]) \n\t"
2223 "gsswrc1 %[ftmp0], 0x00(%[pix]) \n\t"
/* Rows 4..7 at pix+stride, pix+2*stride, pix+3*stride, pix+4*stride. */
2224 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2225 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2226 PTR_ADDU "%[addr3], %[pix], %[stride] \n\t"
2227 "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2228 "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2229 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2230 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2231 "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2232 PTR_ADDU "%[addr3], %[pix], %[addr0] \n\t"
2233 PTR_ADDU "%[addr4], %[pix], %[addr1] \n\t"
2234 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2235 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2236 "gsswlc1 %[ftmp4], 0x03(%[addr4]) \n\t"
2237 PTR_ADDU "%[addr3], %[pix], %[addr2] \n\t"
2238 "punpckhwd %[ftmp9], %[ftmp4], %[ftmp4] \n\t"
2239 "gsswrc1 %[ftmp4], 0x00(%[addr4]) \n\t"
2240 "gsswlc1 %[ftmp9], 0x03(%[addr3]) \n\t"
2241 "gsswrc1 %[ftmp9], 0x00(%[addr3]) \n\t"
/* Output operands (early-clobber temporaries), then the visible start of the
 * input list. */
2242 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2243 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2244 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2245 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2246 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2247 [ftmp10]
"=&f"(ftmp[10]),
2248 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2249 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2250 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2253 : [alpha]
"r"(alpha), [beta]
"r"(beta),
/* The bitwise AND of two values has its sign bit set only when both operands
 * do, so this condition holds unless tc0[0] and tc0[1] are both negative
 * (assumes signed elements; the declaration of tc0 is outside this excerpt).
 * The guarded statement is not part of this excerpt. */
2262 if ((tc0[0] & tc0[1]) >= 0)
/* Same test for the second pair of tc0 entries. */
2264 if ((tc0[2] & tc0[3]) >= 0)
/* Scratch buffer of 13 x 8 = 104 bytes.  The first asm fragment below stores
 * 8-byte values at offsets 0x00 through 0x58, so bytes 0x00..0x5f are used. */
2278 uint64_t stack[0xd];
/*
 * First asm fragment (its opening, inputs and clobbers are not part of this
 * excerpt): reads 16 rows of 8 bytes starting at pix-4, transposes them in
 * two groups of 8 rows and stores six of the eight resulting columns
 * (byte offsets -3..+2 relative to pix) into stack.  Column k of rows 0..7
 * goes to offset 0x10*k, rows 8..15 to offset 0x10*k + 8.
 */
/* addr0 = 2*stride, addr1 = pix - 4 (row 0), addr2 = 3*stride. */
2283 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2284 PTR_ADDI "%[addr1], %[pix], -0x4 \n\t"
2285 PTR_ADDU "%[addr2], %[stride], %[addr0] \n\t"
/* Rows 0..6 -> ftmp0..ftmp6; each gsldlc1/gsldrc1 pair is one unaligned
 * 64-bit load.  addr4 = row 3; rows 4..6 are addressed relative to it. */
2286 "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2287 "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2288 PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2289 PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
2290 "gsldlc1 %[ftmp1], 0x07(%[addr3]) \n\t"
2291 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2292 "gsldrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2293 "gsldlc1 %[ftmp2], 0x07(%[addr5]) \n\t"
2294 "gsldrc1 %[ftmp2], 0x00(%[addr5]) \n\t"
2295 "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t"
2296 PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2297 "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t"
2298 "gsldlc1 %[ftmp4], 0x07(%[addr3]) \n\t"
2299 PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2300 "gsldrc1 %[ftmp4], 0x00(%[addr3]) \n\t"
2301 "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t"
2302 PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
2303 "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t"
2304 "gsldlc1 %[ftmp6], 0x07(%[addr3]) \n\t"
2305 "gsldrc1 %[ftmp6], 0x00(%[addr3]) \n\t"
/* addr6 = 4*stride. */
2306 PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
/* Byte interleave of row pairs (0,1), (2,3), (4,5). */
2307 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2308 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2309 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2310 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2311 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2312 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
/* Row 7 (addr4 + 4*stride) -> ftmp8.  The high half of the (2,3) pair is
 * spilled to stack+0x10 to free a register; it is reloaded below before
 * that slot receives its final column. */
2313 PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2314 "sdc1 %[ftmp1], 0x10(%[stack]) \n\t"
2315 "gsldlc1 %[ftmp8], 0x07(%[addr3]) \n\t"
2316 "gsldrc1 %[ftmp8], 0x00(%[addr3]) \n\t"
/* addr7 = 8*stride (advance to the second group of rows). */
2317 PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
/* Halfword and word interleaves complete the transpose. */
2318 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2319 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2320 "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2321 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2322 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2323 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2324 "ldc1 %[ftmp8], 0x10(%[stack]) \n\t"
/* Only the high word pair is kept here: the column at pix-3.  The column at
 * pix-4 (low word pair) is discarded. */
2325 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2326 "sdc1 %[ftmp0], 0x00(%[stack]) \n\t"
2327 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
2328 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
2329 "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
2330 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
/* Only the low word pair is kept here: the column at pix+2.  The column at
 * pix+3 is discarded. */
2331 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
2332 "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
2333 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
2334 "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
2335 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
/* Columns pix-2, pix-1, pix, pix+1, pix+2 for rows 0..7. */
2336 "sdc1 %[ftmp1], 0x10(%[stack]) \n\t"
2337 "sdc1 %[ftmp3], 0x20(%[stack]) \n\t"
2338 "sdc1 %[ftmp7], 0x30(%[stack]) \n\t"
2339 "sdc1 %[ftmp5], 0x40(%[stack]) \n\t"
2340 "sdc1 %[ftmp6], 0x50(%[stack]) \n\t"
/* Second group: advance both row bases by 8*stride and repeat the same
 * load/transpose, storing to the +8 half of every stack slot. */
2341 PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
2342 PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
2343 "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2344 PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2345 "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2346 "gsldlc1 %[ftmp1], 0x07(%[addr3]) \n\t"
2347 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2348 "gsldrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2349 "gsldlc1 %[ftmp2], 0x07(%[addr5]) \n\t"
2350 "gsldrc1 %[ftmp2], 0x00(%[addr5]) \n\t"
2351 "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t"
2352 PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2353 "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t"
2354 "gsldlc1 %[ftmp4], 0x07(%[addr3]) \n\t"
2355 PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2356 "gsldrc1 %[ftmp4], 0x00(%[addr3]) \n\t"
2357 "gsldlc1 %[ftmp5], 0x07(%[addr5]) \n\t"
2358 PTR_ADDU "%[addr3], %[addr4], %[addr2] \n\t"
2359 "gsldrc1 %[ftmp5], 0x00(%[addr5]) \n\t"
2360 "gsldlc1 %[ftmp6], 0x07(%[addr3]) \n\t"
2361 "gsldrc1 %[ftmp6], 0x00(%[addr3]) \n\t"
2362 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2363 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2364 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2365 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2366 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2367 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2368 PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
/* Temporary spill to stack+0x18, reloaded below before the slot is reused. */
2369 "sdc1 %[ftmp1], 0x18(%[stack]) \n\t"
2370 "gsldlc1 %[ftmp8], 0x07(%[addr3]) \n\t"
2371 "gsldrc1 %[ftmp8], 0x00(%[addr3]) \n\t"
2372 "punpckhhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2373 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2374 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2375 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2376 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2377 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2378 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2379 "ldc1 %[ftmp8], 0x18(%[stack]) \n\t"
2380 "sdc1 %[ftmp0], 0x08(%[stack]) \n\t"
2381 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp8] \n\t"
2382 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp8] \n\t"
2383 "punpckhhw %[ftmp0], %[ftmp3], %[ftmp5] \n\t"
2384 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
2385 "punpckhwd %[ftmp5], %[ftmp7], %[ftmp3] \n\t"
2386 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
2387 "punpckhwd %[ftmp3], %[ftmp1], %[ftmp2] \n\t"
2388 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2389 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
2390 "sdc1 %[ftmp1], 0x18(%[stack]) \n\t"
2391 "sdc1 %[ftmp3], 0x28(%[stack]) \n\t"
2392 "sdc1 %[ftmp7], 0x38(%[stack]) \n\t"
2393 "sdc1 %[ftmp5], 0x48(%[stack]) \n\t"
2394 "sdc1 %[ftmp6], 0x58(%[stack]) \n\t"
/* Output operands of the first fragment (early-clobber temporaries). */
2395 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2396 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2397 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2398 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2399 [ftmp8]
"=&f"(ftmp[8]),
2400 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2401 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2402 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2403 [addr6]
"=&r"(addr[6]), [addr7]
"=&r"(addr[7])
/*
 * Second asm fragment (whatever runs between the two fragments is not part
 * of this excerpt; presumably the columns in stack are filtered there --
 * confirm).  It reads the four columns at stack offsets 0x10..0x40 (byte
 * offsets -2..+1 relative to pix), transposes them back into rows of
 * 4 bytes and stores them at pix-2 for 16 rows.
 */
/* addr0 = 2*stride, addr1 = pix - 2 (row 0), addr6 = 4*stride,
 * addr2 = 3*stride, addr7 = 8*stride, addr4 = row 3. */
2412 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2413 PTR_ADDI "%[addr1], %[pix], -0x02 \n\t"
2414 PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
2415 PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
2416 PTR_ADDU "%[addr7], %[addr6], %[addr6] \n\t"
2417 PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
/* Columns for rows 0..7. */
2418 "ldc1 %[ftmp0], 0x10(%[stack]) \n\t"
2419 "ldc1 %[ftmp1], 0x20(%[stack]) \n\t"
2420 "ldc1 %[ftmp2], 0x30(%[stack]) \n\t"
2421 "ldc1 %[ftmp3], 0x40(%[stack]) \n\t"
/* ftmp4/ftmp5/ftmp6 = upper halves (rows 4..7) of the first three columns;
 * the lower halves are interleaved into rows 0..3. */
2422 "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2423 "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2424 "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2425 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2426 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2427 "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2428 "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
/* Rows 0..3: each gsswlc1/gsswrc1 pair is one unaligned 32-bit store; the
 * punpckhwd of a register with itself moves the next row into the low word. */
2429 "gsswlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
2430 "gsswrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
2431 PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2432 "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2433 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2434 "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t"
2435 "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2436 "gsswlc1 %[ftmp0], 0x03(%[addr5]) \n\t"
2437 "gsswrc1 %[ftmp0], 0x00(%[addr5]) \n\t"
2438 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2439 "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2440 "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t"
2441 "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
/* Rows 4..7 at addr4 + stride, + 2*stride, + 3*stride, + 4*stride. */
2442 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2443 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2444 "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2445 PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2446 "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2447 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2448 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2449 PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
2450 "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2451 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2452 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2453 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2454 "gsswlc1 %[ftmp4], 0x03(%[addr5]) \n\t"
2455 "gsswrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
2456 PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2457 "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
/* Advance both row bases by 8*stride for the second group. */
2458 PTR_ADDU "%[addr1], %[addr1], %[addr7] \n\t"
2459 "gsswlc1 %[ftmp4], 0x03(%[addr3]) \n\t"
2460 "gsswrc1 %[ftmp4], 0x00(%[addr3]) \n\t"
2461 PTR_ADDU "%[addr4], %[addr4], %[addr7] \n\t"
/* Columns for rows 8..15 (the +8 half of each stack slot). */
2462 "ldc1 %[ftmp0], 0x18(%[stack]) \n\t"
2463 "ldc1 %[ftmp1], 0x28(%[stack]) \n\t"
2464 "ldc1 %[ftmp2], 0x38(%[stack]) \n\t"
2465 "ldc1 %[ftmp3], 0x48(%[stack]) \n\t"
/* addr0 and addr6 are recomputed here with the same values as above. */
2466 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2467 "punpckhwd %[ftmp4], %[ftmp0], %[ftmp0] \n\t"
2468 PTR_ADDU "%[addr6], %[addr0], %[addr0] \n\t"
2469 "punpckhwd %[ftmp5], %[ftmp1], %[ftmp1] \n\t"
2470 "punpckhwd %[ftmp6], %[ftmp2], %[ftmp2] \n\t"
2471 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2472 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2473 PTR_ADDU "%[addr3], %[addr1], %[stride] \n\t"
2474 "punpcklhw %[ftmp1], %[ftmp0], %[ftmp2] \n\t"
2475 "punpckhhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
/* Rows 8..11. */
2476 "gsswlc1 %[ftmp1], 0x03(%[addr1]) \n\t"
2477 "gsswrc1 %[ftmp1], 0x00(%[addr1]) \n\t"
2478 "punpckhwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t"
2479 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2480 "gsswlc1 %[ftmp1], 0x03(%[addr3]) \n\t"
2481 "gsswrc1 %[ftmp1], 0x00(%[addr3]) \n\t"
2482 "gsswlc1 %[ftmp0], 0x03(%[addr5]) \n\t"
2483 "gsswrc1 %[ftmp0], 0x00(%[addr5]) \n\t"
2484 "punpckhwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2485 "punpckhwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
2486 "gsswlc1 %[ftmp0], 0x03(%[addr4]) \n\t"
2487 "gsswrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
/* Rows 12..15. */
2488 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2489 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
2490 PTR_ADDU "%[addr3], %[addr4], %[stride] \n\t"
2491 "punpcklhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t"
2492 "punpckhhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2493 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2494 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2495 PTR_ADDU "%[addr3], %[addr4], %[addr0] \n\t"
2496 "punpckhwd %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
2497 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2498 "gsswlc1 %[ftmp5], 0x03(%[addr3]) \n\t"
2499 "gsswrc1 %[ftmp5], 0x00(%[addr3]) \n\t"
2500 "gsswlc1 %[ftmp4], 0x03(%[addr5]) \n\t"
2501 "gsswrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
2502 PTR_ADDU "%[addr3], %[addr4], %[addr6] \n\t"
2503 "punpckhwd %[ftmp4], %[ftmp4], %[ftmp4] \n\t"
2504 "gsswlc1 %[ftmp4], 0x03(%[addr3]) \n\t"
2505 "gsswrc1 %[ftmp4], 0x00(%[addr3]) \n\t"
/* Output operands of the second fragment (early-clobber temporaries). */
2506 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2507 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2508 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2509 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2510 [ftmp8]
"=&f"(ftmp[8]),
2511 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2512 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2513 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2514 [addr6]
"=&r"(addr[6]), [addr7]
"=&r"(addr[7])
/*
 * Fragment of an enclosing routine whose signature and asm opener are not
 * part of this listing.
 *
 * ptmp is a 17-element (0x11) uint64_t scratch buffer. The asm body below
 * reads 16 picture rows of 8 bytes each, starting at pix - 4, and writes
 * their transpose into ptmp: the 16-byte slot at ptmp + 0x10 * c receives
 * byte column c, with rows 0-7 in its first 8 bytes and rows 8-15 in its
 * last 8 bytes. Byte offsets 0x00..0x7f of ptmp are written.
 */
2524 uint64_t ptmp[0x11];
/* Stride multiples and row pointers:
 *   addr0 = 2 * stride, addr2 = 3 * stride, addr3 = 4 * stride,
 *   addr1 = pix - 4 (row 0), addr4 = addr1 + 3 * stride (row 3).
 * addr5 / addr6 are reused as temporaries for the remaining row addresses. */
2530 PTR_ADDU "%[addr0], %[stride], %[stride] \n\t"
2531 PTR_ADDI "%[addr1], %[pix], -0x04 \n\t"
2532 PTR_ADDU "%[addr2], %[addr0], %[stride] \n\t"
2533 PTR_ADDU "%[addr3], %[addr0], %[addr0] \n\t"
2534 PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
2535 PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
/* Rows 0..6 -> ftmp0..ftmp6; each gsldlc1/gsldrc1 pair is one unaligned
 * 8-byte load. */
2536 "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2537 "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2538 PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2539 "gsldlc1 %[ftmp1], 0x07(%[addr5]) \n\t"
2540 "gsldrc1 %[ftmp1], 0x00(%[addr5]) \n\t"
2541 "gsldlc1 %[ftmp2], 0x07(%[addr6]) \n\t"
2542 "gsldrc1 %[ftmp2], 0x00(%[addr6]) \n\t"
2543 PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
2544 "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t"
2545 "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t"
2546 PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
2547 "gsldlc1 %[ftmp4], 0x07(%[addr5]) \n\t"
2548 "gsldrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
2549 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2550 "gsldlc1 %[ftmp5], 0x07(%[addr6]) \n\t"
2551 "gsldrc1 %[ftmp5], 0x00(%[addr6]) \n\t"
2552 "gsldlc1 %[ftmp6], 0x07(%[addr5]) \n\t"
2553 "gsldrc1 %[ftmp6], 0x00(%[addr5]) \n\t"
/* addr5 = address of row 7 (addr4 + 4 * stride); it is loaded into ftmp8
 * a few instructions further down. */
2554 PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
/* Transpose step 1: interleave bytes of row pairs (0,1) (2,3) (4,5) (6,7). */
2555 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2556 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2557 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2558 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2559 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2560 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2561 "gsldlc1 %[ftmp8], 0x07(%[addr5]) \n\t"
2562 "gsldrc1 %[ftmp8], 0x00(%[addr5]) \n\t"
2563 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2564 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
/* Only nine FP temporaries are available, so ptmp + 0x00 and ptmp + 0x20
 * serve as spill slots here; both are overwritten with final column data
 * before this pass ends. */
2565 "sdc1 %[ftmp3], 0x00(%[ptmp]) \n\t"
/* Transpose step 2: interleave halfwords. */
2566 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2567 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2568 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2569 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2570 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2571 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2572 "sdc1 %[ftmp2], 0x20(%[ptmp]) \n\t"
2573 "ldc1 %[ftmp2], 0x00(%[ptmp]) \n\t"
2574 "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2575 "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
/* Transpose step 3: interleave words, giving complete 8-byte columns. */
2576 "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2577 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2578 "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2579 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
/* Columns 0, 1, 4, 5 of rows 0-7 -> low halves of slots 0x00/0x10/0x40/0x50. */
2580 "sdc1 %[ftmp0], 0x00(%[ptmp]) \n\t"
2581 "sdc1 %[ftmp5], 0x10(%[ptmp]) \n\t"
2582 "sdc1 %[ftmp7], 0x40(%[ptmp]) \n\t"
2583 "sdc1 %[ftmp4], 0x50(%[ptmp]) \n\t"
2584 "ldc1 %[ftmp8], 0x20(%[ptmp]) \n\t"
2585 "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2586 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2587 "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2588 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
/* addr5 = 8 * stride, used below to step both row pointers to rows 8-15. */
2589 PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
/* Columns 2, 3, 6, 7 of rows 0-7 -> low halves of slots 0x20/0x30/0x60/0x70. */
2590 "sdc1 %[ftmp3], 0x20(%[ptmp]) \n\t"
2591 "sdc1 %[ftmp0], 0x30(%[ptmp]) \n\t"
2592 "sdc1 %[ftmp6], 0x60(%[ptmp]) \n\t"
2593 "sdc1 %[ftmp5], 0x70(%[ptmp]) \n\t"
/* Second pass: the same load/transpose sequence for rows 8-15; results go
 * to the high halves of the slots (offsets 0x08, 0x18, ... 0x78), with
 * ptmp + 0x08 and ptmp + 0x28 as the temporary spill slots. */
2594 PTR_ADDU "%[addr1], %[addr1], %[addr5] \n\t"
2595 PTR_ADDU "%[addr4], %[addr4], %[addr5] \n\t"
2596 PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2597 "gsldlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2598 "gsldrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2599 PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2600 "gsldlc1 %[ftmp1], 0x07(%[addr5]) \n\t"
2601 "gsldrc1 %[ftmp1], 0x00(%[addr5]) \n\t"
2602 "gsldlc1 %[ftmp2], 0x07(%[addr6]) \n\t"
2603 "gsldrc1 %[ftmp2], 0x00(%[addr6]) \n\t"
2604 PTR_ADDU "%[addr5], %[addr4], %[stride] \n\t"
2605 "gsldlc1 %[ftmp3], 0x07(%[addr4]) \n\t"
2606 "gsldrc1 %[ftmp3], 0x00(%[addr4]) \n\t"
2607 PTR_ADDU "%[addr6], %[addr4], %[addr0] \n\t"
2608 "gsldlc1 %[ftmp4], 0x07(%[addr5]) \n\t"
2609 "gsldrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
2610 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2611 "gsldlc1 %[ftmp5], 0x07(%[addr6]) \n\t"
2612 "gsldrc1 %[ftmp5], 0x00(%[addr6]) \n\t"
2613 "gsldlc1 %[ftmp6], 0x07(%[addr5]) \n\t"
2614 "gsldrc1 %[ftmp6], 0x00(%[addr5]) \n\t"
2615 PTR_ADDU "%[addr5], %[addr4], %[addr3] \n\t"
2616 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2617 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2618 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2619 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2620 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2621 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2622 "gsldlc1 %[ftmp8], 0x07(%[addr5]) \n\t"
2623 "gsldrc1 %[ftmp8], 0x00(%[addr5]) \n\t"
2624 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2625 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2626 "sdc1 %[ftmp3], 0x08(%[ptmp]) \n\t"
2627 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2628 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2629 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2630 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2631 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2632 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2633 "sdc1 %[ftmp2], 0x28(%[ptmp]) \n\t"
2634 "ldc1 %[ftmp2], 0x08(%[ptmp]) \n\t"
2635 "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2636 "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2637 "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2638 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2639 "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2640 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2641 "sdc1 %[ftmp0], 0x08(%[ptmp]) \n\t"
2642 "sdc1 %[ftmp5], 0x18(%[ptmp]) \n\t"
2643 "sdc1 %[ftmp7], 0x48(%[ptmp]) \n\t"
2644 "sdc1 %[ftmp4], 0x58(%[ptmp]) \n\t"
2645 "ldc1 %[ftmp8], 0x28(%[ptmp]) \n\t"
2646 "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2647 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2648 "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2649 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
2650 "sdc1 %[ftmp3], 0x28(%[ptmp]) \n\t"
2651 "sdc1 %[ftmp0], 0x38(%[ptmp]) \n\t"
2652 "sdc1 %[ftmp6], 0x68(%[ptmp]) \n\t"
2653 "sdc1 %[ftmp5], 0x78(%[ptmp]) \n\t"
/* Save address state to pdat for later reuse: addr1 (now pix - 4 +
 * 8 * stride), addr2 (3 * stride), addr0 (2 * stride), addr3 (4 * stride). */
2654 PTR_S "%[addr1], 0x00(%[pdat]) \n\t"
2655 PTR_S "%[addr2], 0x08(%[pdat]) \n\t"
2656 PTR_S "%[addr0], 0x10(%[pdat]) \n\t"
2657 PTR_S "%[addr3], 0x18(%[pdat]) \n\t"
/* Outputs: every FP and address temporary is an early-clobber scratch
 * operand. */
2658 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2659 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2660 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2661 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2662 [ftmp8]
"=&f"(ftmp[8]),
2663 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2664 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2665 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2666 [addr6]
"=&r"(addr[6])
/* Inputs visible here: ptmp and pdat. The pix and stride operands used above
 * are bound on listing lines that are not shown.
 * NOTE(review): the asm stores through ptmp and pdat although both are plain
 * "r" inputs; confirm the clobber list (not visible here) includes "memory". */
2668 [ptmp]
"r"(ptmp), [pdat]
"r"(pdat)
/*
 * Fragment: body of a second inline-asm statement (its opener and the
 * enclosing signature are not part of this listing).
 *
 * It reads the eight 16-byte slots of ptmp, transposes them back into rows
 * and writes 16 rows of 8 bytes to the picture with unaligned stores. The
 * high slot halves (offsets 0x08..0x78) are written first, then the row
 * pointers are moved back and the low halves (0x00..0x70) are written.
 *
 * Reload the address state from pdat, at the same offsets the preceding asm
 * fragment stored it: addr1 = row pointer, addr2, addr0, addr3 = stride
 * multiples. NOTE(review): this assumes pdat is unchanged since that store,
 * i.e. addr0 = 2 * stride, addr2 = 3 * stride, addr3 = 4 * stride -- confirm
 * against the code between the two asm statements.
 */
2675 PTR_L "%[addr1], 0x00(%[pdat]) \n\t"
2676 PTR_L "%[addr2], 0x08(%[pdat]) \n\t"
2677 PTR_L "%[addr0], 0x10(%[pdat]) \n\t"
2678 PTR_L "%[addr3], 0x18(%[pdat]) \n\t"
/* addr4 = addr1 + addr2: the row three strides below addr1 (under the
 * assumption above). */
2679 PTR_ADDU "%[addr4], %[addr1], %[addr2] \n\t"
/* High halves of slots 0..6 -> ftmp0..ftmp6; slot 7 is loaded into ftmp8
 * further down. */
2680 "ldc1 %[ftmp0], 0x08(%[ptmp]) \n\t"
2681 "ldc1 %[ftmp1], 0x18(%[ptmp]) \n\t"
2682 "ldc1 %[ftmp2], 0x28(%[ptmp]) \n\t"
2683 "ldc1 %[ftmp3], 0x38(%[ptmp]) \n\t"
2684 "ldc1 %[ftmp4], 0x48(%[ptmp]) \n\t"
2685 "ldc1 %[ftmp5], 0x58(%[ptmp]) \n\t"
2686 "ldc1 %[ftmp6], 0x68(%[ptmp]) \n\t"
/* Transpose step 1: interleave bytes of slot pairs. */
2687 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2688 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2689 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2690 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2691 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2692 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2693 "ldc1 %[ftmp8], 0x78(%[ptmp]) \n\t"
2694 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2695 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
/* Spill: the destination rows at addr1 and addr1 + addr0 are used as
 * temporary storage for ftmp3 / ftmp2 (gssdlc1/gssdrc1 = unaligned 8-byte
 * store). Both locations are reloaded below and later overwritten with
 * their final row data. */
2696 "gssdlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
2697 "gssdrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
2698 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
/* Transpose step 2: interleave halfwords. */
2699 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2700 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2701 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2702 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2703 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2704 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2705 "gssdlc1 %[ftmp2], 0x07(%[addr5]) \n\t"
2706 "gssdrc1 %[ftmp2], 0x00(%[addr5]) \n\t"
2707 "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
2708 "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
2709 "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2710 "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
/* Transpose step 3: interleave words, giving complete 8-byte rows. */
2711 "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2712 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2713 "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2714 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
/* Final stores, in the order the rows become available:
 * addr1, addr1 + stride, addr4 + stride, addr4 + addr0. */
2715 PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2716 "gssdlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2717 "gssdrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2718 PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
2719 "gssdlc1 %[ftmp5], 0x07(%[addr5]) \n\t"
2720 "gssdrc1 %[ftmp5], 0x00(%[addr5]) \n\t"
2721 PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2722 "gssdlc1 %[ftmp7], 0x07(%[addr6]) \n\t"
2723 "gssdrc1 %[ftmp7], 0x00(%[addr6]) \n\t"
2724 PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2725 "gssdlc1 %[ftmp4], 0x07(%[addr5]) \n\t"
2726 "gssdrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
/* Reload the second spilled value from addr1 + addr0 before that row is
 * overwritten with its final contents. */
2727 "gsldlc1 %[ftmp8], 0x07(%[addr6]) \n\t"
2728 "gsldrc1 %[ftmp8], 0x00(%[addr6]) \n\t"
2729 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2730 "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2731 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2732 "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2733 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
/* Remaining rows: addr1 + addr0, addr4, addr4 + addr2, addr4 + addr3. */
2734 "gssdlc1 %[ftmp3], 0x07(%[addr5]) \n\t"
2735 "gssdrc1 %[ftmp3], 0x00(%[addr5]) \n\t"
2736 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2737 "gssdlc1 %[ftmp0], 0x07(%[addr4]) \n\t"
2738 "gssdrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
2739 PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
2740 "gssdlc1 %[ftmp6], 0x07(%[addr5]) \n\t"
2741 "gssdrc1 %[ftmp6], 0x00(%[addr5]) \n\t"
/* addr5 = 2 * addr3; subtracting it moves addr1 and addr4 back to the
 * preceding group of rows for the second pass. */
2742 PTR_ADDU "%[addr5], %[addr3], %[addr3] \n\t"
2743 "gssdlc1 %[ftmp5], 0x07(%[addr6]) \n\t"
2744 "gssdrc1 %[ftmp5], 0x00(%[addr6]) \n\t"
2745 PTR_SUBU "%[addr1], %[addr1], %[addr5] \n\t"
2746 PTR_SUBU "%[addr4], %[addr4], %[addr5] \n\t"
/* Second pass: the same transpose/store sequence for the low slot halves
 * (offsets 0x00, 0x10, ... 0x70). */
2747 "ldc1 %[ftmp0], 0x00(%[ptmp]) \n\t"
2748 "ldc1 %[ftmp1], 0x10(%[ptmp]) \n\t"
2749 "ldc1 %[ftmp2], 0x20(%[ptmp]) \n\t"
2750 "ldc1 %[ftmp3], 0x30(%[ptmp]) \n\t"
2751 "ldc1 %[ftmp4], 0x40(%[ptmp]) \n\t"
2752 "ldc1 %[ftmp5], 0x50(%[ptmp]) \n\t"
2753 "ldc1 %[ftmp6], 0x60(%[ptmp]) \n\t"
2754 "punpckhbh %[ftmp7], %[ftmp0], %[ftmp1] \n\t"
2755 "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
2756 "punpckhbh %[ftmp1], %[ftmp2], %[ftmp3] \n\t"
2757 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp3] \n\t"
2758 "punpckhbh %[ftmp3], %[ftmp4], %[ftmp5] \n\t"
2759 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
2760 "ldc1 %[ftmp8], 0x70(%[ptmp]) \n\t"
2761 "punpckhbh %[ftmp5], %[ftmp6], %[ftmp8] \n\t"
2762 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
2763 "gssdlc1 %[ftmp3], 0x07(%[addr1]) \n\t"
2764 "gssdrc1 %[ftmp3], 0x00(%[addr1]) \n\t"
2765 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2766 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t"
2767 "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t"
2768 "punpckhhw %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
2769 "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
2770 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp1] \n\t"
2771 "punpcklhw %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
2772 "gssdlc1 %[ftmp2], 0x07(%[addr5]) \n\t"
2773 "gssdrc1 %[ftmp2], 0x00(%[addr5]) \n\t"
2774 "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t"
2775 "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t"
2776 "punpckhhw %[ftmp1], %[ftmp2], %[ftmp5] \n\t"
2777 "punpcklhw %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
2778 "punpckhwd %[ftmp5], %[ftmp0], %[ftmp4] \n\t"
2779 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t"
2780 "punpckhwd %[ftmp4], %[ftmp7], %[ftmp2] \n\t"
2781 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp2] \n\t"
2782 PTR_ADDU "%[addr5], %[addr1], %[stride] \n\t"
2783 "gssdlc1 %[ftmp0], 0x07(%[addr1]) \n\t"
2784 "gssdrc1 %[ftmp0], 0x00(%[addr1]) \n\t"
2785 PTR_ADDU "%[addr6], %[addr4], %[stride] \n\t"
2786 "gssdlc1 %[ftmp5], 0x07(%[addr5]) \n\t"
2787 "gssdrc1 %[ftmp5], 0x00(%[addr5]) \n\t"
2788 PTR_ADDU "%[addr5], %[addr4], %[addr0] \n\t"
2789 "gssdlc1 %[ftmp7], 0x07(%[addr6]) \n\t"
2790 "gssdrc1 %[ftmp7], 0x00(%[addr6]) \n\t"
2791 PTR_ADDU "%[addr6], %[addr1], %[addr0] \n\t"
2792 "gssdlc1 %[ftmp4], 0x07(%[addr5]) \n\t"
2793 "gssdrc1 %[ftmp4], 0x00(%[addr5]) \n\t"
2794 "gsldlc1 %[ftmp8], 0x07(%[addr6]) \n\t"
2795 "gsldrc1 %[ftmp8], 0x00(%[addr6]) \n\t"
2796 PTR_ADDU "%[addr5], %[addr1], %[addr0] \n\t"
2797 "punpckhwd %[ftmp0], %[ftmp3], %[ftmp8] \n\t"
2798 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp8] \n\t"
2799 "punpckhwd %[ftmp5], %[ftmp6], %[ftmp1] \n\t"
2800 "punpcklwd %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
2801 "gssdlc1 %[ftmp3], 0x07(%[addr5]) \n\t"
2802 "gssdrc1 %[ftmp3], 0x00(%[addr5]) \n\t"
2803 PTR_ADDU "%[addr5], %[addr4], %[addr2] \n\t"
2804 "gssdlc1 %[ftmp0], 0x07(%[addr4]) \n\t"
2805 "gssdrc1 %[ftmp0], 0x00(%[addr4]) \n\t"
2806 PTR_ADDU "%[addr6], %[addr4], %[addr3] \n\t"
2807 "gssdlc1 %[ftmp6], 0x07(%[addr5]) \n\t"
2808 "gssdrc1 %[ftmp6], 0x00(%[addr5]) \n\t"
2809 "gssdlc1 %[ftmp5], 0x07(%[addr6]) \n\t"
2810 "gssdrc1 %[ftmp5], 0x00(%[addr6]) \n\t"
/* Outputs: every FP and address temporary is an early-clobber scratch
 * operand. */
2811 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2812 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2813 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2814 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2815 [ftmp8]
"=&f"(ftmp[8]),
2816 [addr0]
"=&r"(addr[0]), [addr1]
"=&r"(addr[1]),
2817 [addr2]
"=&r"(addr[2]), [addr3]
"=&r"(addr[3]),
2818 [addr4]
"=&r"(addr[4]), [addr5]
"=&r"(addr[5]),
2819 [addr6]
"=&r"(addr[6])
/* Inputs visible here: ptmp and pdat. The stride operand used above is bound
 * on a listing line that is not shown.
 * NOTE(review): the asm writes picture memory through addresses loaded from
 * pdat; confirm the clobber list (not visible here) includes "memory". */
2821 [ptmp]
"r"(ptmp), [pdat]
"r"(pdat)
void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)
void ff_h264_idct_add16_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
MIPS assembly defines from sys/asm.h but rewritten for use with C inline assembly (rather than from w...
void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
#define DECLARE_ALIGNED(n, t, v)
void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_h264_idct_add8_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input, int qmul)
void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)
void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
static double alpha(void *priv, double x, double y)
void ff_h264_idct8_add4_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_h264_idct_add16intra_8_mmi(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15 *8])
void ff_h264_chroma422_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
static const uint8_t offset[127][2]
static const uint8_t scan8[16 *3+3]
void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha, int beta)
void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, int weightd, int weights, int offset)
static int weight(int i, int blen, int offset)
void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
GLint GLenum GLboolean GLsizei stride
void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height, int log2_denom, int weight, int offset)
static int16_t block1[64]
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(constint16_t *) pi >>8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(constint32_t *) pi >>24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(constfloat *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(constfloat *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(constfloat *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(constdouble *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(constdouble *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(constdouble *) pi *(1U<< 31))))#defineSET_CONV_FUNC_GROUP(ofmt, ifmt) staticvoidset_generic_function(AudioConvert *ac){}voidff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enumAVSampleFormatout_fmt, enumAVSampleFormatin_fmt, intchannels, intsample_rate, 
intapply_map){AudioConvert *ac;intin_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) returnNULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt)>2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);returnNULL;}returnac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}elseif(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;elseac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);returnac;}intff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){intuse_generic=1;intlen=in->nb_samples;intp;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%dsamples-audio_convert:%sto%s(dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));returnff_convert_dither(ac-> dc