/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1 double db_1
#define DECLARE_DOUBLE_2 double db_2
#define DECLARE_UINT32_T uint32_t it_1
#define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)

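/* MMI lacks an unsigned byte compare-greater-than, so synthesize one:
 * (pmaxub(a, b) == a) holds wherever a >= b (unsigned); xoring with the
 * (a == b) mask clears the equal lanes, leaving 0xff exactly where
 * a > b. */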
#define MMI_PCMPGTUB(dst, src1, src2) \
    "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
    "pmaxub %[db_2], "#src1", "#src2" \n\t" \
    "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
    "xor "#dst", %[db_2], %[db_1] \n\t"

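/* Sign-extend the eight bytes in src to two vectors of four halfwords:
 * pcmpgtb against zero yields 0xff in every negative lane, which
 * punpck{l,h}bh then interleave as the sign bytes (low half into dst_r,
 * high half into dst_l). */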
#define MMI_BTOH(dst_l, dst_r, src) \
    "xor %[db_1], %[db_1], %[db_1] \n\t" \
    "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
    "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
    "punpckhbh "#dst_l", "#src", %[db_2] \n\t"

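/* Vector version of the scalar loop-filter logic further below: "hev" is
 * the high-edge-variance mask (|p1 - p0| > thresh || |q1 - q0| > thresh,
 * cf. hev()), "mask" the normal filter condition (2 * |p0 - q0| +
 * |p1 - q1| / 2 <= E and all neighbouring differences <= I, cf.
 * vp8_normal_limit()), and the VP8_MBFILTER part applies the macroblock
 * edge filter of filter_mbedge() to eight pixels at a time. */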
#define MMI_VP8_LOOP_FILTER \
    /* Calculation of hev */ \
    "dmtc1 %[thresh], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
    "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
    "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
    MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
    /* Calculation of mask */ \
    "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
    "li %[tmp0], 0x09 \n\t" \
    "dmtc1 %[tmp0], %[ftmp3] \n\t" \
    PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
    "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "dmtc1 %[e], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
    "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
    "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
    "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
    "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
    "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
    "dmtc1 %[i], %[ftmp3] \n\t" \
    "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
    "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
    "xor %[mask], %[mask], %[ftmp3] \n\t" \
    /* VP8_MBFILTER */ \
    "li %[tmp0], 0x80808080 \n\t" \
    "dmtc1 %[tmp0], %[ftmp7] \n\t" \
    "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
    "xor %[p2], %[p2], %[ftmp7] \n\t" \
    "xor %[p1], %[p1], %[ftmp7] \n\t" \
    "xor %[p0], %[p0], %[ftmp7] \n\t" \
    "xor %[q0], %[q0], %[ftmp7] \n\t" \
    "xor %[q1], %[q1], %[ftmp7] \n\t" \
    "xor %[q2], %[q2], %[ftmp7] \n\t" \
    "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
    "psubb %[ftmp5], %[q0], %[p0] \n\t" \
    MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
    MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
    /* Right part */ \
    "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
    "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
    "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
    /* Left part */ \
    "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
    "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
    "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
    "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
    "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
    "li %[tmp0], 0x04040404 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp4] \n\t" \
    PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
    "li %[tmp0], 0x03030303 \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
    "li %[tmp0], 0x0B \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
    "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    /* filt_val &= ~hev */ \
    "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    "xor %[hev], %[hev], %[ftmp0] \n\t" \
    "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
    MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
    "li %[tmp0], 0x07 \n\t" \
    "dmtc1 %[tmp0], %[ftmp2] \n\t" \
    "li %[tmp0], 0x001b001b \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    "li %[tmp0], 0x003f003f \n\t" \
    "dmtc1 %[tmp0], %[ftmp0] \n\t" \
    "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
    "xor %[q0], %[q0], %[ftmp7] \n\t" \
    "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
    "xor %[p0], %[p0], %[ftmp7] \n\t" \
    "li %[tmp0], 0x00120012 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
    /* Right part */ \
    "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
    "xor %[q1], %[q1], %[ftmp7] \n\t" \
    "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
    "xor %[p1], %[p1], %[ftmp7] \n\t" \
    "li %[tmp0], 0x03 \n\t" \
    "dmtc1 %[tmp0], %[ftmp1] \n\t" \
    /* Right part */ \
    "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
    "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    /* Left part */ \
    "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
    "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
    "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
    /* Combine left and right part */ \
    "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
    "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
    "xor %[q2], %[q2], %[ftmp7] \n\t" \
    "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
    "xor %[p2], %[p2], %[ftmp7] \n\t"

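/* 6-tap horizontal filter over four pixels, computed in 16-bit lanes; per
 * output pixel this is the scalar FILTER_6TAP() expression from the
 * reference code below, i.e.
 *   dst[x] = cm[(F[2] * src[x]     - F[1] * src[x - 1] +
 *                F[0] * src[x - 2] + F[3] * src[x + 1] -
 *                F[4] * src[x + 2] + F[5] * src[x + 3] + 64) >> 7]
 * with the rounding bias ff_pw_64 and the shift amount (7) in %[ftmp4]
 * supplied by the caller. */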
#define PUT_VP8_EPEL4_H6_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


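/* Same scheme with the 4-tap kernel (the outermost taps F[0] and F[5]
 * are zero), cf. FILTER_4TAP() below:
 *   dst[x] = cm[(F[2] * src[x] - F[1] * src[x - 1] +
 *                F[3] * src[x + 1] - F[4] * src[x + 2] + 64) >> 7] */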
#define PUT_VP8_EPEL4_H4_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H6_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x03) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_H4_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, -0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x02) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
    "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
    \
    PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
    "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
    \
    PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
    "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    \
    MMI_SDC1(%[ftmp1], dst, 0x00)


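/* Bilinear motion compensation: %[a]/%[b] are expected to hold the
 * horizontal weights 8 - mx and mx (and %[c]/%[d] the vertical weights
 * 8 - my and my), so each output pixel is
 * (a * src[x] + b * src[x + 1] + 4) >> 3 (and likewise along columns),
 * with the shift amount (3) preloaded into %[ftmp4]. */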
#define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
    \
    MMI_ULDC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
    \
    MMI_ULWC1(%[ftmp1], src, 0x01) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
    MMI_ULDC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
    "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULDC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
    "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
    \
    "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
    "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
    "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
    "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
    MMI_SDC1(%[ftmp1], dst, 0x00)


#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
    MMI_ULWC1(%[ftmp1], src, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
    \
    PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
    MMI_ULWC1(%[ftmp1], src1, 0x00) \
    "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
    "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
    "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
    \
    "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
    "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    \
    "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
    MMI_SWC1(%[ftmp1], dst, 0x00)


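/* Each row replicates one line of subpel_filters[] (kept for reference in
 * the #if 0 block below) into four 16-bit lanes, e.g. 123 becomes
 * 0x007b007b007b007b, so one pmullh applies a filter tap to four pixels
 * at once. */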
DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
    {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
     0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

    {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
     0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

    {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
     0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

    {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
     0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

    {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
     0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

    {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
     0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

    {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
     0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};

#if 0
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride) \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a) (((a) * 35468) >> 16)
#endif

#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
                                                      ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
                                                         ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
                                             int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 = (9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}
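/* Note: 27, 18, 9 and the +63 bias are the same constants the
 * MMI_VP8_LOOP_FILTER macro above loads as 0x001b001b, 0x00120012,
 * (w << 3) + w and 0x003f003f, followed by psrah with 7. */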

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
                                             int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
        "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
        "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
        "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
        PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
        PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
        "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
        "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
        "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
        PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
        "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
        "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
        PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
        "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
        "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
        : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

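/* Inverse Walsh-Hadamard transform of the 16 luma DC coefficients: the MMI
 * block below performs only the first (vertical) butterfly stage on dc[];
 * the horizontal stage and the (x + 3) >> 3 rounding are done in the scalar
 * code that follows, which also scatters the results to block[i][j][0]. */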
void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
        "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
        "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
        "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
        "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
        "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
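    /* pmulhh keeps the high 16 bits of the product, so MUL_20091(x) =
     * ((x * 20091) >> 16) + x uses 0x4e7b = 20091 directly, while
     * MUL_35468(x) = (x * 35468) >> 16 is computed as
     * ((x << 2) * 8867) >> 16 with 0x22a3 = 8867 = 35468 / 4 (hence the
     * psllh by 2 below). */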
    double ftmp[12];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li %[tmp0], 0x02 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"

        // block[0...3] + block[8...11]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // block[0...3] - block[8...11]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // MUL_35468(block[12...15])
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_35468(block[4...7])
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
        // MUL_20091(block[4...7])
        "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
        // MUL_20091(block[12...15])
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"

        // tmp[0 4 8 12]
        "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
        // tmp[1 5 9 13]
        "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
        "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
        // tmp[2 6 10 14]
        "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
        // tmp[3 7 11 15]
        "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
        "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4 8 12]
        "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
        // t[1 5 9 13]
        "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
        // t[2 6 10 14]
        "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
        "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
        "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
        // t[3 7 11 15]
        "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
        "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
        "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
        "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp11] \n\t"
        "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
        "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
        "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
        "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
        "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
        "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
        "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
          [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

    __asm__ volatile (
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "mtc1 %[dc], %[ftmp5] \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst), [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
                              ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
                                int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
                                int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
                                 int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
                                 int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}
// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
                                      int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
                                      int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                             ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

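    /* Copies two 16-byte rows per iteration using unaligned doubleword
     * loads/stores, so it relies on the caller passing an even h. */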
    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x0f(%[src]) \n\t"
        "ldr %[tmp0], 0x08(%[src]) \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
        "ldr %[tmp1], 0x08(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "sdl %[tmp0], 0x0f(%[dst]) \n\t"
        "sdr %[tmp0], 0x08(%[dst]) \n\t"
        "addiu %[h], %[h], -0x02 \n\t"
        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
        "sdr %[tmp1], 0x08(%[addr1]) \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 16);
#endif
}

void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
                            ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1: \n\t"
        PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl %[tmp0], 0x07(%[addr0]) \n\t"
        "ldr %[tmp0], 0x00(%[addr0]) \n\t"
        PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu %[h], %[h], -0x02 \n\t"
        "sdl %[tmp0], 0x07(%[addr1]) \n\t"
        "sdr %[tmp0], 0x00(%[addr1]) \n\t"
        PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
        PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
        "bnez %[h], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst), [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}
1549 
1550 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1551  ptrdiff_t srcstride, int h, int x, int y)
1552 {
1553 #if 1
1554  double ftmp[1];
1555  uint64_t tmp[1];
1556  mips_reg addr[2];
1557  DECLARE_VAR_LOW32;
1558 
1559  __asm__ volatile (
1560  "1: \n\t"
1561  PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1562  MMI_LWC1(%[ftmp0], %[src], 0x00)
1563  "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1564  "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1565  PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1566  MMI_SWC1(%[ftmp0], %[dst], 0x00)
1567  "addiu %[h], %[h], -0x02 \n\t"
1568  "swl %[tmp0], 0x03(%[addr1]) \n\t"
1569  "swr %[tmp0], 0x00(%[addr1]) \n\t"
1570  PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1571  PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1572  "bnez %[h], 1b \n\t"
1573  : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1574  RESTRICT_ASM_LOW32
1575  [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1576  [dst]"+&r"(dst), [src]"+&r"(src),
1577  [h]"+&r"(h)
1578  : [dststride]"r"((mips_reg)dststride),
1579  [srcstride]"r"((mips_reg)srcstride)
1580  : "memory"
1581  );
1582 #else
1583  int i;
1584 
1585  for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1586  memcpy(dst, src, 4);
1587 #endif
1588 }
1589 
1590 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1591  ptrdiff_t srcstride, int h, int mx, int my)
1592 {
1593 #if 1
1594  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1595  double ftmp[9];
1596  uint32_t tmp[1];
1597  mips_reg src1, dst1;
1598  DECLARE_VAR_ALL64;
1599 
1600  /*
1601  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1602  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1603  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1604  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1605  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1606  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1607  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1608  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1609 
1610  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1611  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1612  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1613  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1614  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1615  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1616  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1617  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
1618  */
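 /*
  * fourtap_subpel_filters holds each tap replicated across the four 16-bit
  * lanes of a 64-bit word (e.g. 0x000c000c000c000c), so filter[1]..filter[4]
  * can be bound directly as FP operands and multiplied against unpacked
  * pixels inside PUT_VP8_EPEL8_H4_MMI; ftmp4 carries the final >> 7 shift
  * count and ff_pw_64 the +64 rounding bias from the formula above.
  */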
1619  __asm__ volatile (
1620  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1621  "li %[tmp0], 0x07 \n\t"
1622  "mtc1 %[tmp0], %[ftmp4] \n\t"
1623 
1624  "1: \n\t"
1625  // 0 - 7
1626  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1627  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1628  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1629  // 8 - 15
1630  PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1631 
1632  "addiu %[h], %[h], -0x01 \n\t"
1633  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1634  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1635  "bnez %[h], 1b \n\t"
1636  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1637  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1638  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1639  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1640  [ftmp8]"=&f"(ftmp[8]),
1641  [tmp0]"=&r"(tmp[0]),
1642  RESTRICT_ASM_ALL64
1643  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1644  [h]"+&r"(h),
1645  [dst]"+&r"(dst), [src]"+&r"(src)
1646  : [ff_pw_64]"f"(ff_pw_64),
1647  [srcstride]"r"((mips_reg)srcstride),
1648  [dststride]"r"((mips_reg)dststride),
1649  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1650  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1651  : "memory"
1652  );
1653 #else
1654  const uint8_t *filter = subpel_filters[mx - 1];
1655  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1656  int x, y;
1657 
1658  for (y = 0; y < h; y++) {
1659  for (x = 0; x < 16; x++)
1660  dst[x] = FILTER_4TAP(src, filter, 1);
1661  dst += dststride;
1662  src += srcstride;
1663  }
1664 #endif
1665 }
1666 
1667 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1668  ptrdiff_t srcstride, int h, int mx, int my)
1669 {
1670 #if 1
1671  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1672  double ftmp[9];
1673  uint32_t tmp[1];
1674  DECLARE_VAR_ALL64;
1675 
1676  /*
1677  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1678  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1679  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1680  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1681  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1682  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1683  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1684  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1685  */
1686  __asm__ volatile (
1687  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1688  "li %[tmp0], 0x07 \n\t"
1689  "mtc1 %[tmp0], %[ftmp4] \n\t"
1690 
1691  "1: \n\t"
1692  PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1693 
1694  "addiu %[h], %[h], -0x01 \n\t"
1695  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1696  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1697  "bnez %[h], 1b \n\t"
1698  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1699  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1700  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1701  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1702  [ftmp8]"=&f"(ftmp[8]),
1703  [tmp0]"=&r"(tmp[0]),
1704  RESTRICT_ASM_ALL64
1705  [h]"+&r"(h),
1706  [dst]"+&r"(dst), [src]"+&r"(src)
1707  : [ff_pw_64]"f"(ff_pw_64),
1708  [srcstride]"r"((mips_reg)srcstride),
1709  [dststride]"r"((mips_reg)dststride),
1710  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1711  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1712  : "memory"
1713  );
1714 #else
1715  const uint8_t *filter = subpel_filters[mx - 1];
1716  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1717  int x, y;
1718 
1719  for (y = 0; y < h; y++) {
1720  for (x = 0; x < 8; x++)
1721  dst[x] = FILTER_4TAP(src, filter, 1);
1722  dst += dststride;
1723  src += srcstride;
1724  }
1725 #endif
1726 }
1727 
1728 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1729  ptrdiff_t srcstride, int h, int mx, int my)
1730 {
1731 #if 1
1732  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1733  double ftmp[6];
1734  uint32_t tmp[1];
1735  DECLARE_VAR_LOW32;
1736 
1737  /*
1738  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1739  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1740  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1741  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1742  */
1743  __asm__ volatile (
1744  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1745  "li %[tmp0], 0x07 \n\t"
1746  "mtc1 %[tmp0], %[ftmp4] \n\t"
1747 
1748  "1: \n\t"
1749  PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1750 
1751  "addiu %[h], %[h], -0x01 \n\t"
1752  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1753  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1754  "bnez %[h], 1b \n\t"
1755  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1756  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1757  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1758  [tmp0]"=&r"(tmp[0]),
1759  RESTRICT_ASM_LOW32
1760  [h]"+&r"(h),
1761  [dst]"+&r"(dst), [src]"+&r"(src)
1762  : [ff_pw_64]"f"(ff_pw_64),
1763  [srcstride]"r"((mips_reg)srcstride),
1764  [dststride]"r"((mips_reg)dststride),
1765  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1766  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
1767  : "memory"
1768  );
1769 #else
1770  const uint8_t *filter = subpel_filters[mx - 1];
1771  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1772  int x, y;
1773 
1774  for (y = 0; y < h; y++) {
1775  for (x = 0; x < 4; x++)
1776  dst[x] = FILTER_4TAP(src, filter, 1);
1777  dst += dststride;
1778  src += srcstride;
1779  }
1780 #endif
1781 }
1782 
1783 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1784  ptrdiff_t srcstride, int h, int mx, int my)
1785 {
1786 #if 1
1787  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1788  double ftmp[9];
1789  uint32_t tmp[1];
1790  mips_reg src1, dst1;
1791  DECLARE_VAR_ALL64;
1792 
1793  /*
1794  dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1795  dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1796  dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1797  dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1798  dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1799  dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1800  dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1801  dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1802 
1803  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1804  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1805  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1806  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1807  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1808  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1809  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1810  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1811  */
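 /*
  * Same scheme as the 4-tap version, with the window widened by one tap on
  * each side (src[-2] and src[3], weighted by filter[0] and filter[5]). In
  * the coefficient table the outer taps are zero for the subpel positions
  * that are routed to the 4-tap functions instead.
  */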
1812  __asm__ volatile (
1813  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1814  "li %[tmp0], 0x07 \n\t"
1815  "mtc1 %[tmp0], %[ftmp4] \n\t"
1816 
1817  "1: \n\t"
1818  // 0 - 7
1819  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1820  PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1821  PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1822  // 8 - 15
1823  PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1824 
1825  "addiu %[h], %[h], -0x01 \n\t"
1826  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1827  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1828  "bnez %[h], 1b \n\t"
1829  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1830  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1831  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1832  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1833  [ftmp8]"=&f"(ftmp[8]),
1834  [tmp0]"=&r"(tmp[0]),
1835  RESTRICT_ASM_ALL64
1836  [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1837  [h]"+&r"(h),
1838  [dst]"+&r"(dst), [src]"+&r"(src)
1839  : [ff_pw_64]"f"(ff_pw_64),
1840  [srcstride]"r"((mips_reg)srcstride),
1841  [dststride]"r"((mips_reg)dststride),
1842  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1843  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1844  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1845  : "memory"
1846  );
1847 #else
1848  const uint8_t *filter = subpel_filters[mx - 1];
1849  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1850  int x, y;
1851 
1852  for (y = 0; y < h; y++) {
1853  for (x = 0; x < 16; x++)
1854  dst[x] = FILTER_6TAP(src, filter, 1);
1855  dst += dststride;
1856  src += srcstride;
1857  }
1858 #endif
1859 }
1860 
1861 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1862  ptrdiff_t srcstride, int h, int mx, int my)
1863 {
1864 #if 1
1865  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1866  double ftmp[9];
1867  uint32_t tmp[1];
1868  DECLARE_VAR_ALL64;
1869 
1870  /*
1871  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1872  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1873  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1874  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1875  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1876  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1877  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1878  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
1879  */
1880  __asm__ volatile (
1881  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1882  "li %[tmp0], 0x07 \n\t"
1883  "mtc1 %[tmp0], %[ftmp4] \n\t"
1884 
1885  "1: \n\t"
1886  PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1887 
1888  "addiu %[h], %[h], -0x01 \n\t"
1889  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1890  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1891  "bnez %[h], 1b \n\t"
1892  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1893  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1894  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1895  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1896  [ftmp8]"=&f"(ftmp[8]),
1897  [tmp0]"=&r"(tmp[0]),
1898  RESTRICT_ASM_ALL64
1899  [h]"+&r"(h),
1900  [dst]"+&r"(dst), [src]"+&r"(src)
1901  : [ff_pw_64]"f"(ff_pw_64),
1902  [srcstride]"r"((mips_reg)srcstride),
1903  [dststride]"r"((mips_reg)dststride),
1904  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1905  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1906  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1907  : "memory"
1908  );
1909 #else
1910  const uint8_t *filter = subpel_filters[mx - 1];
1911  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1912  int x, y;
1913 
1914  for (y = 0; y < h; y++) {
1915  for (x = 0; x < 8; x++)
1916  dst[x] = FILTER_6TAP(src, filter, 1);
1917  dst += dststride;
1918  src += srcstride;
1919  }
1920 #endif
1921 }
1922 
1923 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1924  ptrdiff_t srcstride, int h, int mx, int my)
1925 {
1926 #if 1
1927  const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1928  double ftmp[6];
1929  uint32_t tmp[1];
1930  DECLARE_VAR_LOW32;
1931 
1932  /*
1933  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1934  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1935  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1936  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1937  */
1938  __asm__ volatile (
1939  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1940  "li %[tmp0], 0x07 \n\t"
1941  "mtc1 %[tmp0], %[ftmp4] \n\t"
1942 
1943  "1: \n\t"
1944  PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1945 
1946  "addiu %[h], %[h], -0x01 \n\t"
1947  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1948  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1949  "bnez %[h], 1b \n\t"
1950  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1951  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1952  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1953  [tmp0]"=&r"(tmp[0]),
1954  RESTRICT_ASM_LOW32
1955  [h]"+&r"(h),
1956  [dst]"+&r"(dst), [src]"+&r"(src)
1957  : [ff_pw_64]"f"(ff_pw_64),
1958  [srcstride]"r"((mips_reg)srcstride),
1959  [dststride]"r"((mips_reg)dststride),
1960  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1961  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1962  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
1963  : "memory"
1964  );
1965 #else
1966  const uint8_t *filter = subpel_filters[mx - 1];
1967  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1968  int x, y;
1969 
1970  for (y = 0; y < h; y++) {
1971  for (x = 0; x < 4; x++)
1972  dst[x] = FILTER_6TAP(src, filter, 1);
1973  dst += dststride;
1974  src += srcstride;
1975  }
1976 #endif
1977 }
1978 
1979 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1980  ptrdiff_t srcstride, int h, int mx, int my)
1981 {
1982 #if 1
1983  const uint64_t *filter = fourtap_subpel_filters[my - 1];
1984  double ftmp[9];
1985  uint32_t tmp[1];
1986  mips_reg src0, src1, dst0;
1987  DECLARE_VAR_ALL64;
1988 
1989  /*
1990  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1991  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1992  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1993  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1994  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1995  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1996  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1997  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1998 
1999  dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2000  dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2001  dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2002  dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2003  dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2004  dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2005  dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2006  dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2007  */
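 /*
  * Vertical filtering follows the horizontal layout; src1 is pure scratch
  * ("=&r" below) that PUT_VP8_EPEL8_V4_MMI uses to form the addresses of
  * the tap rows at src - srcstride, src + srcstride and src + 2 * srcstride.
  */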
2008  __asm__ volatile (
2009  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2010  "li %[tmp0], 0x07 \n\t"
2011  "mtc1 %[tmp0], %[ftmp4] \n\t"
2012 
2013  "1: \n\t"
2014  // 0 - 7
2015  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2016  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2017  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2018  // 8 - 15
2019  PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2020 
2021  "addiu %[h], %[h], -0x01 \n\t"
2022  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2023  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2024  "bnez %[h], 1b \n\t"
2025  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2026  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2027  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2028  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2029  [ftmp8]"=&f"(ftmp[8]),
2030  [tmp0]"=&r"(tmp[0]),
2031  RESTRICT_ASM_ALL64
2032  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2033  [src1]"=&r"(src1),
2034  [h]"+&r"(h),
2035  [dst]"+&r"(dst), [src]"+&r"(src)
2036  : [ff_pw_64]"f"(ff_pw_64),
2037  [srcstride]"r"((mips_reg)srcstride),
2038  [dststride]"r"((mips_reg)dststride),
2039  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2040  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2041  : "memory"
2042  );
2043 #else
2044  const uint8_t *filter = subpel_filters[my - 1];
2045  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2046  int x, y;
2047 
2048  for (y = 0; y < h; y++) {
2049  for (x = 0; x < 16; x++)
2050  dst[x] = FILTER_4TAP(src, filter, srcstride);
2051  dst += dststride;
2052  src += srcstride;
2053  }
2054 #endif
2055 }
2056 
2057 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2058  ptrdiff_t srcstride, int h, int mx, int my)
2059 {
2060 #if 1
2061  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2062  double ftmp[9];
2063  uint32_t tmp[1];
2064  mips_reg src1;
2065  DECLARE_VAR_ALL64;
2066 
2067  /*
2068  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2069  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2070  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2071  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2072  dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2073  dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2074  dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2075  dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2076  */
2077  __asm__ volatile (
2078  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2079  "li %[tmp0], 0x07 \n\t"
2080  "mtc1 %[tmp0], %[ftmp4] \n\t"
2081 
2082  "1: \n\t"
2083  PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2084 
2085  "addiu %[h], %[h], -0x01 \n\t"
2086  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2087  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2088  "bnez %[h], 1b \n\t"
2089  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2090  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2091  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2092  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2093  [ftmp8]"=&f"(ftmp[8]),
2094  [tmp0]"=&r"(tmp[0]),
2095  RESTRICT_ASM_ALL64
2096  [src1]"=&r"(src1),
2097  [h]"+&r"(h),
2098  [dst]"+&r"(dst), [src]"+&r"(src)
2099  : [ff_pw_64]"f"(ff_pw_64),
2100  [srcstride]"r"((mips_reg)srcstride),
2101  [dststride]"r"((mips_reg)dststride),
2102  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2103  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2104  : "memory"
2105  );
2106 #else
2107  const uint8_t *filter = subpel_filters[my - 1];
2108  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2109  int x, y;
2110 
2111  for (y = 0; y < h; y++) {
2112  for (x = 0; x < 8; x++)
2113  dst[x] = FILTER_4TAP(src, filter, srcstride);
2114  dst += dststride;
2115  src += srcstride;
2116  }
2117 #endif
2118 }
2119 
2120 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2121  ptrdiff_t srcstride, int h, int mx, int my)
2122 {
2123 #if 1
2124  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2125  double ftmp[6];
2126  uint32_t tmp[1];
2127  mips_reg src1;
2128  DECLARE_VAR_LOW32;
2129 
2130  /*
2131  dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2132  dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2133  dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2134  dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2135  */
2136  __asm__ volatile (
2137  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2138  "li %[tmp0], 0x07 \n\t"
2139  "mtc1 %[tmp0], %[ftmp4] \n\t"
2140 
2141  "1: \n\t"
2142  PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2143 
2144  "addiu %[h], %[h], -0x01 \n\t"
2145  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2146  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2147  "bnez %[h], 1b \n\t"
2148  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2149  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2150  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2151  [tmp0]"=&r"(tmp[0]),
2152  RESTRICT_ASM_LOW32
2153  [src1]"=&r"(src1),
2154  [h]"+&r"(h),
2155  [dst]"+&r"(dst), [src]"+&r"(src)
2156  : [ff_pw_64]"f"(ff_pw_64),
2157  [srcstride]"r"((mips_reg)srcstride),
2158  [dststride]"r"((mips_reg)dststride),
2159  [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2160  [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
2161  : "memory"
2162  );
2163 #else
2164  const uint8_t *filter = subpel_filters[my - 1];
2165  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2166  int x, y;
2167 
2168  for (y = 0; y < h; y++) {
2169  for (x = 0; x < 4; x++)
2170  dst[x] = FILTER_4TAP(src, filter, srcstride);
2171  dst += dststride;
2172  src += srcstride;
2173  }
2174 #endif
2175 }
2176 
2177 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2178  ptrdiff_t srcstride, int h, int mx, int my)
2179 {
2180 #if 1
2181  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2182  double ftmp[9];
2183  uint32_t tmp[1];
2184  mips_reg src0, src1, dst0;
2185  DECLARE_VAR_ALL64;
2186 
2187  /*
2188  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2189  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2190  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2191  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2192  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2193  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2194  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2195  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2196 
2197  dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2198  dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2199  dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2200  dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2201  dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2202  dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2203  dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2204  dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2205  */
2206  __asm__ volatile (
2207  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2208  "li %[tmp0], 0x07 \n\t"
2209  "mtc1 %[tmp0], %[ftmp4] \n\t"
2210 
2211  "1: \n\t"
2212  // 0 - 7
2213  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2214  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2215  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2216  // 8 - 15
2217  PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2218 
2219  "addiu %[h], %[h], -0x01 \n\t"
2220  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2221  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2222  "bnez %[h], 1b \n\t"
2223  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2224  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2225  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2226  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2227  [ftmp8]"=&f"(ftmp[8]),
2228  [tmp0]"=&r"(tmp[0]),
2229  RESTRICT_ASM_ALL64
2230  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2231  [src1]"=&r"(src1),
2232  [h]"+&r"(h),
2233  [dst]"+&r"(dst), [src]"+&r"(src)
2234  : [ff_pw_64]"f"(ff_pw_64),
2235  [srcstride]"r"((mips_reg)srcstride),
2236  [dststride]"r"((mips_reg)dststride),
2237  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2238  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2239  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2240  : "memory"
2241  );
2242 #else
2243  const uint8_t *filter = subpel_filters[my - 1];
2244  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2245  int x, y;
2246 
2247  for (y = 0; y < h; y++) {
2248  for (x = 0; x < 16; x++)
2249  dst[x] = FILTER_6TAP(src, filter, srcstride);
2250  dst += dststride;
2251  src += srcstride;
2252  }
2253 #endif
2254 }
2255 
2256 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2257  ptrdiff_t srcstride, int h, int mx, int my)
2258 {
2259 #if 1
2260  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2261  double ftmp[9];
2262  uint32_t tmp[1];
2263  mips_reg src1;
2264  DECLARE_VAR_ALL64;
2265 
2266  /*
2267  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2268  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2269  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2270  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2271  dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2272  dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2273  dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2274  dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2275  */
2276  __asm__ volatile (
2277  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2278  "li %[tmp0], 0x07 \n\t"
2279  "mtc1 %[tmp0], %[ftmp4] \n\t"
2280 
2281  "1: \n\t"
2282  PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2283 
2284  "addiu %[h], %[h], -0x01 \n\t"
2285  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2286  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2287  "bnez %[h], 1b \n\t"
2288  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2289  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2290  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2291  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2292  [ftmp8]"=&f"(ftmp[8]),
2293  [tmp0]"=&r"(tmp[0]),
2294  RESTRICT_ASM_ALL64
2295  [src1]"=&r"(src1),
2296  [h]"+&r"(h),
2297  [dst]"+&r"(dst), [src]"+&r"(src)
2298  : [ff_pw_64]"f"(ff_pw_64),
2299  [srcstride]"r"((mips_reg)srcstride),
2300  [dststride]"r"((mips_reg)dststride),
2301  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2302  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2303  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2304  : "memory"
2305  );
2306 #else
2307  const uint8_t *filter = subpel_filters[my - 1];
2308  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2309  int x, y;
2310 
2311  for (y = 0; y < h; y++) {
2312  for (x = 0; x < 8; x++)
2313  dst[x] = FILTER_6TAP(src, filter, srcstride);
2314  dst += dststride;
2315  src += srcstride;
2316  }
2317 #endif
2318 }
2319 
2320 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2321  ptrdiff_t srcstride, int h, int mx, int my)
2322 {
2323 #if 1
2324  const uint64_t *filter = fourtap_subpel_filters[my - 1];
2325  double ftmp[6];
2326  uint32_t tmp[1];
2327  mips_reg src1;
2328  DECLARE_VAR_LOW32;
2329 
2330  /*
2331  dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2332  dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2333  dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2334  dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2335  */
2336  __asm__ volatile (
2337  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2338  "li %[tmp0], 0x07 \n\t"
2339  "mtc1 %[tmp0], %[ftmp4] \n\t"
2340 
2341  "1: \n\t"
2342  PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2343 
2344  "addiu %[h], %[h], -0x01 \n\t"
2345  PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2346  PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2347  "bnez %[h], 1b \n\t"
2348  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2349  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2350  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2351  [tmp0]"=&r"(tmp[0]),
2352  RESTRICT_ASM_LOW32
2353  [src1]"=&r"(src1),
2354  [h]"+&r"(h),
2355  [dst]"+&r"(dst), [src]"+&r"(src)
2356  : [ff_pw_64]"f"(ff_pw_64),
2357  [srcstride]"r"((mips_reg)srcstride),
2358  [dststride]"r"((mips_reg)dststride),
2359  [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2360  [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2361  [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
2362  : "memory"
2363  );
2364 #else
2365  const uint8_t *filter = subpel_filters[my - 1];
2366  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2367  int x, y;
2368 
2369  for (y = 0; y < h; y++) {
2370  for (x = 0; x < 4; x++)
2371  dst[x] = FILTER_6TAP(src, filter, srcstride);
2372  dst += dststride;
2373  src += srcstride;
2374  }
2375 #endif
2376 }
2377 
2378 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2379  ptrdiff_t srcstride, int h, int mx, int my)
2380 {
2381 #if 1
2382  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2383  uint8_t *tmp = tmp_array;
2384 
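 /*
  * Two-pass H4V4: the horizontal pass filters h + 3 rows starting one row
  * above the block (src -= srcstride), because the vertical 4-tap stage
  * reads taps at y - 1 .. y + 2. The vertical pass then starts one 16-byte
  * row into the buffer so its y - 1 tap lands on row 0. tmp_array is sized
  * (2 * 16 + 3) * 16 = 560 bytes, the same worst-case sizing the generic C
  * code uses.
  */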
2385  src -= srcstride;
2386  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2387  tmp = tmp_array + 16;
2388  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2389 #else
2390  const uint8_t *filter = subpel_filters[mx - 1];
2391  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2392  int x, y;
2393  uint8_t tmp_array[560];
2394  uint8_t *tmp = tmp_array;
2395 
2396  src -= srcstride;
2397 
2398  for (y = 0; y < h + 3; y++) {
2399  for (x = 0; x < 16; x++)
2400  tmp[x] = FILTER_4TAP(src, filter, 1);
2401  tmp += 16;
2402  src += srcstride;
2403  }
2404 
2405  tmp = tmp_array + 16;
2406  filter = subpel_filters[my - 1];
2407 
2408  for (y = 0; y < h; y++) {
2409  for (x = 0; x < 16; x++)
2410  dst[x] = FILTER_4TAP(tmp, filter, 16);
2411  dst += dststride;
2412  tmp += 16;
2413  }
2414 #endif
2415 }
2416 
2417 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2418  ptrdiff_t srcstride, int h, int mx, int my)
2419 {
2420 #if 1
2421  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2422  uint8_t *tmp = tmp_array;
2423 
2424  src -= srcstride;
2425  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2426  tmp = tmp_array + 8;
2427  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2428 #else
2429  const uint8_t *filter = subpel_filters[mx - 1];
2430  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2431  int x, y;
2432  uint8_t tmp_array[152];
2433  uint8_t *tmp = tmp_array;
2434 
2435  src -= srcstride;
2436 
2437  for (y = 0; y < h + 3; y++) {
2438  for (x = 0; x < 8; x++)
2439  tmp[x] = FILTER_4TAP(src, filter, 1);
2440  tmp += 8;
2441  src += srcstride;
2442  }
2443 
2444  tmp = tmp_array + 8;
2445  filter = subpel_filters[my - 1];
2446 
2447  for (y = 0; y < h; y++) {
2448  for (x = 0; x < 8; x++)
2449  dst[x] = FILTER_4TAP(tmp, filter, 8);
2450  dst += dststride;
2451  tmp += 8;
2452  }
2453 #endif
2454 }
2455 
2456 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2457  ptrdiff_t srcstride, int h, int mx, int my)
2458 {
2459 #if 1
2460  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2461  uint8_t *tmp = tmp_array;
2462 
2463  src -= srcstride;
2464  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2465  tmp = tmp_array + 4;
2466  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2467 #else
2468  const uint8_t *filter = subpel_filters[mx - 1];
2469  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2470  int x, y;
2471  uint8_t tmp_array[44];
2472  uint8_t *tmp = tmp_array;
2473 
2474  src -= srcstride;
2475 
2476  for (y = 0; y < h + 3; y++) {
2477  for (x = 0; x < 4; x++)
2478  tmp[x] = FILTER_4TAP(src, filter, 1);
2479  tmp += 4;
2480  src += srcstride;
2481  }
2482  tmp = tmp_array + 4;
2483  filter = subpel_filters[my - 1];
2484 
2485  for (y = 0; y < h; y++) {
2486  for (x = 0; x < 4; x++)
2487  dst[x] = FILTER_4TAP(tmp, filter, 4);
2488  dst += dststride;
2489  tmp += 4;
2490  }
2491 #endif
2492 }
2493 
2494 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2495  ptrdiff_t srcstride, int h, int mx, int my)
2496 {
2497 #if 1
2498  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2499  uint8_t *tmp = tmp_array;
2500 
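 /*
  * As in H4V4 above, but the 6-tap vertical stage reads taps at
  * y - 2 .. y + 3, so the horizontal pass covers h + 5 rows starting two
  * rows above the block and the vertical pass begins two 16-byte rows
  * (tmp_array + 32) into the buffer.
  */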
2501  src -= 2 * srcstride;
2502  ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2503  tmp = tmp_array + 32;
2504  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2505 #else
2506  const uint8_t *filter = subpel_filters[mx - 1];
2507  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2508  int x, y;
2509  uint8_t tmp_array[592];
2510  uint8_t *tmp = tmp_array;
2511 
2512  src -= 2 * srcstride;
2513 
2514  for (y = 0; y < h + 5; y++) {
2515  for (x = 0; x < 16; x++)
2516  tmp[x] = FILTER_4TAP(src, filter, 1);
2517  tmp += 16;
2518  src += srcstride;
2519  }
2520 
2521  tmp = tmp_array + 32;
2522  filter = subpel_filters[my - 1];
2523 
2524  for (y = 0; y < h; y++) {
2525  for (x = 0; x < 16; x++)
2526  dst[x] = FILTER_6TAP(tmp, filter, 16);
2527  dst += dststride;
2528  tmp += 16;
2529  }
2530 #endif
2531 }
2532 
2533 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2534  ptrdiff_t srcstride, int h, int mx, int my)
2535 {
2536 #if 1
2537  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2538  uint8_t *tmp = tmp_array;
2539 
2540  src -= 2 * srcstride;
2541  ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2542  tmp = tmp_array + 16;
2543  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2544 #else
2545  const uint8_t *filter = subpel_filters[mx - 1];
2546  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2547  int x, y;
2548  uint8_t tmp_array[168];
2549  uint8_t *tmp = tmp_array;
2550 
2551  src -= 2 * srcstride;
2552 
2553  for (y = 0; y < h + 5; y++) {
2554  for (x = 0; x < 8; x++)
2555  tmp[x] = FILTER_4TAP(src, filter, 1);
2556  tmp += 8;
2557  src += srcstride;
2558  }
2559 
2560  tmp = tmp_array + 16;
2561  filter = subpel_filters[my - 1];
2562 
2563  for (y = 0; y < h; y++) {
2564  for (x = 0; x < 8; x++)
2565  dst[x] = FILTER_6TAP(tmp, filter, 8);
2566  dst += dststride;
2567  tmp += 8;
2568  }
2569 #endif
2570 }
2571 
2572 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2573  ptrdiff_t srcstride, int h, int mx, int my)
2574 {
2575 #if 1
2576  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2577  uint8_t *tmp = tmp_array;
2578 
2579  src -= 2 * srcstride;
2580  ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2581  tmp = tmp_array + 8;
2582  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2583 #else
2584  const uint8_t *filter = subpel_filters[mx - 1];
2585  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2586  int x, y;
2587  uint8_t tmp_array[52];
2588  uint8_t *tmp = tmp_array;
2589 
2590  src -= 2 * srcstride;
2591 
2592  for (y = 0; y < h + 5; y++) {
2593  for (x = 0; x < 4; x++)
2594  tmp[x] = FILTER_4TAP(src, filter, 1);
2595  tmp += 4;
2596  src += srcstride;
2597  }
2598 
2599  tmp = tmp_array + 8;
2600  filter = subpel_filters[my - 1];
2601 
2602  for (y = 0; y < h; y++) {
2603  for (x = 0; x < 4; x++)
2604  dst[x] = FILTER_6TAP(tmp, filter, 4);
2605  dst += dststride;
2606  tmp += 4;
2607  }
2608 #endif
2609 }
2610 
2611 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2612  ptrdiff_t srcstride, int h, int mx, int my)
2613 {
2614 #if 1
2615  DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2616  uint8_t *tmp = tmp_array;
2617 
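 /*
  * The prolog padding is dictated by the vertical tap count only: H6V4
  * uses the same h + 3 rows and one-row offset as H4V4, while the 6-tap
  * horizontal stage merely widens each row's window.
  */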
2618  src -= srcstride;
2619  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2620  tmp = tmp_array + 16;
2621  ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2622 #else
2623  const uint8_t *filter = subpel_filters[mx - 1];
2624  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2625  int x, y;
2626  uint8_t tmp_array[560];
2627  uint8_t *tmp = tmp_array;
2628 
2629  src -= srcstride;
2630 
2631  for (y = 0; y < h + 3; y++) {
2632  for (x = 0; x < 16; x++)
2633  tmp[x] = FILTER_6TAP(src, filter, 1);
2634  tmp += 16;
2635  src += srcstride;
2636  }
2637 
2638  tmp = tmp_array + 16;
2639  filter = subpel_filters[my - 1];
2640 
2641  for (y = 0; y < h; y++) {
2642  for (x = 0; x < 16; x++)
2643  dst[x] = FILTER_4TAP(tmp, filter, 16);
2644  dst += dststride;
2645  tmp += 16;
2646  }
2647 #endif
2648 }
2649 
2650 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2651  ptrdiff_t srcstride, int h, int mx, int my)
2652 {
2653 #if 1
2654  DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2655  uint8_t *tmp = tmp_array;
2656 
2657  src -= srcstride;
2658  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2659  tmp = tmp_array + 8;
2660  ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2661 #else
2662  const uint8_t *filter = subpel_filters[mx - 1];
2663  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2664  int x, y;
2665  uint8_t tmp_array[152];
2666  uint8_t *tmp = tmp_array;
2667 
2668  src -= srcstride;
2669 
2670  for (y = 0; y < h + 3; y++) {
2671  for (x = 0; x < 8; x++)
2672  tmp[x] = FILTER_6TAP(src, filter, 1);
2673  tmp += 8;
2674  src += srcstride;
2675  }
2676 
2677  tmp = tmp_array + 8;
2678  filter = subpel_filters[my - 1];
2679 
2680  for (y = 0; y < h; y++) {
2681  for (x = 0; x < 8; x++)
2682  dst[x] = FILTER_4TAP(tmp, filter, 8);
2683  dst += dststride;
2684  tmp += 8;
2685  }
2686 #endif
2687 }
2688 
2689 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2690  ptrdiff_t srcstride, int h, int mx, int my)
2691 {
2692 #if 1
2693  DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2694  uint8_t *tmp = tmp_array;
2695 
2696  src -= srcstride;
2697  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2698  tmp = tmp_array + 4;
2699  ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2700 #else
2701  const uint8_t *filter = subpel_filters[mx - 1];
2702  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2703  int x, y;
2704  uint8_t tmp_array[44];
2705  uint8_t *tmp = tmp_array;
2706 
2707  src -= srcstride;
2708 
2709  for (y = 0; y < h + 3; y++) {
2710  for (x = 0; x < 4; x++)
2711  tmp[x] = FILTER_6TAP(src, filter, 1);
2712  tmp += 4;
2713  src += srcstride;
2714  }
2715 
2716  tmp = tmp_array + 4;
2717  filter = subpel_filters[my - 1];
2718 
2719  for (y = 0; y < h; y++) {
2720  for (x = 0; x < 4; x++)
2721  dst[x] = FILTER_4TAP(tmp, filter, 4);
2722  dst += dststride;
2723  tmp += 4;
2724  }
2725 #endif
2726 }
2727 
2728 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2729  ptrdiff_t srcstride, int h, int mx, int my)
2730 {
2731 #if 1
2732  DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2733  uint8_t *tmp = tmp_array;
2734 
2735  src -= 2 * srcstride;
2736  ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2737  tmp = tmp_array + 32;
2738  ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2739 #else
2740  const uint8_t *filter = subpel_filters[mx - 1];
2741  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2742  int x, y;
2743  uint8_t tmp_array[592];
2744  uint8_t *tmp = tmp_array;
2745 
2746  src -= 2 * srcstride;
2747 
2748  for (y = 0; y < h + 5; y++) {
2749  for (x = 0; x < 16; x++)
2750  tmp[x] = FILTER_6TAP(src, filter, 1);
2751  tmp += 16;
2752  src += srcstride;
2753  }
2754 
2755  tmp = tmp_array + 32;
2756  filter = subpel_filters[my - 1];
2757 
2758  for (y = 0; y < h; y++) {
2759  for (x = 0; x < 16; x++)
2760  dst[x] = FILTER_6TAP(tmp, filter, 16);
2761  dst += dststride;
2762  tmp += 16;
2763  }
2764 #endif
2765 }
2766 
2767 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2768  ptrdiff_t srcstride, int h, int mx, int my)
2769 {
2770 #if 1
2771  DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2772  uint8_t *tmp = tmp_array;
2773 
2774  src -= 2 * srcstride;
2775  ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2776  tmp = tmp_array + 16;
2777  ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2778 #else
2779  const uint8_t *filter = subpel_filters[mx - 1];
2780  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2781  int x, y;
2782  uint8_t tmp_array[168];
2783  uint8_t *tmp = tmp_array;
2784 
2785  src -= 2 * srcstride;
2786 
2787  for (y = 0; y < h + 5; y++) {
2788  for (x = 0; x < 8; x++)
2789  tmp[x] = FILTER_6TAP(src, filter, 1);
2790  tmp += 8;
2791  src += srcstride;
2792  }
2793 
2794  tmp = tmp_array + 16;
2795  filter = subpel_filters[my - 1];
2796 
2797  for (y = 0; y < h; y++) {
2798  for (x = 0; x < 8; x++)
2799  dst[x] = FILTER_6TAP(tmp, filter, 8);
2800  dst += dststride;
2801  tmp += 8;
2802  }
2803 #endif
2804 }
2805 
2806 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2807  ptrdiff_t srcstride, int h, int mx, int my)
2808 {
2809 #if 1
2810  DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2811  uint8_t *tmp = tmp_array;
2812 
2813  src -= 2 * srcstride;
2814  ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2815  tmp = tmp_array + 8;
2816  ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2817 #else
2818  const uint8_t *filter = subpel_filters[mx - 1];
2819  const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2820  int x, y;
2821  uint8_t tmp_array[52];
2822  uint8_t *tmp = tmp_array;
2823 
2824  src -= 2 * srcstride;
2825 
2826  for (y = 0; y < h + 5; y++) {
2827  for (x = 0; x < 4; x++)
2828  tmp[x] = FILTER_6TAP(src, filter, 1);
2829  tmp += 4;
2830  src += srcstride;
2831  }
2832 
2833  tmp = tmp_array + 8;
2834  filter = subpel_filters[my - 1];
2835 
2836  for (y = 0; y < h; y++) {
2837  for (x = 0; x < 4; x++)
2838  dst[x] = FILTER_6TAP(tmp, filter, 4);
2839  dst += dststride;
2840  tmp += 4;
2841  }
2842 #endif
2843 }
2844 
2845 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2846  ptrdiff_t sstride, int h, int mx, int my)
2847 {
2848 #if 1
2849  int a = 8 - mx, b = mx;
2850  double ftmp[7];
2851  uint32_t tmp[1];
2852  mips_reg dst0, src0;
2853  DECLARE_VAR_ALL64;
2854 
2855  /*
2856  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2857  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2858  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2859  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2860  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2861  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2862  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2863  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2864 
2865  dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2866  dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2867  dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2868  dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2869  dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2870  dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2871  dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2872  dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2873  */
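 /*
  * a and b are splatted across all four 16-bit lanes with pshufh (ftmp0 is
  * zero, so every lane selects lane 0); ff_pw_4 supplies the +4 rounding
  * bias and ftmp4 the >> 3 shift count consumed by PUT_VP8_BILINEAR8_H_MMI.
  */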
2874  __asm__ volatile (
2875  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2876  "li %[tmp0], 0x03 \n\t"
2877  "mtc1 %[tmp0], %[ftmp4] \n\t"
2878  "pshufh %[a], %[a], %[ftmp0] \n\t"
2879  "pshufh %[b], %[b], %[ftmp0] \n\t"
2880 
2881  "1: \n\t"
2882  // 0 - 7
2883  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2884  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2885  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2886  // 8 - 15
2887  PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2888 
2889  "addiu %[h], %[h], -0x01 \n\t"
2890  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2891  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2892  "bnez %[h], 1b \n\t"
2893  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2894  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2895  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2896  [ftmp6]"=&f"(ftmp[6]),
2897  [tmp0]"=&r"(tmp[0]),
2898  RESTRICT_ASM_ALL64
2899  [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2900  [h]"+&r"(h),
2901  [dst]"+&r"(dst), [src]"+&r"(src),
2902  [a]"+&f"(a), [b]"+&f"(b)
2903  : [sstride]"r"((mips_reg)sstride),
2904  [dstride]"r"((mips_reg)dstride),
2905  [ff_pw_4]"f"(ff_pw_4)
2906  : "memory"
2907  );
2908 #else
2909  int a = 8 - mx, b = mx;
2910  int x, y;
2911 
2912  for (y = 0; y < h; y++) {
2913  for (x = 0; x < 16; x++)
2914  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2915  dst += dstride;
2916  src += sstride;
2917  }
2918 #endif
2919 }
2920 
2921 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2922  ptrdiff_t sstride, int h, int mx, int my)
2923 {
2924 #if 1
2925  int c = 8 - my, d = my;
2926  double ftmp[7];
2927  uint32_t tmp[1];
2928  mips_reg src0, src1, dst0;
2929  DECLARE_VAR_ALL64;
2930 
2931  /*
2932  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2933  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2934  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2935  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2936  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2937  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2938  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2939  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
2940  */
2941  __asm__ volatile (
2942  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2943  "li %[tmp0], 0x03 \n\t"
2944  "mtc1 %[tmp0], %[ftmp4] \n\t"
2945  "pshufh %[c], %[c], %[ftmp0] \n\t"
2946  "pshufh %[d], %[d], %[ftmp0] \n\t"
2947 
2948  "1: \n\t"
2949  // 0 - 7
2950  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2951  PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2952  PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2953  // 8 - 15
2954  PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2955 
2956  "addiu %[h], %[h], -0x01 \n\t"
2957  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2958  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2959  "bnez %[h], 1b \n\t"
2960  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2961  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2962  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2963  [ftmp6]"=&f"(ftmp[6]),
2964  [tmp0]"=&r"(tmp[0]),
2965  RESTRICT_ASM_ALL64
2966  [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2967  [src1]"=&r"(src1),
2968  [h]"+&r"(h),
2969  [dst]"+&r"(dst), [src]"+&r"(src),
2970  [c]"+&f"(c), [d]"+&f"(d)
2971  : [sstride]"r"((mips_reg)sstride),
2972  [dstride]"r"((mips_reg)dstride),
2973  [ff_pw_4]"f"(ff_pw_4)
2974  : "memory"
2975  );
2976 #else
2977  int c = 8 - my, d = my;
2978  int x, y;
2979 
2980  for (y = 0; y < h; y++) {
2981  for (x = 0; x < 16; x++)
2982  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
2983  dst += dstride;
2984  src += sstride;
2985  }
2986 #endif
2987 }
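For readers who do not want to chase PUT_VP8_BILINEAR8_V_MMI (defined earlier in this file) through the asm, here is a scalar model of one 8-pixel step; that `src1` serves as a scratch register for the row at `src + sstride` is my reading of the operand list:

    /* Scalar model of one PUT_VP8_BILINEAR8_V_MMI step: blend each
     * pixel with the one directly below it, rounding to nearest via
     * the +4 bias before the >> 3. */
    static inline void bilinear8_v_step(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t sstride, int c, int d)
    {
        int x;
        for (x = 0; x < 8; x++)
            dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
    }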
2988 
2989 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2990  ptrdiff_t sstride, int h, int mx, int my)
2991 {
2992 #if 1
2993  DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2994  uint8_t *tmp = tmp_array;
2995 
2996  ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2997  ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
2998 #else
2999  int a = 8 - mx, b = mx;
3000  int c = 8 - my, d = my;
3001  int x, y;
3002  uint8_t tmp_array[528];
3003  uint8_t *tmp = tmp_array;
3004 
3005  for (y = 0; y < h + 1; y++) {
3006  for (x = 0; x < 16; x++)
3007  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3008  tmp += 16;
3009  src += sstride;
3010  }
3011 
3012  tmp = tmp_array;
3013 
3014  for (y = 0; y < h; y++) {
3015  for (x = 0; x < 16; x++)
3016  dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3017  dst += dstride;
3018  tmp += 16;
3019  }
3020 #endif
3021 }
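The 528-byte scratch buffer is sized for the worst case: the horizontal pass emits h + 1 rows of 16 pixels, because the vertical pass needs one extra row below the block, and 16 * (32 + 1) = 528 covers any h up to 32, comfortably more than the 16 rows a VP8 macroblock needs. A hypothetical compile-time restatement (not in FFmpeg; assumes a C11 compiler for _Static_assert):

    /* h + 1 rows of 16 bytes each, for a maximum supported height. */
    #define VP8_BILIN16_TMP_SIZE(max_h) (16 * ((max_h) + 1))
    _Static_assert(VP8_BILIN16_TMP_SIZE(32) == 528,
                   "tmp_array must hold h + 1 rows of 16 pixels");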
3022 
3023 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3024  ptrdiff_t sstride, int h, int mx, int my)
3025 {
3026 #if 1
3027  int a = 8 - mx, b = mx;
3028  double ftmp[7];
3029  uint32_t tmp[1];
3030  DECLARE_VAR_ALL64;
3031 
3032  /*
3033  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3034  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3035  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3036  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3037  dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3038  dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3039  dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3040  dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3041  */
3042  __asm__ volatile (
3043  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3044  "li %[tmp0], 0x03 \n\t"
3045  "mtc1 %[tmp0], %[ftmp4] \n\t"
3046  "pshufh %[a], %[a], %[ftmp0] \n\t"
3047  "pshufh %[b], %[b], %[ftmp0] \n\t"
3048 
3049  "1: \n\t"
3050  PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3051 
3052  "addiu %[h], %[h], -0x01 \n\t"
3053  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3054  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3055  "bnez %[h], 1b \n\t"
3056  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3057  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3058  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3059  [ftmp6]"=&f"(ftmp[6]),
3060  [tmp0]"=&r"(tmp[0]),
3061  RESTRICT_ASM_ALL64
3062  [h]"+&r"(h),
3063  [dst]"+&r"(dst), [src]"+&r"(src),
3064  [a]"+&f"(a), [b]"+&f"(b)
3065  : [sstride]"r"((mips_reg)sstride),
3066  [dstride]"r"((mips_reg)dstride),
3067  [ff_pw_4]"f"(ff_pw_4)
3068  : "memory"
3069  );
3070 #else
3071  int a = 8 - mx, b = mx;
3072  int x, y;
3073 
3074  for (y = 0; y < h; y++) {
3075  for (x = 0; x < 8; x++)
3076  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3077  dst += dstride;
3078  src += sstride;
3079  }
3080 #endif
3081 }
3082 
3083 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3084  ptrdiff_t sstride, int h, int mx, int my)
3085 {
3086 #if 1
3087  int c = 8 - my, d = my;
3088  double ftmp[7];
3089  uint32_t tmp[1];
3090  mips_reg src1;
3091  DECLARE_VAR_ALL64;
3092 
3093  /*
3094  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3095  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3096  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3097  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3098  dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3099  dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3100  dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3101  dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3102  */
3103  __asm__ volatile (
3104  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3105  "li %[tmp0], 0x03 \n\t"
3106  "mtc1 %[tmp0], %[ftmp4] \n\t"
3107  "pshufh %[c], %[c], %[ftmp0] \n\t"
3108  "pshufh %[d], %[d], %[ftmp0] \n\t"
3109 
3110  "1: \n\t"
3111  PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3112 
3113  "addiu %[h], %[h], -0x01 \n\t"
3114  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3115  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3116  "bnez %[h], 1b \n\t"
3117  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3118  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3119  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3120  [ftmp6]"=&f"(ftmp[6]),
3121  [tmp0]"=&r"(tmp[0]),
3122  RESTRICT_ASM_ALL64
3123  [src1]"=&r"(src1),
3124  [h]"+&r"(h),
3125  [dst]"+&r"(dst), [src]"+&r"(src),
3126  [c]"+&f"(c), [d]"+&f"(d)
3127  : [sstride]"r"((mips_reg)sstride),
3128  [dstride]"r"((mips_reg)dstride),
3129  [ff_pw_4]"f"(ff_pw_4)
3130  : "memory"
3131  );
3132 #else
3133  int c = 8 - my, d = my;
3134  int x, y;
3135 
3136  for (y = 0; y < h; y++) {
3137  for (x = 0; x < 8; x++)
3138  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3139  dst += dstride;
3140  src += sstride;
3141  }
3142 #endif
3143 }
3144 
3145 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3146  ptrdiff_t sstride, int h, int mx, int my)
3147 {
3148 #if 1
3149  DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3150  uint8_t *tmp = tmp_array;
3151 
3152  ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3153  ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
3154 #else
3155  int a = 8 - mx, b = mx;
3156  int c = 8 - my, d = my;
3157  int x, y;
3158  uint8_t tmp_array[136];
3159  uint8_t *tmp = tmp_array;
3160 
3161  for (y = 0; y < h + 1; y++) {
3162  for (x = 0; x < 8; x++)
3163  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3164  tmp += 8;
3165  src += sstride;
3166  }
3167 
3168  tmp = tmp_array;
3169 
3170  for (y = 0; y < h; y++) {
3171  for (x = 0; x < 8; x++)
3172  dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
3173  dst += dstride;
3174  tmp += 8;
3175  }
3176 #endif
3177 }
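An illustrative direct call (the buffer geometry and subpel offsets are my own; only the signature comes from this file). The hv variant reads h + 1 = 9 source rows, and the horizontal pass may load past the 8 useful pixels, so the source plane is padded:

    static void filter_one_chroma_block(void)
    {
        uint8_t plane[21 * 21];             /* padded source, stride 21 */
        uint8_t block[8 * 8];               /* 8x8 destination, stride 8 */
        int i;

        for (i = 0; i < (int)sizeof(plane); i++)
            plane[i] = 128;                 /* flat grey test plane */

        /* 8x8 block one pixel inside the plane, mx = 2, my = 6. */
        ff_put_vp8_bilinear8_hv_mmi(block, 8, plane + 21 + 1, 21, 8, 2, 6);
    }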
3178 
3179 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3180  ptrdiff_t sstride, int h, int mx, int my)
3181 {
3182 #if 1
3183  int a = 8 - mx, b = mx;
3184  double ftmp[5];
3185  uint32_t tmp[1];
3186  DECLARE_VAR_LOW32;
3187  DECLARE_VAR_ALL64;
3188 
3189  /*
3190  dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3191  dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3192  dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3193  dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3194  */
3195  __asm__ volatile (
3196  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3197  "li %[tmp0], 0x03 \n\t"
3198  "mtc1 %[tmp0], %[ftmp4] \n\t"
3199  "pshufh %[a], %[a], %[ftmp0] \n\t"
3200  "pshufh %[b], %[b], %[ftmp0] \n\t"
3201 
3202  "1: \n\t"
3203  PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3204 
3205  "addiu %[h], %[h], -0x01 \n\t"
3206  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3207  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3208  "bnez %[h], 1b \n\t"
3209  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3210  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3211  [ftmp4]"=&f"(ftmp[4]),
3212  [tmp0]"=&r"(tmp[0]),
3213  RESTRICT_ASM_LOW32
3214  RESTRICT_ASM_ALL64
3215  [h]"+&r"(h),
3216  [dst]"+&r"(dst), [src]"+&r"(src),
3217  [a]"+&f"(a), [b]"+&f"(b)
3218  : [sstride]"r"((mips_reg)sstride),
3219  [dstride]"r"((mips_reg)dstride),
3220  [ff_pw_4]"f"(ff_pw_4)
3221  : "memory"
3222  );
3223 #else
3224  int a = 8 - mx, b = mx;
3225  int x, y;
3226 
3227  for (y = 0; y < h; y++) {
3228  for (x = 0; x < 4; x++)
3229  dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3230  dst += dstride;
3231  src += sstride;
3232  }
3233 #endif
3234 }
3235 
3236 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3237  ptrdiff_t sstride, int h, int mx, int my)
3238 {
3239 #if 1
3240  int c = 8 - my, d = my;
 3241  double ftmp[5];
3242  uint32_t tmp[1];
3243  mips_reg src1;
3244  DECLARE_VAR_LOW32;
3245  DECLARE_VAR_ALL64;
3246 
3247  /*
3248  dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3249  dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3250  dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3251  dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3252  */
3253  __asm__ volatile (
3254  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3255  "li %[tmp0], 0x03 \n\t"
3256  "mtc1 %[tmp0], %[ftmp4] \n\t"
3257  "pshufh %[c], %[c], %[ftmp0] \n\t"
3258  "pshufh %[d], %[d], %[ftmp0] \n\t"
3259 
3260  "1: \n\t"
3261  PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3262 
3263  "addiu %[h], %[h], -0x01 \n\t"
3264  PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3265  PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3266  "bnez %[h], 1b \n\t"
3267  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3268  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3269  [ftmp4]"=&f"(ftmp[4]),
3270  [tmp0]"=&r"(tmp[0]),
3271  RESTRICT_ASM_LOW32
3272  RESTRICT_ASM_ALL64
3273  [src1]"=&r"(src1),
3274  [h]"+&r"(h),
3275  [dst]"+&r"(dst), [src]"+&r"(src),
3276  [c]"+&f"(c), [d]"+&f"(d)
3277  : [sstride]"r"((mips_reg)sstride),
3278  [dstride]"r"((mips_reg)dstride),
3279  [ff_pw_4]"f"(ff_pw_4)
3280  : "memory"
3281  );
3282 #else
3283  int c = 8 - my, d = my;
3284  int x, y;
3285 
3286  for (y = 0; y < h; y++) {
3287  for (x = 0; x < 4; x++)
3288  dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3289  dst += dstride;
3290  src += sstride;
3291  }
3292 #endif
3293 }
3294 
3295 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3296  ptrdiff_t sstride, int h, int mx, int my)
3297 {
3298 #if 1
3299  DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3300  uint8_t *tmp = tmp_array;
3301 
3302  ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3303  ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3304 #else
3305  int a = 8 - mx, b = mx;
3306  int c = 8 - my, d = my;
3307  int x, y;
3308  uint8_t tmp_array[36];
3309  uint8_t *tmp = tmp_array;
3310 
3311  for (y = 0; y < h + 1; y++) {
3312  for (x = 0; x < 4; x++)
3313  tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3314  tmp += 4;
3315  src += sstride;
3316  }
3317 
3318  tmp = tmp_array;
3319 
3320  for (y = 0; y < h; y++) {
3321  for (x = 0; x < 4; x++)
3322  dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
3323  dst += dstride;
3324  tmp += 4;
3325  }
3326 #endif
3327 }
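Closing with a hedged self-check sketch (the helper name and test pattern are hypothetical, and memcmp assumes <string.h>): it pins the 4-pixel MMI horizontal path against the scalar arithmetic of the #else branch. The source buffer is padded because the MMI path loads 8 bytes per row even though only 5 are consumed:

    static int check_bilinear4_h(int mx)
    {
        uint8_t src[64], dst_mmi[8 * 4], dst_ref[8 * 4];
        int a = 8 - mx, b = mx, x, y;

        for (x = 0; x < (int)sizeof(src); x++)
            src[x] = (uint8_t)(x * 89 + 3);   /* deterministic pattern */

        ff_put_vp8_bilinear4_h_mmi(dst_mmi, 4, src, 5, 8, mx, 0);

        for (y = 0; y < 8; y++)               /* scalar reference */
            for (x = 0; x < 4; x++)
                dst_ref[y * 4 + x] =
                    (a * src[y * 5 + x] + b * src[y * 5 + x + 1] + 4) >> 3;

        return !memcmp(dst_mmi, dst_ref, sizeof(dst_ref));
    }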