FFmpeg
vp9_intra_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * Subtract in0 from out0 and in1 from out1 in place, using
 * __msa_subs_u_h (the MSA unsigned saturating halfword subtract).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely as the unbraced body of an
 * if/else. Arguments are parenthesized to guard against operator-precedence
 * surprises. Note that out0/out1 are each evaluated twice.
 */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)       \
do {                                                  \
    (out0) = __msa_subs_u_h((out0), (in0));           \
    (out1) = __msa_subs_u_h((out1), (in1));           \
} while (0)
30 
31 void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
32  const uint8_t *src)
33 {
34  uint32_t row;
35  v16u8 src0;
36 
37  src0 = LD_UB(src);
38 
39  for (row = 16; row--;) {
40  ST_UB(src0, dst);
41  dst += dst_stride;
42  }
43 }
44 
45 void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left,
46  const uint8_t *src)
47 {
48  uint32_t row;
49  v16u8 src1, src2;
50 
51  src1 = LD_UB(src);
52  src2 = LD_UB(src + 16);
53 
54  for (row = 32; row--;) {
55  ST_UB2(src1, src2, dst, 16);
56  dst += dst_stride;
57  }
58 }
59 
60 void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
61  const uint8_t *top)
62 {
63  uint32_t row, inp;
64  v16u8 src0, src1, src2, src3;
65 
66  src += 12;
67  for (row = 4; row--;) {
68  inp = LW(src);
69  src -= 4;
70 
71  src0 = (v16u8) __msa_fill_b(inp >> 24);
72  src1 = (v16u8) __msa_fill_b(inp >> 16);
73  src2 = (v16u8) __msa_fill_b(inp >> 8);
74  src3 = (v16u8) __msa_fill_b(inp);
75 
76  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
77  dst += (4 * dst_stride);
78  }
79 }
80 
81 void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src,
82  const uint8_t *top)
83 {
84  uint32_t row, inp;
85  v16u8 src0, src1, src2, src3;
86 
87  src += 28;
88  for (row = 8; row--;) {
89  inp = LW(src);
90  src -= 4;
91 
92  src0 = (v16u8) __msa_fill_b(inp >> 24);
93  src1 = (v16u8) __msa_fill_b(inp >> 16);
94  src2 = (v16u8) __msa_fill_b(inp >> 8);
95  src3 = (v16u8) __msa_fill_b(inp);
96 
97  ST_UB2(src0, src0, dst, 16);
98  dst += dst_stride;
99  ST_UB2(src1, src1, dst, 16);
100  dst += dst_stride;
101  ST_UB2(src2, src2, dst, 16);
102  dst += dst_stride;
103  ST_UB2(src3, src3, dst, 16);
104  dst += dst_stride;
105  }
106 }
107 
108 void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
109  const uint8_t *src_top)
110 {
111  uint32_t val0, val1;
112  v16i8 store, src = { 0 };
113  v8u16 sum_h;
114  v4u32 sum_w;
115  v2u64 sum_d;
116 
117  val0 = LW(src_top);
118  val1 = LW(src_left);
119  INSERT_W2_SB(val0, val1, src);
120  sum_h = __msa_hadd_u_h((v16u8) src, (v16u8) src);
121  sum_w = __msa_hadd_u_w(sum_h, sum_h);
122  sum_d = __msa_hadd_u_d(sum_w, sum_w);
123  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3);
124  store = __msa_splati_b((v16i8) sum_w, 0);
125  val0 = __msa_copy_u_w((v4i32) store, 0);
126 
127  SW4(val0, val0, val0, val0, dst, dst_stride);
128 }
129 
130 #define INTRA_DC_TL_4x4(dir) \
131 void ff_dc_##dir##_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, \
132  const uint8_t *left, \
133  const uint8_t *top) \
134 { \
135  uint32_t val0; \
136  v16i8 store, data = { 0 }; \
137  v8u16 sum_h; \
138  v4u32 sum_w; \
139  \
140  val0 = LW(dir); \
141  data = (v16i8) __msa_insert_w((v4i32) data, 0, val0); \
142  sum_h = __msa_hadd_u_h((v16u8) data, (v16u8) data); \
143  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
144  sum_w = (v4u32) __msa_srari_w((v4i32) sum_w, 2); \
145  store = __msa_splati_b((v16i8) sum_w, 0); \
146  val0 = __msa_copy_u_w((v4i32) store, 0); \
147  \
148  SW4(val0, val0, val0, val0, dst, dst_stride); \
149 }
150 INTRA_DC_TL_4x4(top);
152 
153 void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left,
154  const uint8_t *src_top)
155 {
156  uint64_t val0, val1;
157  v16i8 store;
158  v16u8 src = { 0 };
159  v8u16 sum_h;
160  v4u32 sum_w;
161  v2u64 sum_d;
162 
163  val0 = LD(src_top);
164  val1 = LD(src_left);
165  INSERT_D2_UB(val0, val1, src);
166  sum_h = __msa_hadd_u_h(src, src);
167  sum_w = __msa_hadd_u_w(sum_h, sum_h);
168  sum_d = __msa_hadd_u_d(sum_w, sum_w);
169  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
170  sum_d = __msa_hadd_u_d(sum_w, sum_w);
171  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4);
172  store = __msa_splati_b((v16i8) sum_w, 0);
173  val0 = __msa_copy_u_d((v2i64) store, 0);
174 
175  SD4(val0, val0, val0, val0, dst, dst_stride);
176  dst += (4 * dst_stride);
177  SD4(val0, val0, val0, val0, dst, dst_stride);
178 }
179 
180 #define INTRA_DC_TL_8x8(dir) \
181 void ff_dc_##dir##_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, \
182  const uint8_t *left, \
183  const uint8_t *top) \
184 { \
185  uint64_t val0; \
186  v16i8 store; \
187  v16u8 data = { 0 }; \
188  v8u16 sum_h; \
189  v4u32 sum_w; \
190  v2u64 sum_d; \
191  \
192  val0 = LD(dir); \
193  data = (v16u8) __msa_insert_d((v2i64) data, 0, val0); \
194  sum_h = __msa_hadd_u_h(data, data); \
195  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
196  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
197  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 3); \
198  store = __msa_splati_b((v16i8) sum_w, 0); \
199  val0 = __msa_copy_u_d((v2i64) store, 0); \
200  \
201  SD4(val0, val0, val0, val0, dst, dst_stride); \
202  dst += (4 * dst_stride); \
203  SD4(val0, val0, val0, val0, dst, dst_stride); \
204 }
205 
206 INTRA_DC_TL_8x8(top);
208 
209 void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
210  const uint8_t *src_left, const uint8_t *src_top)
211 {
212  v16u8 top, left, out;
213  v8u16 sum_h, sum_top, sum_left;
214  v4u32 sum_w;
215  v2u64 sum_d;
216 
217  top = LD_UB(src_top);
218  left = LD_UB(src_left);
219  HADD_UB2_UH(top, left, sum_top, sum_left);
220  sum_h = sum_top + sum_left;
221  sum_w = __msa_hadd_u_w(sum_h, sum_h);
222  sum_d = __msa_hadd_u_d(sum_w, sum_w);
223  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
224  sum_d = __msa_hadd_u_d(sum_w, sum_w);
225  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5);
226  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
227 
228  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
229  dst += (8 * dst_stride);
230  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
231 }
232 
233 #define INTRA_DC_TL_16x16(dir) \
234 void ff_dc_##dir##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, \
235  const uint8_t *left, \
236  const uint8_t *top) \
237 { \
238  v16u8 data, out; \
239  v8u16 sum_h; \
240  v4u32 sum_w; \
241  v2u64 sum_d; \
242  \
243  data = LD_UB(dir); \
244  sum_h = __msa_hadd_u_h(data, data); \
245  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
246  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
247  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \
248  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
249  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 4); \
250  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \
251  \
252  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \
253  dst += (8 * dst_stride); \
254  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); \
255 }
256 INTRA_DC_TL_16x16(top);
258 
259 void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
260  const uint8_t *src_left, const uint8_t *src_top)
261 {
262  uint32_t row;
263  v16u8 top0, top1, left0, left1, out;
264  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
265  v4u32 sum_w;
266  v2u64 sum_d;
267 
268  LD_UB2(src_top, 16, top0, top1);
269  LD_UB2(src_left, 16, left0, left1);
270  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
271  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
272  sum_h = sum_top0 + sum_top1;
273  sum_h += sum_left0 + sum_left1;
274  sum_w = __msa_hadd_u_w(sum_h, sum_h);
275  sum_d = __msa_hadd_u_d(sum_w, sum_w);
276  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d);
277  sum_d = __msa_hadd_u_d(sum_w, sum_w);
278  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 6);
279  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0);
280 
281  for (row = 16; row--;)
282  {
283  ST_UB2(out, out, dst, 16);
284  dst += dst_stride;
285  ST_UB2(out, out, dst, 16);
286  dst += dst_stride;
287  }
288 }
289 
290 #define INTRA_DC_TL_32x32(dir) \
291 void ff_dc_##dir##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, \
292  const uint8_t *left, \
293  const uint8_t *top) \
294 { \
295  uint32_t row; \
296  v16u8 data0, data1, out; \
297  v8u16 sum_h, sum_data0, sum_data1; \
298  v4u32 sum_w; \
299  v2u64 sum_d; \
300  \
301  LD_UB2(dir, 16, data0, data1); \
302  HADD_UB2_UH(data0, data1, sum_data0, sum_data1); \
303  sum_h = sum_data0 + sum_data1; \
304  sum_w = __msa_hadd_u_w(sum_h, sum_h); \
305  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
306  sum_w = (v4u32) __msa_pckev_w((v4i32) sum_d, (v4i32) sum_d); \
307  sum_d = __msa_hadd_u_d(sum_w, sum_w); \
308  sum_w = (v4u32) __msa_srari_w((v4i32) sum_d, 5); \
309  out = (v16u8) __msa_splati_b((v16i8) sum_w, 0); \
310  \
311  for (row = 16; row--;) \
312  { \
313  ST_UB2(out, out, dst, 16); \
314  dst += dst_stride; \
315  ST_UB2(out, out, dst, 16); \
316  dst += dst_stride; \
317  } \
318 }
319 INTRA_DC_TL_32x32(top);
321 
/*
 * Generates ff_dc_<val>_16x16_msa(), which fills a 16x16 block with the
 * constant val. val is pasted into the function name and passed to
 * __msa_ldi_b, so it must be a literal constant. Neither edge pointer is
 * read by the generated function.
 *
 * NOTE(review): no instantiation of this macro appears in the visible source.
 */
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                              \
void ff_dc_##val##_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,        \
                             const uint8_t *left, const uint8_t *top)   \
{                                                                       \
    v16u8 fill = (v16u8) __msa_ldi_b(val);                              \
    int half;                                                           \
                                                                        \
    for (half = 0; half < 2; half++) {                                  \
        ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill,          \
               dst, dst_stride);                                        \
        dst += (8 * dst_stride);                                        \
    }                                                                   \
}
332 
336 
/*
 * Generates ff_dc_<val>_32x32_msa(), which fills a 32x32 block with the
 * constant val. val is pasted into the function name and passed to
 * __msa_ldi_b, so it must be a literal constant. Each of the 32 rows is
 * written as two 16-byte vectors. Neither edge pointer is read by the
 * generated function.
 *
 * NOTE(review): no instantiation of this macro appears in the visible source.
 */
#define INTRA_PREDICT_VALDC_32X32_MSA(val)                              \
void ff_dc_##val##_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,        \
                             const uint8_t *left, const uint8_t *top)   \
{                                                                       \
    v16u8 fill = (v16u8) __msa_ldi_b(val);                              \
    uint32_t line;                                                      \
                                                                        \
    for (line = 0; line < 32; line++) {                                 \
        ST_UB2(fill, fill, dst, 16);                                    \
        dst += dst_stride;                                              \
    }                                                                   \
}
352 
356 
357 void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
358  const uint8_t *src_left, const uint8_t *src_top_ptr)
359 {
360  uint32_t left;
361  uint8_t top_left = src_top_ptr[-1];
362  v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
363  v16u8 src0, src1, src2, src3;
364  v8u16 src_top_left, vec0, vec1, vec2, vec3;
365 
366  src_top_left = (v8u16) __msa_fill_h(top_left);
367  src_top = LD_SB(src_top_ptr);
368  left = LW(src_left);
369  src_left0 = __msa_fill_b(left >> 24);
370  src_left1 = __msa_fill_b(left >> 16);
371  src_left2 = __msa_fill_b(left >> 8);
372  src_left3 = __msa_fill_b(left);
373 
374  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
375  src_left3, src_top, src0, src1, src2, src3);
376  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
377  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
378  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
379  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
380  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
381  ST_W2(tmp0, 0, 2, dst, dst_stride);
382  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
383 }
384 
385 void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
386  const uint8_t *src_left, const uint8_t *src_top_ptr)
387 {
388  uint8_t top_left = src_top_ptr[-1];
389  uint32_t loop_cnt, left;
390  v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
391  v8u16 src_top_left, vec0, vec1, vec2, vec3;
392  v16u8 src0, src1, src2, src3;
393 
394  src_top = LD_SB(src_top_ptr);
395  src_top_left = (v8u16) __msa_fill_h(top_left);
396 
397  src_left += 4;
398  for (loop_cnt = 2; loop_cnt--;) {
399  left = LW(src_left);
400  src_left0 = __msa_fill_b(left >> 24);
401  src_left1 = __msa_fill_b(left >> 16);
402  src_left2 = __msa_fill_b(left >> 8);
403  src_left3 = __msa_fill_b(left);
404  src_left -= 4;
405 
406  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
407  src_left3, src_top, src0, src1, src2, src3);
408  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
409  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
410  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
411  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
412  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
413  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
414  dst += (4 * dst_stride);
415  }
416 }
417 
418 void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride,
419  const uint8_t *src_left, const uint8_t *src_top_ptr)
420 {
421  uint8_t top_left = src_top_ptr[-1];
422  uint32_t loop_cnt, left;
423  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
424  v8u16 src_top_left, res_r, res_l;
425 
426  src_top = LD_SB(src_top_ptr);
427  src_top_left = (v8u16) __msa_fill_h(top_left);
428 
429  src_left += 12;
430  for (loop_cnt = 4; loop_cnt--;) {
431  left = LW(src_left);
432  src_left0 = __msa_fill_b(left >> 24);
433  src_left1 = __msa_fill_b(left >> 16);
434  src_left2 = __msa_fill_b(left >> 8);
435  src_left3 = __msa_fill_b(left);
436  src_left -= 4;
437 
438  ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
439  HADD_UB2_UH(res_r, res_l, res_r, res_l);
440  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
441 
442  SAT_UH2_UH(res_r, res_l, 7);
443  PCKEV_ST_SB(res_r, res_l, dst);
444  dst += dst_stride;
445 
446  ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
447  HADD_UB2_UH(res_r, res_l, res_r, res_l);
448  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
449  SAT_UH2_UH(res_r, res_l, 7);
450  PCKEV_ST_SB(res_r, res_l, dst);
451  dst += dst_stride;
452 
453  ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
454  HADD_UB2_UH(res_r, res_l, res_r, res_l);
455  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
456  SAT_UH2_UH(res_r, res_l, 7);
457  PCKEV_ST_SB(res_r, res_l, dst);
458  dst += dst_stride;
459 
460  ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
461  HADD_UB2_UH(res_r, res_l, res_r, res_l);
462  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
463  SAT_UH2_UH(res_r, res_l, 7);
464  PCKEV_ST_SB(res_r, res_l, dst);
465  dst += dst_stride;
466  }
467 }
468 
469 void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride,
470  const uint8_t *src_left, const uint8_t *src_top_ptr)
471 {
472  uint8_t top_left = src_top_ptr[-1];
473  uint32_t loop_cnt, left;
474  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
475  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
476 
477  src_top0 = LD_SB(src_top_ptr);
478  src_top1 = LD_SB(src_top_ptr + 16);
479  src_top_left = (v8u16) __msa_fill_h(top_left);
480 
481  src_left += 28;
482  for (loop_cnt = 8; loop_cnt--;) {
483  left = LW(src_left);
484  src_left0 = __msa_fill_b(left >> 24);
485  src_left1 = __msa_fill_b(left >> 16);
486  src_left2 = __msa_fill_b(left >> 8);
487  src_left3 = __msa_fill_b(left);
488  src_left -= 4;
489 
490  ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
491  ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
492  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
493  res_l1);
494  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
495  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
496  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
497  PCKEV_ST_SB(res_r0, res_l0, dst);
498  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
499  dst += dst_stride;
500 
501  ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
502  ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
503  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
504  res_l1);
505  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
506  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
507  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
508  PCKEV_ST_SB(res_r0, res_l0, dst);
509  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
510  dst += dst_stride;
511 
512  ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
513  ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
514  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
515  res_l1);
516  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
517  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
518  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
519  PCKEV_ST_SB(res_r0, res_l0, dst);
520  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
521  dst += dst_stride;
522 
523  ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
524  ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
525  HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
526  res_l1);
527  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
528  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
529  SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
530  PCKEV_ST_SB(res_r0, res_l0, dst);
531  PCKEV_ST_SB(res_r1, res_l1, dst + 16);
532  dst += dst_stride;
533  }
534 }
PCKEV_ST_SB
#define PCKEV_ST_SB(in0, in1, pdst)
Definition: generic_macros_msa.h:2799
out
FILE * out
Definition: movenc.c:55
ff_tm_8x8_msa
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
Definition: vp9_intra_msa.c:385
HADD_UB4_UH
#define HADD_UB4_UH(...)
Definition: generic_macros_msa.h:1082
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
src1
const pixel * src1
Definition: h264pred_template.c:421
INTRA_DC_TL_32x32
#define INTRA_DC_TL_32x32(dir)
Definition: vp9_intra_msa.c:290
ff_dc_32x32_msa
void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
Definition: vp9_intra_msa.c:259
ILVR_B2_UH
#define ILVR_B2_UH(...)
Definition: generic_macros_msa.h:1339
ff_hor_16x16_msa
void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
Definition: vp9_intra_msa.c:60
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
HADD_UB2_UH
#define HADD_UB2_UH(...)
Definition: generic_macros_msa.h:1067
IPRED_SUBS_UH2_UH
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1)
Definition: vp9_intra_msa.c:25
generic_macros_msa.h
sum_d
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
ff_vert_32x32_msa
void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
Definition: vp9_intra_msa.c:45
ILVR_B4_UB
#define ILVR_B4_UB(...)
Definition: generic_macros_msa.h:1359
ff_tm_16x16_msa
void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
Definition: vp9_intra_msa.c:418
SAT_UH2_UH
#define SAT_UH2_UH(...)
Definition: generic_macros_msa.h:1567
vp9dsp_mips.h
ILVL_B2_UH
#define ILVL_B2_UH(...)
Definition: generic_macros_msa.h:1264
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
ILVRL_B2_UH
#define ILVRL_B2_UH(...)
Definition: generic_macros_msa.h:1497
vp9dsp.h
ff_dc_8x8_msa
void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
Definition: vp9_intra_msa.c:153
LW
#define LW(psrc)
Definition: generic_macros_msa.h:104
INTRA_DC_TL_16x16
#define INTRA_DC_TL_16x16(dir)
Definition: vp9_intra_msa.c:233
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
INTRA_DC_TL_8x8
#define INTRA_DC_TL_8x8(dir)
Definition: vp9_intra_msa.c:180
SW4
#define SW4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:241
INTRA_PREDICT_VALDC_32X32_MSA
#define INTRA_PREDICT_VALDC_32X32_MSA(val)
Definition: vp9_intra_msa.c:337
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_vert_16x16_msa
void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *left, const uint8_t *src)
Definition: vp9_intra_msa.c:31
ff_tm_32x32_msa
void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
Definition: vp9_intra_msa.c:469
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:277
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
ff_hor_32x32_msa
void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, const uint8_t *top)
Definition: vp9_intra_msa.c:81
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
INTRA_DC_TL_4x4
#define INTRA_DC_TL_4x4(dir)
Definition: vp9_intra_msa.c:130
src0
const pixel *const src0
Definition: h264pred_template.c:420
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
ff_dc_4x4_msa
void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
Definition: vp9_intra_msa.c:108
LD
#define LD(psrc)
Definition: generic_macros_msa.h:137
INTRA_PREDICT_VALDC_16X16_MSA
#define INTRA_PREDICT_VALDC_16X16_MSA(val)
Definition: vp9_intra_msa.c:322
INSERT_D2_UB
#define INSERT_D2_UB(...)
Definition: generic_macros_msa.h:1169
src
#define src
Definition: vp8dsp.c:248
ff_tm_4x4_msa
void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top_ptr)
Definition: vp9_intra_msa.c:357
ff_dc_16x16_msa
void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src_left, const uint8_t *src_top)
Definition: vp9_intra_msa.c:209
SAT_UH4_UH
#define SAT_UH4_UH(...)
Definition: generic_macros_msa.h:1575