VirtualBox

source: vbox/trunk/src/libs/libpng-1.6.43/mips/filter_msa_intrinsics.c

Last change on this file was 103316, checked in by vboxsync, 10 months ago

libpng-1.6.42: Applied and adjusted our libpng changes to 1.6.42. bugref:8515

  • Property svn:eol-style set to native
File size: 28.8 KB

/* filter_msa_intrinsics.c - MSA optimised filter functions
 *
 * Copyright (c) 2018-2024 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Written by Mandar Sahastrabuddhe, August 2016
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include <stdio.h>
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -mfpu=msa on the command line: */
#if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <msa.h>
#include <stdint.h>

/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. mips/mips_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'. This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
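
/* Annotation (not part of the original file): a usage sketch of png_ldr.
 * With a pointer of the matching type in scope, e.g.
 *
 *    png_uint_32 *temp_pointer;
 *
 * a 32-bit value can be fetched through the aligned cast as
 *
 *    png_uint_32 v = png_ldr(png_uint_32, rp);
 *
 * where 'rp' stands for any suitably aligned row pointer.
 */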

#if PNG_MIPS_MSA_OPT > 0

#ifdef CLANG_BUILD
   #define MSA_SRLI_B(a, b) __msa_srli_b((v16i8) a, b)

   #define LW(psrc) \
   ( { \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
       uint32_t val_m; \
       \
       asm volatile ( \
           "lw %[val_m], %[psrc_lw_m] \n\t" \
           \
           : [val_m] "=r" (val_m) \
           : [psrc_lw_m] "m" (*psrc_lw_m) \
       ); \
       \
       val_m; \
   } )

   #define SH(val, pdst) \
   { \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
       uint16_t val_m = (val); \
       \
       asm volatile ( \
           "sh %[val_m], %[pdst_sh_m] \n\t" \
           \
           : [pdst_sh_m] "=m" (*pdst_sh_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #define SW(val, pdst) \
   { \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
       uint32_t val_m = (val); \
       \
       asm volatile ( \
           "sw %[val_m], %[pdst_sw_m] \n\t" \
           \
           : [pdst_sw_m] "=m" (*pdst_sw_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #if (__mips == 64)
      #define SD(val, pdst) \
      { \
          uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
          uint64_t val_m = (val); \
          \
          asm volatile ( \
              "sd %[val_m], %[pdst_sd_m] \n\t" \
              \
              : [pdst_sd_m] "=m" (*pdst_sd_m) \
              : [val_m] "r" (val_m) \
          ); \
      }
   #else
      #define SD(val, pdst) \
      { \
          uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
          uint32_t val0_m, val1_m; \
          \
          val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
          val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
          \
          SW(val0_m, pdst_sd_m); \
          SW(val1_m, pdst_sd_m + 4); \
      }
   #endif
#else
   #define MSA_SRLI_B(a, b) (a >> b)

#if (__mips_isa_rev >= 6)
   #define LW(psrc) \
   ( { \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
       uint32_t val_m; \
       \
       asm volatile ( \
           "lw %[val_m], %[psrc_lw_m] \n\t" \
           \
           : [val_m] "=r" (val_m) \
           : [psrc_lw_m] "m" (*psrc_lw_m) \
       ); \
       \
       val_m; \
   } )

   #define SH(val, pdst) \
   { \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
       uint16_t val_m = (val); \
       \
       asm volatile ( \
           "sh %[val_m], %[pdst_sh_m] \n\t" \
           \
           : [pdst_sh_m] "=m" (*pdst_sh_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #define SW(val, pdst) \
   { \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
       uint32_t val_m = (val); \
       \
       asm volatile ( \
           "sw %[val_m], %[pdst_sw_m] \n\t" \
           \
           : [pdst_sw_m] "=m" (*pdst_sw_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #if (__mips == 64)
      #define SD(val, pdst) \
      { \
          uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
          uint64_t val_m = (val); \
          \
          asm volatile ( \
              "sd %[val_m], %[pdst_sd_m] \n\t" \
              \
              : [pdst_sd_m] "=m" (*pdst_sd_m) \
              : [val_m] "r" (val_m) \
          ); \
      }
   #else
      #define SD(val, pdst) \
      { \
          uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
          uint32_t val0_m, val1_m; \
          \
          val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
          val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
          \
          SW(val0_m, pdst_sd_m); \
          SW(val1_m, pdst_sd_m + 4); \
      }
   #endif
#else // !(__mips_isa_rev >= 6)
   #define LW(psrc) \
   ( { \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
       uint32_t val_m; \
       \
       asm volatile ( \
           "ulw %[val_m], %[psrc_lw_m] \n\t" \
           \
           : [val_m] "=r" (val_m) \
           : [psrc_lw_m] "m" (*psrc_lw_m) \
       ); \
       \
       val_m; \
   } )

   #define SH(val, pdst) \
   { \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
       uint16_t val_m = (val); \
       \
       asm volatile ( \
           "ush %[val_m], %[pdst_sh_m] \n\t" \
           \
           : [pdst_sh_m] "=m" (*pdst_sh_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #define SW(val, pdst) \
   { \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
       uint32_t val_m = (val); \
       \
       asm volatile ( \
           "usw %[val_m], %[pdst_sw_m] \n\t" \
           \
           : [pdst_sw_m] "=m" (*pdst_sw_m) \
           : [val_m] "r" (val_m) \
       ); \
   }

   #define SD(val, pdst) \
   { \
       uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
       uint32_t val0_m, val1_m; \
       \
       val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
       val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
       \
       SW(val0_m, pdst_sd_m); \
       SW(val1_m, pdst_sd_m + 4); \
   }

   #define SW_ZERO(pdst) \
   { \
       uint8_t *pdst_m = (uint8_t *) (pdst); \
       \
       asm volatile ( \
           "usw $0, %[pdst_m] \n\t" \
           \
           : [pdst_m] "=m" (*pdst_m) \
           : \
       ); \
   }
#endif // (__mips_isa_rev >= 6)
#endif
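
/* Annotation (not part of the original file): the LW/SH/SW/SD macros above
 * are thin inline-asm wrappers around scalar loads and stores - LW reads
 * 32 bits, SH/SW/SD store 16/32/64 bits - with the pre-R6 variants using
 * the unaligned ulw/ush/usw forms.  A portable sketch of the same idea
 * (an illustration only, not what this file uses) would let the compiler
 * pick the instruction via memcpy from <string.h>:
 *
 *    static uint32_t lw_ref(const void *p)
 *    { uint32_t v; memcpy(&v, p, sizeof v); return v; }
 *
 *    static void sw_ref(uint32_t v, void *p)
 *    { memcpy(p, &v, sizeof v); }
 */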

#define LD_B(RTYPE, psrc) *((RTYPE *) (psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
{ \
   out0 = LD_B(RTYPE, (psrc)); \
   out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
   LD_B2(RTYPE, (psrc), stride, out0, out1); \
   LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
   ST_B(RTYPE, in0, (pdst)); \
   ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
   ST_B2(RTYPE, in0, in1, (pdst), stride); \
   ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ADD2(in0, in1, in2, in3, out0, out1) \
{ \
   out0 = in0 + in1; \
   out1 = in2 + in3; \
}
#define ADD3(in0, in1, in2, in3, in4, in5, \
             out0, out1, out2) \
{ \
   ADD2(in0, in1, in2, in3, out0, out1); \
   out2 = in4 + in5; \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) \
{ \
   ADD2(in0, in1, in2, in3, out0, out1); \
   ADD2(in4, in5, in6, in7, out2, out3); \
}

#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
   out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
   out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
}
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
   out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
   out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
{ \
   v16i8 zero_m = { 0 }; \
   out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
   out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \
{ \
   v16i8 zero_m = { 0 }; \
   SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
   out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)

#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
   out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
   out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)

#define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2) \
{ \
   RTYPE zero = {0}; \
   \
   out0 = __msa_add_a_h((v8i16) zero, in0); \
   out1 = __msa_add_a_h((v8i16) zero, in1); \
   out2 = __msa_add_a_h((v8i16) zero, in2); \
}
#define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__)

#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
   out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
   out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)

#define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0) \
{ \
   v8i16 _sel_h0, _sel_h1; \
   v16u8 _sel_b0, _sel_b1; \
   _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0); \
   _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0); \
   inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0); \
   inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0); \
   _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0); \
   _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1); \
   inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1); \
   out0 += inp4; \
}
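
/* Annotation (not part of the original file): CMP_AND_SELECT appears to be
 * the vectorised Paeth choice used by the paeth filters below.  inp0..inp2
 * hold the absolute differences |p-a|, |p-b| and |p-c|, inp3..inp5 the
 * left, above and upper-left candidate pixels, and the candidate with the
 * smallest difference (ties preferring left, then above) is added to out0.
 */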

void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   size_t i, cnt, cnt16, cnt32;
   size_t istop = row_info->rowbytes;
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

   for (i = 0; i < (istop >> 6); i++)
   {
      LD_UB4(rp, 16, src0, src1, src2, src3);
      LD_UB4(pp, 16, src4, src5, src6, src7);
      pp += 64;

      ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
           src0, src1, src2, src3);

      ST_UB4(src0, src1, src2, src3, rp, 16);
      rp += 64;
   }

   if (istop & 0x3F)
   {
      cnt32 = istop & 0x20;
      cnt16 = istop & 0x10;
      cnt = istop & 0xF;

      if(cnt32)
      {
         if (cnt16 && cnt)
         {
            LD_UB4(rp, 16, src0, src1, src2, src3);
            LD_UB4(pp, 16, src4, src5, src6, src7);

            ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
                 src0, src1, src2, src3);

            ST_UB4(src0, src1, src2, src3, rp, 16);
            rp += 64;
         }
         else if (cnt16 || cnt)
         {
            LD_UB2(rp, 16, src0, src1);
            LD_UB2(pp, 16, src4, src5);
            pp += 32;
            src2 = LD_UB(rp + 32);
            src6 = LD_UB(pp);

            ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2);

            ST_UB2(src0, src1, rp, 16);
            rp += 32;
            ST_UB(src2, rp);
            rp += 16;
         }
         else
         {
            LD_UB2(rp, 16, src0, src1);
            LD_UB2(pp, 16, src4, src5);

            ADD2(src0, src4, src1, src5, src0, src1);

            ST_UB2(src0, src1, rp, 16);
            rp += 32;
         }
      }
      else if (cnt16 && cnt)
      {
         LD_UB2(rp, 16, src0, src1);
         LD_UB2(pp, 16, src4, src5);

         ADD2(src0, src4, src1, src5, src0, src1);

         ST_UB2(src0, src1, rp, 16);
         rp += 32;
      }
      else if (cnt16 || cnt)
      {
         src0 = LD_UB(rp);
         src4 = LD_UB(pp);
         pp += 16;

         src0 += src4;

         ST_UB(src0, rp);
         rp += 16;
      }
   }
}
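
/* Annotation (not part of the original file): for orientation, the scalar
 * form of the Up reconstruction that png_read_filter_row_up_msa vectorises,
 * as defined by the PNG specification - each byte adds the byte directly
 * above it, modulo 256:
 *
 *    for (i = 0; i < row_info->rowbytes; i++)
 *       row[i] = (png_byte)(row[i] + prev_row[i]);
 */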

void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 4;
   int32_t inp0;
   v16u8 src0, src1, src2, src3, src4;
   v16u8 dst0, dst1;
   v16u8 zero = { 0 };

   istop -= 4;

   inp0 = LW(src);
   src += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);

   for (count = 0; count < istop; count += 16)
   {
      src1 = LD_UB(src);
      src += 16;

      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12);
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;
      ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }
}
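
/* Annotation (not part of the original file): scalar form of the Sub
 * reconstruction handled by png_read_filter_row_sub4_msa above (and by the
 * 3-byte variant below with 4 replaced by 3), per the PNG specification -
 * each byte adds the corresponding byte of the previous pixel, so the sums
 * have to be carried serially across the row:
 *
 *    for (i = 4; i < row_info->rowbytes; i++)
 *       row[i] = (png_byte)(row[i] + row[i - 4]);
 */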

void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 3;
   int64_t out0;
   int32_t inp0, out1;
   v16u8 src0, src1, src2, src3, src4, dst0, dst1;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   istop -= 3;

   inp0 = LW(src);
   src += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);

   for (count = 0; count < istop; count += 12)
   {
      src1 = LD_UB(src);
      src += 12;

      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9);
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;
      VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
}

void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 4;
   int32_t inp0, inp1, out0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };

   inp0 = LW(pp);
   pp += 4;
   inp1 = LW(src);
   src += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   out0 = __msa_copy_s_w((v4i32) src1, 0);
   SW(out0, nxt);
   nxt += 4;

   for (i = 0; i < istop; i += 16)
   {
      src2 = LD_UB(pp);
      pp += 16;
      src6 = LD_UB(src);
      src += 16;

      SLDI_B2_0_UB(src2, src6, src3, src7, 4);
      SLDI_B2_0_UB(src2, src6, src4, src8, 8);
      SLDI_B2_0_UB(src2, src6, src5, src9, 12);
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;
      ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }
}
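
/* Annotation (not part of the original file): scalar form of the Average
 * reconstruction handled by png_read_filter_row_avg4_msa above (and by the
 * 3-byte variant below), per the PNG specification - each byte adds the
 * truncated mean of the byte to its left and the byte above it; the first
 * pixel has no left neighbour, so only prev_row/2 contributes there:
 *
 *    for (i = 0; i < 4; i++)
 *       row[i] = (png_byte)(row[i] + (prev_row[i] >> 1));
 *    for (i = 4; i < row_info->rowbytes; i++)
 *       row[i] = (png_byte)(row[i] + ((row[i - 4] + prev_row[i]) >> 1));
 */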

void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 3;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   inp0 = LW(pp);
   pp += 3;
   inp1 = LW(src);
   src += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   out2 = __msa_copy_s_h((v8i16) src1, 0);
   SH(out2, nxt);
   nxt += 2;
   nxt[0] = src1[2];
   nxt++;

   for (i = 0; i < istop; i += 12)
   {
      src2 = LD_UB(pp);
      pp += 12;
      src6 = LD_UB(src);
      src += 12;

      SLDI_B2_0_UB(src2, src6, src3, src7, 3);
      SLDI_B2_0_UB(src2, src6, src4, src8, 6);
      SLDI_B2_0_UB(src2, src6, src5, src9, 9);
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;
      VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
}

void png_read_filter_row_paeth4_msa(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   int32_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int32_t inp0, inp1, res0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
   v16u8 src10, src11, src12, src13, dst0, dst1;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };

   nxt = row;
   prev_nxt = prev_row;

   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);

   src1 += src0;
   res0 = __msa_copy_s_w((v4i32) src1, 0);

   SW(res0, nxt);
   nxt += 4;

   /* Remainder */
   rp_end = row_info->rowbytes - 4;

   for (count = 0; count < rp_end; count += 16)
   {
      src2 = LD_UB(prev_nxt);
      prev_nxt += 16;
      src6 = LD_UB(prev_row);
      prev_row += 16;
      src10 = LD_UB(nxt);

      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12);
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;
      ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }
}
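
/* Annotation (not part of the original file): scalar form of the Paeth
 * reconstruction handled by png_read_filter_row_paeth4_msa above (and by
 * the 3-byte variant below), per the PNG specification.  a = left,
 * b = above, c = upper-left; the candidate closest to p = a + b - c wins,
 * preferring a, then b:
 *
 *    for (i = 0; i < row_info->rowbytes; i++)
 *    {
 *       int a = (i >= 4) ? row[i - 4] : 0;
 *       int b = prev_row[i];
 *       int c = (i >= 4) ? prev_row[i - 4] : 0;
 *       int p = a + b - c;
 *       int pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
 *       int pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
 *       row[i] = (png_byte)(row[i] + pred);
 *    }
 */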

void png_read_filter_row_paeth3_msa(png_row_infop row_info,
                                    png_bytep row,
                                    png_const_bytep prev_row)
{
   int32_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 src10, src11, src12, src13;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   nxt = row;
   prev_nxt = prev_row;

   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);

   src1 += src0;
   out2 = __msa_copy_s_h((v8i16) src1, 0);

   SH(out2, nxt);
   nxt += 2;
   nxt[0] = src1[2];
   nxt++;

   /* Remainder */
   rp_end = row_info->rowbytes - 3;

   for (count = 0; count < rp_end; count += 12)
   {
      src2 = LD_UB(prev_nxt);
      prev_nxt += 12;
      src6 = LD_UB(prev_row);
      prev_row += 12;
      src10 = LD_UB(nxt);

      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9);
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;
      VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
}

#endif /* PNG_MIPS_MSA_OPT > 0 */
#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */