diff options
author | Shiyou Yin <yinshiyou-hf@loongson.cn> | 2020-06-02 11:02:32 +0800 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2020-07-22 20:17:39 +0000 |
commit | 5c6cdd0747a7bb991e58021f6e977a7f14b28719 (patch) | |
tree | 99f26558145358e12f4eecd13e8d36bf1edc9901 | |
parent | 6d603ec3f57dafddc424ef895e5d903915e94ba6 (diff) | |
download | libyuv-5c6cdd0747a7bb991e58021f6e977a7f14b28719.tar.gz |
ARGBToJ420 MMI and MSA version match C.
In commit 6cd1ff, the C version was updated.
This patch updates the MMI and MSA versions to match the C version.
Change-Id: Iea811e232f9c6019a80364d165f0255a37ce41b4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2227755
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | source/row_mmi.cc | 36 | ||||
-rw-r--r-- | source/row_msa.cc | 222 |
2 files changed, 162 insertions, 96 deletions
diff --git a/source/row_mmi.cc b/source/row_mmi.cc index 57c70a36..9a8e2cb2 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -2632,8 +2632,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, uint64_t src_rgb1; uint64_t ftmp[12]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x002b0054007f0002; - const uint64_t mask_v = 0x0002007f006b0014; + const uint64_t mask_u = 0x0015002a003f0002; + const uint64_t mask_v = 0x0002003f0035000a; __asm__ volatile( "1: \n\t" @@ -2646,8 +2646,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" @@ -2663,8 +2663,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2689,8 +2689,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" @@ 
-2706,8 +2706,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2732,8 +2732,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" @@ -2749,8 +2749,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" @@ -2775,8 +2775,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], 
%[value] \n\t" @@ -2792,8 +2792,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" - "pavgh %[src0], %[src_lo], %[src0] \n\t" - "pavgh %[src1], %[src_hi], %[src1] \n\t" + "paddh %[src0], %[src_lo], %[src0] \n\t" + "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" diff --git a/source/row_msa.cc b/source/row_msa.cc index c1703cf2..27fa4460 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -205,33 +205,37 @@ extern "C" { } #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ - shf0, shf1, shf2, shf3, v_out, u_out) \ + shf0, shf1, shf2, shf3, shift, u_out, v_out) \ { \ - v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ \ - vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ - vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ - vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ - vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ - vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ - vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ - vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ - vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ - reg0_m = __msa_dotp_u_h(vec0_m, const1); \ - reg1_m = __msa_dotp_u_h(vec1_m, const1); \ - reg2_m = __msa_dotp_u_h(vec4_m, const1); \ - reg3_m = __msa_dotp_u_h(vec5_m, const1); \ - reg0_m += const3; \ - reg1_m += const3; \ - reg2_m += const3; \ - reg3_m += const3; \ - reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ - reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ - 
reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ - reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ - v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ - u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const0); \ + reg1_m = __msa_dotp_u_w(vec1_m, const0); \ + reg2_m = __msa_dotp_u_w(vec4_m, const0); \ + reg3_m = __msa_dotp_u_w(vec5_m, const0); \ + reg0_m += const1; \ + reg1_m += const1; \ + reg2_m += const1; \ + reg3_m += const1; \ + reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \ + reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \ + reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \ + reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \ + reg0_m = __msa_srl_w(reg0_m, shift); \ + reg1_m = __msa_srl_w(reg1_m, shift); \ + reg2_m = __msa_srl_w(reg2_m, shift); \ + reg3_m = __msa_srl_w(reg3_m, shift); \ + u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ } // Takes ARGB input and calculates U and V. 
@@ -2530,61 +2534,123 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 vec0, vec1, vec2, vec3; - v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); - v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); - v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3; + v8u16 dst0, dst1, dst2, dst3; + v16u8 zero = {0}; + v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); + v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); + v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); + v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); + v4i32 shift = __msa_fill_w(0x00000008); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((void*)s, 0); - src1 = (v16u8)__msa_ld_b((void*)s, 16); - src2 = (v16u8)__msa_ld_b((void*)s, 32); - src3 = (v16u8)__msa_ld_b((void*)s, 48); - src4 = (v16u8)__msa_ld_b((void*)t, 0); - src5 = (v16u8)__msa_ld_b((void*)t, 16); - src6 = (v16u8)__msa_ld_b((void*)t, 32); - src7 = (v16u8)__msa_ld_b((void*)t, 48); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = 
(v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec0 = __msa_aver_u_b(src4, src6); - vec1 = __msa_aver_u_b(src5, src7); - src0 = (v16u8)__msa_ld_b((void*)s, 64); - src1 = (v16u8)__msa_ld_b((void*)s, 80); - src2 = (v16u8)__msa_ld_b((void*)s, 96); - src3 = (v16u8)__msa_ld_b((void*)s, 112); - src4 = (v16u8)__msa_ld_b((void*)t, 64); - src5 = (v16u8)__msa_ld_b((void*)t, 80); - src6 = (v16u8)__msa_ld_b((void*)t, 96); - src7 = (v16u8)__msa_ld_b((void*)t, 112); - src0 = __msa_aver_u_b(src0, src4); - src1 = __msa_aver_u_b(src1, src5); - src2 = __msa_aver_u_b(src2, src6); - src3 = __msa_aver_u_b(src3, src7); - src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); - src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); - src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); - src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); - vec2 = __msa_aver_u_b(src4, src6); - vec3 = __msa_aver_u_b(src5, src7); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); + src1 = __msa_ld_b((void*)s, 0); + src3 = __msa_ld_b((void*)s, 16); + src5 = __msa_ld_b((void*)t, 0); + src7 = __msa_ld_b((void*)t, 16); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 32); + src3 = __msa_ld_b((void*)s, 48); + src5 = 
__msa_ld_b((void*)t, 32); + src7 = __msa_ld_b((void*)t, 48); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst0, dst1); + + src1 = __msa_ld_b((void*)s, 64); + src3 = __msa_ld_b((void*)s, 80); + src5 = __msa_ld_b((void*)t, 64); + src7 = __msa_ld_b((void*)t, 80); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = __msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec0 = __msa_aver_u_h(src4, src5); + vec1 = __msa_aver_u_h(src6, src7); + + src1 = __msa_ld_b((void*)s, 96); + src3 = __msa_ld_b((void*)s, 112); + src5 = __msa_ld_b((void*)t, 96); + src7 = __msa_ld_b((void*)t, 112); + src0 = __msa_ilvr_b(zero, src1); + src1 = __msa_ilvl_b(zero, src1); + src2 = __msa_ilvr_b(zero, src3); + src3 = __msa_ilvl_b(zero, src3); + src4 = __msa_ilvr_b(zero, src5); + src5 = __msa_ilvl_b(zero, src5); + src6 = __msa_ilvr_b(zero, src7); + src7 = __msa_ilvl_b(zero, src7); + src0 += src4; + src1 += src5; + src2 += src6; + src3 += src7; + src4 = 
__msa_ilvev_d(src1, src0); + src5 = __msa_ilvod_d(src1, src0); + src6 = __msa_ilvev_d(src3, src2); + src7 = __msa_ilvod_d(src3, src2); + vec2 = __msa_aver_u_h(src4, src5); + vec3 = __msa_aver_u_h(src6, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, + const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, + shuffler2, shuffler3, shift, dst2, dst3); + + dst0 = (v8u16)__msa_pckev_b(dst2, dst0); + dst1 = (v8u16)__msa_pckev_b(dst3, dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); s += 128; t += 128; dst_v += 16; |