1 files changed, 250 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S b/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S
new file mode 100644
index 0000000..edcb038
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:    /* begin global function symbol f in .text, after a .align 4 */
+#define END(f) .size f, .-f;    /* set the symbol size of f to the distance from its label to this point */
+
+
+.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
+            /* Two-pixel table lookup with Y, Z, X interpolation -> dst; reads x3-x5 and v5 (zero), clobbers x6, x7, v8-v19 */
+            smov        x6, \src0               /* sign-extended 32-bit table offset, pixel 0 */
+            smov        x7, \src1               /* sign-extended 32-bit table offset, pixel 1 */
+
+            add         x6, x6, x3              /* x3 is added as the table base address */
+            add         x7, x7, x3
+            /* every ld1 below fetches 8 bytes: two adjacent 4-byte entries, later blended by the X step */
+            ld1         {v16.2s}, [x6], x4      /* front, first row;  then x6 += x4 */
+            ld1         {v17.2s}, [x7], x4
+
+            ld1         {v18.2s}, [x6], x5      /* front, second row; then x6 += x5 */
+            ld1         {v19.2s}, [x7], x5
+
+            dup         v8.8b, \yr0             /* broadcast the byte-sized Y weight of pixel 0 */
+            dup         v9.8b, \yr1             /* broadcast the byte-sized Y weight of pixel 1 */
+            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
+            zip1        v12.16b, v5.16b, v16.16b    /* halfword = (byte of v16 << 8) | byte of v5; v5 is expected to be zero */
+            zip1        v13.16b, v5.16b, v17.16b
+            umlsl       v12.8h, v16.8b, v8.8b       /* -= first row * weight */
+            umlsl       v13.8h, v17.8b, v9.8b
+            umlal       v12.8h, v18.8b, v8.8b       /* += second row * weight  => a*(256-w) + b*w */
+            umlal       v13.8h, v19.8b, v9.8b
+
+            ld1         {v18.2s}, [x6]          /* rear, second row (x6 = start + x4 + x5) */
+            ld1         {v19.2s}, [x7]
+
+            sub         x6, x6, x4              /* step back to the first row */
+            sub         x7, x7, x4
+
+            ld1         {v16.2s}, [x6]          /* rear, first row */
+            ld1         {v17.2s}, [x7]
+
+            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
+            zip1        v14.16b, v5.16b, v16.16b
+            zip1        v15.16b, v5.16b, v17.16b
+            umlsl       v14.8h, v16.8b, v8.8b
+            umlsl       v15.8h, v17.8b, v9.8b
+            umlal       v14.8h, v18.8b, v8.8b
+            umlal       v15.8h, v19.8b, v9.8b
+
+            /* Z interpolate, lane 0 v12/v14 -> v10 */
+            ushll       v8.4s, v12.4h, #8       /* widen to 32 bits: front * 256 */
+            ushll2      v9.4s, v12.8h, #8
+            umlsl       v8.4s, v12.4h, \zr0     /* -= front * weight */
+            umlsl2      v9.4s, v12.8h, \zr0
+            umlal       v8.4s, v14.4h, \zr0     /* += rear * weight */
+            umlal2      v9.4s, v14.8h, \zr0
+            rshrn       v10.4h, v8.4s, #8       /* rounding narrow back to 16 bits */
+            rshrn2      v10.8h, v9.4s, #8
+
+            /* Z interpolate, lane 1 v13/v15 -> v11 */
+            ushll       v8.4s, v13.4h, #8
+            ushll2      v9.4s, v13.8h, #8
+            umlsl       v8.4s, v13.4h, \zr1
+            umlsl2      v9.4s, v13.8h, \zr1
+            umlal       v8.4s, v15.4h, \zr1
+            umlal2      v9.4s, v15.8h, \zr1
+            rshrn       v11.4h, v8.4s, #8
+            rshrn2      v11.8h, v9.4s, #8
+
+            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
+            ushll       v8.4s, v10.4h, #8       /* low four halfwords: the first of the two loaded entries */
+            ushll       v9.4s, v11.4h, #8
+            umlsl       v8.4s, v10.4h, \xr0
+            umlsl       v9.4s, v11.4h, \xr1
+            umlal2      v8.4s, v10.8h, \xr0     /* high four halfwords: the adjacent entry */
+            umlal2      v9.4s, v11.8h, \xr1
+            shrn        v14.4h, v8.4s, #8       /* truncating narrow; the pack below does the rounding */
+            shrn2       v14.8h, v9.4s, #8
+
+            /* pack lanes 0-1 -> dst: v20.16b/v21.16b fill the upper half, any other dst its low 8 bytes */
+.ifc \dst, v20.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else ; .ifc \dst, v21.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else
+            uqrshrn     \dst, v14.8h, #8
+.endif ; .endif
+.endm
+
+/* void rsdIntrinsic3DLUT_K(
+ *          void *dst,          // x0
+ *          void const *in,     // x1
+ *          size_t count,       // x2
+ *          void const *lut,    // x3
+ *          int32_t pitchy,     // w4
+ *          int32_t pitchz,     // w5
+ *          int dimx,           // w6
+ *          int dimy,           // w7
+ *          int dimz);          // [sp]
+ *
+ * Maps count 4-byte pixels from in to dst through the 3D table at lut: bytes
+ * 0-2 of each pixel select and blend table entries, byte 3 is copied through.
+ * Eight pixels are handled per pass; a last group of 1-7 is loaded and stored
+ * lane by lane.  Nothing is stored when count <= 0 (signed compare).
+ * Modifies x0-x2, x4-x8, v0-v23 and the flags; d8-d15 are saved/restored.
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+            ldr         w8, [sp]                // dimz, fetched before sp moves
+            sxtw        x4, w4                  // pitchy/pitchz are int32: bits 63:32 of x4/x5 are unspecified on entry,
+            sxtw        x5, w5                  // yet lanepair adds the full 64-bit registers to table addresses
+            stp         d8, d9, [sp, #-64]!     // save d8-d15 (callee-saved under AAPCS64)
+            stp         d10, d11, [sp, #16]
+            stp         d12, d13, [sp, #32]
+            stp         d14, d15, [sp, #48]
+            movi        v4.8b, #1
+            ins         v4.h[0], w6             // v4.h[0..2] = dimx, dimy, dimz
+            ins         v4.h[1], w7
+            ins         v4.h[2], w8
+            ins         v4.s[2], w4             // v4.s[2..3] = pitchy, pitchz
+            ins         v4.s[3], w5
+            movi        v5.16b, #0              // all-zero vector consumed by lanepair
+
+            subs        x2, x2, #8              // x2 = count - 8
+            bge         2f                      // a full group of eight is available
+            cmn         x2, #8    // same as cmp x2, #-8
+            ble         9f                      // count <= 0: nothing to do
+            b           4f                      // 1-7 pixels: lane-by-lane load
+
+            .align 6
+1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+/* x0  = dst
+ * x1  = src
+ * x2  = count
+ * x3  = lut
+ * x4  = pitchy
+ * x5  = pitchz
+ * x6, x7 = offset0, offset1 (overwritten inside lanepair)
+ */
+2:          ld4         {v0.8b-v3.8b}, [x1], #32
+/* v0,v1,v2,v3 source data, one byte channel per register; v4 dimensions and pitches */
+3:          uxtl        v0.8h, v0.8b
+            uxtl        v1.8h, v1.8b
+            uxtl        v2.8h, v2.8b
+            mul         v0.8h, v0.8h, v4.h[0]   // 8-bit channel * dimension, per axis
+            mul         v1.8h, v1.8h, v4.h[1]
+            mul         v2.8h, v2.8h, v4.h[2]
+
+/* ursra below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero.  Strictly this is
+ * correct, except for the illegal access problem.
+ */
+            usra        v0.8h, v0.8h, #8        // v += v >> 8 (truncating)
+            usra        v1.8h, v1.8h, #8
+            usra        v2.8h, v2.8h, #8
+
+            ushr        v12.8h, v0.8h, #8       // high byte: integer table index
+            ushr        v13.8h, v1.8h, #8
+            ushr        v14.8h, v2.8h, #8
+            bic         v0.8h, #0xff, LSL #8    // keep the low byte: interpolation fraction
+            xtn         v1.8b, v1.8h            // Y fraction is consumed as bytes
+            bic         v2.8h, #0xff, LSL #8
+
+/* v0.8h,v1.8b,v2.8h fractional offset
+ * v12.8h,v13.8h,v14.8h integer offset
+ */
+
+            ushll       v6.4s, v12.4h, #2       // x index * 4 bytes per table entry
+            ushll2      v7.4s, v12.8h, #2
+            uxtl        v8.4s, v13.4h
+            uxtl2       v9.4s, v13.8h
+            uxtl        v10.4s, v14.4h
+            uxtl2       v11.4s, v14.8h
+            mla         v6.4s, v8.4s,  v4.s[2]  // + y index * pitchy
+            mla         v7.4s, v9.4s,  v4.s[2]
+            mla         v6.4s, v10.4s, v4.s[3]  // + z index * pitchz
+            mla         v7.4s, v11.4s, v4.s[3]
+
+/* v6,v7 list of table offsets */
+        /* lanes 0 and 1 */
+            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
+        /* lanes 2 and 3 */
+            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
+        /* lanes 4 and 5 */
+            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
+        /* lanes 6 and 7 */
+            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
+
+            uzp1        v6.16b, v20.16b, v21.16b    // bytes 0 and 2 of each packed result
+            uzp2        v7.16b, v20.16b, v21.16b    // bytes 1 and 3 of each packed result
+            uzp1        v20.16b, v6.16b, v7.16b     // low half: byte 0 x8, high half: byte 1 x8
+            uzp2        v22.16b, v6.16b, v7.16b     // low half: byte 2 x8
+            mov         v21.d[0], v20.d[1]          // v21.8b = byte 1 x8
+
+            subs        x2, x2, #8
+            mov         v23.8b, v3.8b           // fourth byte comes straight from the source
+            bge         1b                      // another full group remains: store and reload
+
+            cmn         x2, #8    // same as cmp x2, #-8
+            blt         1f                      // x2 < -8: this group was a partial one
+
+            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+            beq         9f                      // flags still from cmn: x2 == -8, nothing left
+
+            /* fill the vector with a safe value: unused lanes are still looked up, so give them a real pixel */
+4:          ld4r        {v0.8b-v3.8b}, [x1]
+            tbz         x2, #2, 2f              // bits 2..0 of x2 equal those of the remaining count
+            ld4         {v0.b-v3.b}[0], [x1], #4
+            ld4         {v0.b-v3.b}[1], [x1], #4
+            ld4         {v0.b-v3.b}[2], [x1], #4
+            ld4         {v0.b-v3.b}[3], [x1], #4
+2:          tbz         x2, #1, 2f
+            ld4         {v0.b-v3.b}[4], [x1], #4
+            ld4         {v0.b-v3.b}[5], [x1], #4
+2:          tbz         x2, #0, 2f
+            ld4         {v0.b-v3.b}[6], [x1], #4
+2:          b           3b
+
+1:          tst         x2, #4                  // partial store mirrors the lane layout of the partial load
+            beq         2f
+            st4         {v20.b-v23.b}[0], [x0], #4
+            st4         {v20.b-v23.b}[1], [x0], #4
+            st4         {v20.b-v23.b}[2], [x0], #4
+            st4         {v20.b-v23.b}[3], [x0], #4
+2:          tst         x2, #2
+            beq         2f
+            st4         {v20.b-v23.b}[4], [x0], #4
+            st4         {v20.b-v23.b}[5], [x0], #4
+2:          tst         x2, #1
+            beq         9f
+            st4         {v20.b-v23.b}[6], [x0], #4
+
+9:          ldp         d14, d15, [sp, #48]
+            ldp         d12, d13, [sp, #32]
+            ldp         d10, d11, [sp, #16]
+            ldp         d8, d9, [sp], #64
+            ret
+END(rsdIntrinsic3DLUT_K)