Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Blend_advsimd.S')
-rw-r--r-- | renderscript-toolkit/src/main/cpp/Blend_advsimd.S | 622 |
1 file changed, 622 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Blend_advsimd.S b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
new file mode 100644
index 0000000..e5cb29b
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+#define BLEND_LIST(X) \
+    X(0, CLEAR) \
+    X(1, SRC) \
+    X(2, DST) \
+    X(3, SRC_OVER) \
+    X(4, DST_OVER) \
+    X(5, SRC_IN) \
+    X(6, DST_IN) \
+    X(7, SRC_OUT) \
+    X(8, DST_OUT) \
+    X(9, SRC_ATOP) \
+    X(10, DST_ATOP) \
+    X(11, XOR) \
+    X(12, MULTIPLY) \
+    X(13, ADD) \
+    X(14, SUBTRACT)
+
+/* This operation was not enabled in the original RenderScript. We could
+ * enable it.
+ *
+ *  X(15, DIFFERENCE) \
+ */
+
+/* For every blend operation supported, define a macro with just the arithmetic
+ * component. The rest can be handled later on.
+ *
+ * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
+ * contain the data from the source buffer. Both have already been split out
+ * into one colour component per register (if necessary). q3 and q11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ *    zipped=0 -- The macro does not require the RGBA components to be
+ *                separated.
+ *    lddst=0  -- The macro does not require data from the destination buffer.
+ *    ldsrc=0  -- The macro does not require data from the source buffer.
+ *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ *                inserted without any surrounding load/store or loop code.
+ */
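Nearly every kernel below scales a colour channel by an alpha value and divides by 255 with the same four-instruction pattern: umull (widening multiply), rshrn #8 (rounding narrow), uaddw, then a second rshrn #8. A minimal scalar C model of that pattern, as a reading aid; the helper name mul_div255 is invented here and is not part of the file:

    #include <stdint.h>

    /* Scalar model of the umull / rshrn #8 / uaddw / rshrn #8 sequence.
     * It computes (p + ((p + 128) >> 8) + 128) >> 8, which is an exact
     * rounding division of the 16-bit product p by 255. */
    static inline uint8_t mul_div255(uint8_t x, uint8_t a)
    {
        unsigned p = (unsigned)x * a;         /* umull: widening multiply  */
        unsigned r = (p + 128) >> 8;          /* rshrn #8: rounding narrow */
        return (uint8_t)((p + r + 128) >> 8); /* uaddw, then rshrn #8      */
    }

The SIMD code interleaves two copies of this computation (the low and high halves of each 16-byte register) to keep the pipeline busy.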
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+        movi    v0.16b, #0
+        movi    v1.16b, #0
+        movi    v2.16b, #0
+        movi    v3.16b, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+        mov     v0.16b, v8.16b
+        mov     v1.16b, v9.16b
+        mov     v2.16b, v10.16b
+        mov     v3.16b, v11.16b
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+        /* nop */
+.endm
+
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+        mvn     v7.16b, v11.16b
+
+        umull2  v12.8h, v7.16b, v0.16b
+        umull   v0.8h, v7.8b, v0.8b
+        umull2  v13.8h, v7.16b, v1.16b
+        umull   v1.8h, v7.8b, v1.8b
+        umull2  v14.8h, v7.16b, v2.16b
+        umull   v2.8h, v7.8b, v2.8b
+        umull2  v15.8h, v7.16b, v3.16b
+        umull   v3.8h, v7.8b, v3.8b
+
+        rshrn   v4.8b, v0.8h, #8
+        rshrn2  v4.16b, v12.8h, #8
+        rshrn   v5.8b, v1.8h, #8
+        rshrn2  v5.16b, v13.8h, #8
+        rshrn   v6.8b, v2.8h, #8
+        rshrn2  v6.16b, v14.8h, #8
+        rshrn   v7.8b, v3.8h, #8
+        rshrn2  v7.16b, v15.8h, #8
+
+        uaddw   v0.8h, v0.8h, v4.8b
+        uaddw2  v12.8h, v12.8h, v4.16b
+        uaddw   v1.8h, v1.8h, v5.8b
+        uaddw2  v13.8h, v13.8h, v5.16b
+        uaddw   v2.8h, v2.8h, v6.8b
+        uaddw2  v14.8h, v14.8h, v6.16b
+        uaddw   v3.8h, v3.8h, v7.8b
+        uaddw2  v15.8h, v15.8h, v7.16b
+
+        rshrn   v0.8b, v0.8h, #8
+        rshrn2  v0.16b, v12.8h, #8
+        rshrn   v1.8b, v1.8h, #8
+        rshrn2  v1.16b, v13.8h, #8
+        rshrn   v2.8b, v2.8h, #8
+        rshrn2  v2.16b, v14.8h, #8
+        rshrn   v3.8b, v3.8h, #8
+        rshrn2  v3.16b, v15.8h, #8
+
+        uqadd   v0.16b, v0.16b, v8.16b
+        uqadd   v1.16b, v1.16b, v9.16b
+        uqadd   v2.16b, v2.16b, v10.16b
+        uqadd   v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+        mvn     v7.16b, v3.16b
+
+        umull2  v12.8h, v7.16b, v8.16b
+        umull   v8.8h, v7.8b, v8.8b
+        umull2  v13.8h, v7.16b, v9.16b
+        umull   v9.8h, v7.8b, v9.8b
+        umull2  v14.8h, v7.16b, v10.16b
+        umull   v10.8h, v7.8b, v10.8b
+        umull2  v15.8h, v7.16b, v11.16b
+        umull   v11.8h, v7.8b, v11.8b
+
+        rshrn   v4.8b, v8.8h, #8
+        rshrn2  v4.16b, v12.8h, #8
+        rshrn   v5.8b, v9.8h, #8
+        rshrn2  v5.16b, v13.8h, #8
+        rshrn   v6.8b, v10.8h, #8
+        rshrn2  v6.16b, v14.8h, #8
+        rshrn   v7.8b, v11.8h, #8
+        rshrn2  v7.16b, v15.8h, #8
+
+        uaddw   v8.8h, v8.8h, v4.8b
+        uaddw2  v12.8h, v12.8h, v4.16b
+        uaddw   v9.8h, v9.8h, v5.8b
+        uaddw2  v13.8h, v13.8h, v5.16b
+        uaddw   v10.8h, v10.8h, v6.8b
+        uaddw2  v14.8h, v14.8h, v6.16b
+        uaddw   v11.8h, v11.8h, v7.8b
+        uaddw2  v15.8h, v15.8h, v7.16b
+
+        rshrn   v8.8b, v8.8h, #8
+        rshrn2  v8.16b, v12.8h, #8
+        rshrn   v9.8b, v9.8h, #8
+        rshrn2  v9.16b, v13.8h, #8
+        rshrn   v10.8b, v10.8h, #8
+        rshrn2  v10.16b, v14.8h, #8
+        rshrn   v11.8b, v11.8h, #8
+        rshrn2  v11.16b, v15.8h, #8
+
+        uqadd   v0.16b, v0.16b, v8.16b
+        uqadd   v1.16b, v1.16b, v9.16b
+        uqadd   v2.16b, v2.16b, v10.16b
+        uqadd   v3.16b, v3.16b, v11.16b
+.endm
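In scalar terms, blend_kernel_SRC_OVER computes out = src + dst * (255 - src.a) / 255 per channel, and DST_OVER the same with the two buffers' roles swapped. A sketch using the hypothetical mul_div255 helper above, with the final uqadd saturation made explicit (rgba8 and sat_add are likewise invented for illustration):

    typedef struct { uint8_t r, g, b, a; } rgba8;

    static uint8_t sat_add(uint8_t x, uint8_t y)   /* uqadd on one byte */
    {
        unsigned t = (unsigned)x + y;
        return t > 255 ? 255 : (uint8_t)t;
    }

    static rgba8 src_over(rgba8 dst, rgba8 src)
    {
        uint8_t ia = (uint8_t)(255 - src.a);       /* mvn v7.16b, v11.16b */
        rgba8 out;
        out.r = sat_add(src.r, mul_div255(dst.r, ia));
        out.g = sat_add(src.g, mul_div255(dst.g, ia));
        out.b = sat_add(src.b, mul_div255(dst.b, ia));
        out.a = sat_add(src.a, mul_div255(dst.a, ia));
        return out;
    }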
+
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+        umull2  v12.8h, v3.16b, v8.16b
+        umull   v0.8h, v3.8b, v8.8b
+        umull2  v13.8h, v3.16b, v9.16b
+        umull   v1.8h, v3.8b, v9.8b
+        umull2  v14.8h, v3.16b, v10.16b
+        umull   v2.8h, v3.8b, v10.8b
+        umull2  v15.8h, v3.16b, v11.16b
+        umull   v3.8h, v3.8b, v11.8b
+
+        rshrn   v4.8b, v0.8h, #8
+        rshrn2  v4.16b, v12.8h, #8
+        rshrn   v5.8b, v1.8h, #8
+        rshrn2  v5.16b, v13.8h, #8
+        rshrn   v6.8b, v2.8h, #8
+        rshrn2  v6.16b, v14.8h, #8
+        rshrn   v7.8b, v3.8h, #8
+        rshrn2  v7.16b, v15.8h, #8
+
+        uaddw   v0.8h, v0.8h, v4.8b
+        uaddw2  v12.8h, v12.8h, v4.16b
+        uaddw   v1.8h, v1.8h, v5.8b
+        uaddw2  v13.8h, v13.8h, v5.16b
+        uaddw   v2.8h, v2.8h, v6.8b
+        uaddw2  v14.8h, v14.8h, v6.16b
+        uaddw   v3.8h, v3.8h, v7.8b
+        uaddw2  v15.8h, v15.8h, v7.16b
+
+        rshrn   v0.8b, v0.8h, #8
+        rshrn2  v0.16b, v12.8h, #8
+        rshrn   v1.8b, v1.8h, #8
+        rshrn2  v1.16b, v13.8h, #8
+        rshrn   v2.8b, v2.8h, #8
+        rshrn2  v2.16b, v14.8h, #8
+        rshrn   v3.8b, v3.8h, #8
+        rshrn2  v3.16b, v15.8h, #8
+.endm
+
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+        umull2  v12.8h, v0.16b, v11.16b
+        umull   v0.8h, v0.8b, v11.8b
+        umull2  v13.8h, v1.16b, v11.16b
+        umull   v1.8h, v1.8b, v11.8b
+        umull2  v14.8h, v2.16b, v11.16b
+        umull   v2.8h, v2.8b, v11.8b
+        umull2  v15.8h, v3.16b, v11.16b
+        umull   v3.8h, v3.8b, v11.8b
+
+        rshrn   v4.8b, v0.8h, #8
+        rshrn2  v4.16b, v12.8h, #8
+        rshrn   v5.8b, v1.8h, #8
+        rshrn2  v5.16b, v13.8h, #8
+        rshrn   v6.8b, v2.8h, #8
+        rshrn2  v6.16b, v14.8h, #8
+        rshrn   v7.8b, v3.8h, #8
+        rshrn2  v7.16b, v15.8h, #8
+
+        uaddw   v0.8h, v0.8h, v4.8b
+        uaddw2  v12.8h, v12.8h, v4.16b
+        uaddw   v1.8h, v1.8h, v5.8b
+        uaddw2  v13.8h, v13.8h, v5.16b
+        uaddw   v2.8h, v2.8h, v6.8b
+        uaddw2  v14.8h, v14.8h, v6.16b
+        uaddw   v3.8h, v3.8h, v7.8b
+        uaddw2  v15.8h, v15.8h, v7.16b
+
+        rshrn   v0.8b, v0.8h, #8
+        rshrn2  v0.16b, v12.8h, #8
+        rshrn   v1.8b, v1.8h, #8
+        rshrn2  v1.16b, v13.8h, #8
+        rshrn   v2.8b, v2.8h, #8
+        rshrn2  v2.16b, v14.8h, #8
+        rshrn   v3.8b, v3.8h, #8
+        rshrn2  v3.16b, v15.8h, #8
+.endm
+
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+        mvn     v3.16b, v3.16b
+        blend_kernel_SRC_IN
+.endm
+
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+        mvn     v11.16b, v11.16b
+        blend_kernel_DST_IN
+.endm
+
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+        mvn     v11.16b, v11.16b
+
+        umull2  v12.8h, v11.16b, v0.16b
+        umull   v0.8h, v11.8b, v0.8b
+        umull2  v13.8h, v11.16b, v1.16b
+        umull   v1.8h, v11.8b, v1.8b
+        umull2  v14.8h, v11.16b, v2.16b
+        umull   v2.8h, v11.8b, v2.8b
+
+        umull2  v4.8h, v3.16b, v8.16b
+        umull   v8.8h, v3.8b, v8.8b
+        umull2  v5.8h, v3.16b, v9.16b
+        umull   v9.8h, v3.8b, v9.8b
+        umull2  v6.8h, v3.16b, v10.16b
+        umull   v10.8h, v3.8b, v10.8b
+
+        uqadd   v12.8h, v12.8h, v4.8h
+        uqadd   v0.8h, v0.8h, v8.8h
+        uqadd   v13.8h, v13.8h, v5.8h
+        uqadd   v1.8h, v1.8h, v9.8h
+        uqadd   v14.8h, v14.8h, v6.8h
+        uqadd   v2.8h, v2.8h, v10.8h
+
+        urshr   v8.8h, v0.8h, #8
+        urshr   v4.8h, v12.8h, #8
+        urshr   v9.8h, v1.8h, #8
+        urshr   v5.8h, v13.8h, #8
+        urshr   v10.8h, v2.8h, #8
+        urshr   v6.8h, v14.8h, #8
+
+        uqadd   v0.8h, v0.8h, v8.8h
+        uqadd   v12.8h, v12.8h, v4.8h
+        uqadd   v1.8h, v1.8h, v9.8h
+        uqadd   v13.8h, v13.8h, v5.8h
+        uqadd   v2.8h, v2.8h, v10.8h
+        uqadd   v14.8h, v14.8h, v6.8h
+
+        uqrshrn  v0.8b, v0.8h, #8
+        uqrshrn2 v0.16b, v12.8h, #8
+        uqrshrn  v1.8b, v1.8h, #8
+        uqrshrn2 v1.16b, v13.8h, #8
+        uqrshrn  v2.8b, v2.8h, #8
+        uqrshrn2 v2.16b, v14.8h, #8
+.endm
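blend_kernel_SRC_IN simply scales every source channel (alpha included) by the destination alpha; the OUT variants reuse the IN kernels after complementing the relevant alpha with mvn, and the ATOP kernels combine two such products per channel. A scalar sketch of SRC_IN, reusing the hypothetical rgba8 type and mul_div255 helper from above:

    static rgba8 src_in(rgba8 dst, rgba8 src)
    {
        rgba8 out;
        out.r = mul_div255(src.r, dst.a);  /* umull v0.8h, v3.8b, v8.8b ... */
        out.g = mul_div255(src.g, dst.a);
        out.b = mul_div255(src.b, dst.a);
        out.a = mul_div255(src.a, dst.a);
        return out;
    }

    /* SRC_OUT is src_in() with dst.a replaced by 255 - dst.a; DST_IN and
     * DST_OUT mirror the pair with source and destination exchanged. */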
+
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP
+        mvn     v3.16b, v3.16b
+
+        umull2  v12.8h, v11.16b, v0.16b
+        umull   v0.8h, v11.8b, v0.8b
+        umull2  v13.8h, v11.16b, v1.16b
+        umull   v1.8h, v11.8b, v1.8b
+        umull2  v14.8h, v11.16b, v2.16b
+        umull   v2.8h, v11.8b, v2.8b
+
+        umull2  v4.8h, v3.16b, v8.16b
+        umull   v8.8h, v3.8b, v8.8b
+        umull2  v5.8h, v3.16b, v9.16b
+        umull   v9.8h, v3.8b, v9.8b
+        umull2  v6.8h, v3.16b, v10.16b
+        umull   v10.8h, v3.8b, v10.8b
+
+        uqadd   v12.8h, v12.8h, v4.8h
+        uqadd   v0.8h, v0.8h, v8.8h
+        uqadd   v13.8h, v13.8h, v5.8h
+        uqadd   v1.8h, v1.8h, v9.8h
+        uqadd   v14.8h, v14.8h, v6.8h
+        uqadd   v2.8h, v2.8h, v10.8h
+
+        urshr   v8.8h, v0.8h, #8
+        urshr   v4.8h, v12.8h, #8
+        urshr   v9.8h, v1.8h, #8
+        urshr   v5.8h, v13.8h, #8
+        urshr   v10.8h, v2.8h, #8
+        urshr   v6.8h, v14.8h, #8
+
+        uqadd   v0.8h, v0.8h, v8.8h
+        uqadd   v12.8h, v12.8h, v4.8h
+        uqadd   v1.8h, v1.8h, v9.8h
+        uqadd   v13.8h, v13.8h, v5.8h
+        uqadd   v2.8h, v2.8h, v10.8h
+        uqadd   v14.8h, v14.8h, v6.8h
+
+        uqrshrn  v0.8b, v0.8h, #8
+        uqrshrn2 v0.16b, v12.8h, #8
+        uqrshrn  v1.8b, v1.8h, #8
+        uqrshrn2 v1.16b, v13.8h, #8
+        uqrshrn  v2.8b, v2.8h, #8
+        uqrshrn2 v2.16b, v14.8h, #8
+
+        mov     v3.16b, v11.16b
+.endm
+
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY
+        umull2  v12.8h, v0.16b, v8.16b
+        umull   v0.8h, v0.8b, v8.8b
+        umull2  v13.8h, v1.16b, v9.16b
+        umull   v1.8h, v1.8b, v9.8b
+        umull2  v14.8h, v2.16b, v10.16b
+        umull   v2.8h, v2.8b, v10.8b
+        umull2  v15.8h, v3.16b, v11.16b
+        umull   v3.8h, v3.8b, v11.8b
+
+        rshrn   v4.8b, v0.8h, #8
+        rshrn2  v4.16b, v12.8h, #8
+        rshrn   v5.8b, v1.8h, #8
+        rshrn2  v5.16b, v13.8h, #8
+        rshrn   v6.8b, v2.8h, #8
+        rshrn2  v6.16b, v14.8h, #8
+        rshrn   v7.8b, v3.8h, #8
+        rshrn2  v7.16b, v15.8h, #8
+
+        uaddw   v0.8h, v0.8h, v4.8b
+        uaddw2  v12.8h, v12.8h, v4.16b
+        uaddw   v1.8h, v1.8h, v5.8b
+        uaddw2  v13.8h, v13.8h, v5.16b
+        uaddw   v2.8h, v2.8h, v6.8b
+        uaddw2  v14.8h, v14.8h, v6.16b
+        uaddw   v3.8h, v3.8h, v7.8b
+        uaddw2  v15.8h, v15.8h, v7.16b
+
+        rshrn   v0.8b, v0.8h, #8
+        rshrn2  v0.16b, v12.8h, #8
+        rshrn   v1.8b, v1.8h, #8
+        rshrn2  v1.16b, v13.8h, #8
+        rshrn   v2.8b, v2.8h, #8
+        rshrn2  v2.16b, v14.8h, #8
+        rshrn   v3.8b, v3.8h, #8
+        rshrn2  v3.16b, v15.8h, #8
+.endm
+
+#define params_ADD zipped=0
+.macro blend_kernel_ADD
+        uqadd   v0.16b, v0.16b, v8.16b
+        uqadd   v1.16b, v1.16b, v9.16b
+        uqadd   v2.16b, v2.16b, v10.16b
+        uqadd   v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT
+        uqsub   v0.16b, v0.16b, v8.16b
+        uqsub   v1.16b, v1.16b, v9.16b
+        uqsub   v2.16b, v2.16b, v10.16b
+        uqsub   v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE
+        uabd    v0.16b, v0.16b, v8.16b
+        uabd    v1.16b, v1.16b, v9.16b
+        uabd    v2.16b, v2.16b, v10.16b
+        uabd    v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_XOR zipped=0
+.macro blend_kernel_XOR
+        eor     v0.16b, v0.16b, v8.16b
+        eor     v1.16b, v1.16b, v9.16b
+        eor     v2.16b, v2.16b, v10.16b
+        eor     v3.16b, v3.16b, v11.16b
+.endm
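The kernels above from MULTIPLY onward are declared zipped=0 because they treat every byte identically, so the wrapper can skip the RGBA deinterleave. Their per-byte scalar equivalents, reusing the earlier hypothetical helpers (function names are ours):

    static uint8_t k_multiply(uint8_t d, uint8_t s)  { return mul_div255(d, s); }
    static uint8_t k_add(uint8_t d, uint8_t s)       { return sat_add(d, s); }           /* uqadd */
    static uint8_t k_subtract(uint8_t d, uint8_t s)  { return d > s ? d - s : 0; }       /* uqsub */
    static uint8_t k_difference(uint8_t d, uint8_t s){ return d > s ? d - s : s - d; }   /* uabd  */
    static uint8_t k_xor(uint8_t d, uint8_t s)       { return d ^ s; }                   /* eor   */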
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+        \kernel
+.else
+        sub     x3, sp, #32
+        sub     sp, sp, #64
+        st1     {v8.1d - v11.1d}, [sp]
+        st1     {v12.1d - v15.1d}, [x3]
+        subs    x2, x2, #64
+        b       2f
+.align 4
+1:
+  .if \lddst
+    .if \zipped
+        ld4     {v0.16b - v3.16b}, [x0]
+    .else
+        ld1     {v0.16b - v3.16b}, [x0]
+    .endif
+  .endif
+  .if \ldsrc
+    .if \zipped
+        ld4     {v8.16b - v11.16b}, [x1], #64
+    .else
+        ld1     {v8.16b - v11.16b}, [x1], #64
+    .endif
+  .endif
+  .if \pld
+#if 0 /* TODO: test this on real hardware */
+    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
+    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
+#endif
+  .endif
+
+        \kernel
+
+        subs    x2, x2, #64
+  .if \zipped
+        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+  .else
+        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+  .endif
+
+2:      bge     1b
+        adds    x2, x2, #64
+        beq     2f
+
+        /* To handle the tail portion of the data (something less than 64
+         * bytes) load small power-of-two chunks into working registers. It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * operations don't require data to interact with its neighbours.
+         */
+        movi    v0.16b, #0
+        movi    v1.16b, #0
+        movi    v2.16b, #0
+        movi    v3.16b, #0
+
+        movi    v8.16b, #0
+        movi    v9.16b, #0
+        movi    v10.16b, #0
+        movi    v11.16b, #0
+
+        tbz     x2, #5, 1f
+  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
+  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
+1:      tbz     x2, #4, 1f
+  .if \lddst ; ld1     {v1.16b}, [x0], #16 ; .endif
+  .if \ldsrc ; ld1     {v9.16b}, [x1], #16 ; .endif
+1:      tbz     x2, #3, 1f
+  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
+  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
+1:      tbz     x2, #2, 1f
+  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
+  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
+1:      tbz     x2, #1, 1f
+  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
+  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
+1:      tbz     x2, #0, 1f
+  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
+  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
+1:
+  .if \lddst ; sub     x0, x0, x2 ; .endif
+
+.if \zipped
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register. So the data is loaded
+         * linearly and unpacked manually at this point.
+         */
+        uzp1    v4.16b, v0.16b, v1.16b
+        uzp2    v5.16b, v0.16b, v1.16b
+        uzp1    v6.16b, v2.16b, v3.16b
+        uzp2    v7.16b, v2.16b, v3.16b
+        uzp1    v0.16b, v4.16b, v6.16b
+        uzp2    v2.16b, v4.16b, v6.16b
+        uzp1    v1.16b, v5.16b, v7.16b
+        uzp2    v3.16b, v5.16b, v7.16b
+
+        uzp1    v4.16b, v8.16b, v9.16b
+        uzp2    v5.16b, v8.16b, v9.16b
+        uzp1    v6.16b, v10.16b, v11.16b
+        uzp2    v7.16b, v10.16b, v11.16b
+        uzp1    v8.16b, v4.16b, v6.16b
+        uzp2    v10.16b, v4.16b, v6.16b
+        uzp1    v9.16b, v5.16b, v7.16b
+        uzp2    v11.16b, v5.16b, v7.16b
+
+        \kernel
+
+        zip1    v4.16b, v0.16b, v2.16b
+        zip2    v6.16b, v0.16b, v2.16b
+        zip1    v5.16b, v1.16b, v3.16b
+        zip2    v7.16b, v1.16b, v3.16b
+        zip1    v0.16b, v4.16b, v5.16b
+        zip2    v1.16b, v4.16b, v5.16b
+        zip1    v2.16b, v6.16b, v7.16b
+        zip2    v3.16b, v6.16b, v7.16b
+.else
+        \kernel
+.endif
+
+        tbz     x2, #5, 1f
+        st1     {v2.16b,v3.16b}, [x0], #32
+1:      tbz     x2, #4, 1f
+        st1     {v1.16b}, [x0], #16
+1:      tbz     x2, #3, 1f
+        st1     {v0.d}[1], [x0], #8
+1:      tbz     x2, #2, 1f
+        st1     {v0.s}[1], [x0], #4
+1:      tbz     x2, #1, 1f
+        st1     {v0.h}[1], [x0], #2
+1:      tbz     x2, #0, 2f
+        st1     {v0.b}[1], [x0], #1
+2:      ld1     {v8.1d - v11.1d}, [sp], #32
+        ld1     {v12.1d - v15.1d}, [sp], #32
+.endif
+        mov     x0, #0
+        ret
+.endm
+
+
+/* produce list of blend_line_XX() functions; each function uses the wrap_line
+ * macro, passing it the name of the operation macro it wants along with
+ * optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+#define BLEND_X(d, n) .set tablesize, d+1 ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+/* int rsdIntrinsicBlend_K(
+ *         uchar4 *out,        // x0
+ *         uchar4 const *in,   // x1
+ *         int slot,           // x2
+ *         size_t xstart,      // x3
+ *         size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicBlend_K)
+        adrp    x5, blendtable
+        add     x5, x5, :lo12:blendtable
+        cmp     w2, tablesize
+        bhs     1f
+        ldrsh   x6, [x5, w2, uxtw #1]
+        add     x0, x0, w3, uxtw #2
+        add     x1, x1, w3, uxtw #2
+        sub     w2, w4, w3
+        ubfiz   x2, x2, #2, #32 /* TODO: fix */
+        cbz     x6, 1f
+        adr     x5, 2f
+        add     x6, x5, x6
+2:      br      x6
+1:      mov     x0, #-1
+        ret
+
+END(rsdIntrinsicBlend_K)
+
+.rodata
+.set off,0
+blendtable:
+#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
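In plain terms, rsdIntrinsicBlend_K bounds-checks the slot, fetches a signed halfword offset from blendtable, rebases out and in by xstart pixels (4 bytes each), converts the xstart/xend pixel span into a byte count, and branches to the selected blend_line_* routine; an out-of-range slot or a zero table entry returns -1, and the wrapped lines return 0 on success. A rough C model of that behaviour, with all names invented here for illustration:

    #include <stddef.h>
    #include <stdint.h>

    typedef int (*blend_line_fn)(uint8_t *dst, const uint8_t *src, size_t bytes);

    /* Stands in for blendtable; the real table stores signed halfword
     * offsets to blend_line_<NAME>, with zero meaning "no handler". */
    static blend_line_fn blendtable_model[15];

    int rsdIntrinsicBlend_K_model(uint8_t *out, const uint8_t *in, int slot,
                                  size_t xstart, size_t xend)
    {
        if ((unsigned)slot >= 15 || blendtable_model[slot] == NULL)
            return -1;                      /* the cmp/bhs and cbz paths */
        return blendtable_model[slot](out + 4 * xstart, in + 4 * xstart,
                                      4 * (xend - xstart));
    }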