diff options
Diffstat (limited to 'android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S')
-rw-r--r-- | android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S | 1095 |
1 files changed, 1095 insertions, 0 deletions
diff --git a/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S b/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S new file mode 100644 index 0000000000..ecb1151893 --- /dev/null +++ b/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S @@ -0,0 +1,1095 @@ +.text + +.align 8 // strategic alignment and padding that allows to use + // address value as loop termination condition... +.quad 0,0,0,0,0,0,0,0 + +iotas: +.quad 0x0000000000000001 +.quad 0x0000000000008082 +.quad 0x800000000000808a +.quad 0x8000000080008000 +.quad 0x000000000000808b +.quad 0x0000000080000001 +.quad 0x8000000080008081 +.quad 0x8000000000008009 +.quad 0x000000000000008a +.quad 0x0000000000000088 +.quad 0x0000000080008009 +.quad 0x000000008000000a +.quad 0x000000008000808b +.quad 0x800000000000008b +.quad 0x8000000000008089 +.quad 0x8000000000008003 +.quad 0x8000000000008002 +.quad 0x8000000000000080 +.quad 0x000000000000800a +.quad 0x800000008000000a +.quad 0x8000000080008081 +.quad 0x8000000000008080 +.quad 0x0000000080000001 +.quad 0x8000000080008008 + + +.align 5 +KeccakF1600_int: + adr x28,iotas +.long 0xd503233f // paciasp + stp x28,x30,[sp,#16] // 32 bytes on top are mine + b Loop +.align 4 +Loop: + ////////////////////////////////////////// Theta + eor x26,x0,x5 + stp x4,x9,[sp,#0] // offload pair... + eor x27,x1,x6 + eor x28,x2,x7 + eor x30,x3,x8 + eor x4,x4,x9 + eor x26,x26,x10 + eor x27,x27,x11 + eor x28,x28,x12 + eor x30,x30,x13 + eor x4,x4,x14 + eor x26,x26,x15 + eor x27,x27,x16 + eor x28,x28,x17 + eor x30,x30,x25 + eor x4,x4,x19 + eor x26,x26,x20 + eor x28,x28,x22 + eor x27,x27,x21 + eor x30,x30,x23 + eor x4,x4,x24 + + eor x9,x26,x28,ror#63 + + eor x1,x1,x9 + eor x6,x6,x9 + eor x11,x11,x9 + eor x16,x16,x9 + eor x21,x21,x9 + + eor x9,x27,x30,ror#63 + eor x28,x28,x4,ror#63 + eor x30,x30,x26,ror#63 + eor x4,x4,x27,ror#63 + + eor x27, x2,x9 // mov x27,x2 + eor x7,x7,x9 + eor x12,x12,x9 + eor x17,x17,x9 + eor x22,x22,x9 + + eor x0,x0,x4 + eor x5,x5,x4 + eor x10,x10,x4 + eor x15,x15,x4 + eor x20,x20,x4 + ldp x4,x9,[sp,#0] // re-load offloaded data + eor x26, x3,x28 // mov x26,x3 + eor x8,x8,x28 + eor x13,x13,x28 + eor x25,x25,x28 + eor x23,x23,x28 + + eor x28, x4,x30 // mov x28,x4 + eor x9,x9,x30 + eor x14,x14,x30 + eor x19,x19,x30 + eor x24,x24,x30 + + ////////////////////////////////////////// Rho+Pi + mov x30,x1 + ror x1,x6,#64-44 + //mov x27,x2 + ror x2,x12,#64-43 + //mov x26,x3 + ror x3,x25,#64-21 + //mov x28,x4 + ror x4,x24,#64-14 + + ror x6,x9,#64-20 + ror x12,x13,#64-25 + ror x25,x17,#64-15 + ror x24,x21,#64-2 + + ror x9,x22,#64-61 + ror x13,x19,#64-8 + ror x17,x11,#64-10 + ror x21,x8,#64-55 + + ror x22,x14,#64-39 + ror x19,x23,#64-56 + ror x11,x7,#64-6 + ror x8,x16,#64-45 + + ror x14,x20,#64-18 + ror x23,x15,#64-41 + ror x7,x10,#64-3 + ror x16,x5,#64-36 + + ror x5,x26,#64-28 + ror x10,x30,#64-1 + ror x15,x28,#64-27 + ror x20,x27,#64-62 + + ////////////////////////////////////////// Chi+Iota + bic x26,x2,x1 + bic x27,x3,x2 + bic x28,x0,x4 + bic x30,x1,x0 + eor x0,x0,x26 + bic x26,x4,x3 + eor x1,x1,x27 + ldr x27,[sp,#16] + eor x3,x3,x28 + eor x4,x4,x30 + eor x2,x2,x26 + ldr x30,[x27],#8 // Iota[i++] + + bic x26,x7,x6 + tst x27,#255 // are we done? + str x27,[sp,#16] + bic x27,x8,x7 + bic x28,x5,x9 + eor x0,x0,x30 // A[0][0] ^= Iota + bic x30,x6,x5 + eor x5,x5,x26 + bic x26,x9,x8 + eor x6,x6,x27 + eor x8,x8,x28 + eor x9,x9,x30 + eor x7,x7,x26 + + bic x26,x12,x11 + bic x27,x13,x12 + bic x28,x10,x14 + bic x30,x11,x10 + eor x10,x10,x26 + bic x26,x14,x13 + eor x11,x11,x27 + eor x13,x13,x28 + eor x14,x14,x30 + eor x12,x12,x26 + + bic x26,x17,x16 + bic x27,x25,x17 + bic x28,x15,x19 + bic x30,x16,x15 + eor x15,x15,x26 + bic x26,x19,x25 + eor x16,x16,x27 + eor x25,x25,x28 + eor x19,x19,x30 + eor x17,x17,x26 + + bic x26,x22,x21 + bic x27,x23,x22 + bic x28,x20,x24 + bic x30,x21,x20 + eor x20,x20,x26 + bic x26,x24,x23 + eor x21,x21,x27 + eor x23,x23,x28 + eor x24,x24,x30 + eor x22,x22,x26 + + bne Loop + + ldr x30,[sp,#24] +.long 0xd50323bf // autiasp + ret + + + +.align 5 +KeccakF1600: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#48 + + str x0,[sp,#32] // offload argument + mov x26,x0 + ldp x0,x1,[x0,#16*0] + ldp x2,x3,[x26,#16*1] + ldp x4,x5,[x26,#16*2] + ldp x6,x7,[x26,#16*3] + ldp x8,x9,[x26,#16*4] + ldp x10,x11,[x26,#16*5] + ldp x12,x13,[x26,#16*6] + ldp x14,x15,[x26,#16*7] + ldp x16,x17,[x26,#16*8] + ldp x25,x19,[x26,#16*9] + ldp x20,x21,[x26,#16*10] + ldp x22,x23,[x26,#16*11] + ldr x24,[x26,#16*12] + + bl KeccakF1600_int + + ldr x26,[sp,#32] + stp x0,x1,[x26,#16*0] + stp x2,x3,[x26,#16*1] + stp x4,x5,[x26,#16*2] + stp x6,x7,[x26,#16*3] + stp x8,x9,[x26,#16*4] + stp x10,x11,[x26,#16*5] + stp x12,x13,[x26,#16*6] + stp x14,x15,[x26,#16*7] + stp x16,x17,[x26,#16*8] + stp x25,x19,[x26,#16*9] + stp x20,x21,[x26,#16*10] + stp x22,x23,[x26,#16*11] + str x24,[x26,#16*12] + + ldp x19,x20,[x29,#16] + add sp,sp,#48 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 +.long 0xd50323bf // autiasp + ret + + +.globl _SHA3_absorb + +.align 5 +_SHA3_absorb: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + stp x0,x1,[sp,#32] // offload arguments + stp x2,x3,[sp,#48] + + mov x26,x0 // uint64_t A[5][5] + mov x27,x1 // const void *inp + mov x28,x2 // size_t len + mov x30,x3 // size_t bsz + ldp x0,x1,[x26,#16*0] + ldp x2,x3,[x26,#16*1] + ldp x4,x5,[x26,#16*2] + ldp x6,x7,[x26,#16*3] + ldp x8,x9,[x26,#16*4] + ldp x10,x11,[x26,#16*5] + ldp x12,x13,[x26,#16*6] + ldp x14,x15,[x26,#16*7] + ldp x16,x17,[x26,#16*8] + ldp x25,x19,[x26,#16*9] + ldp x20,x21,[x26,#16*10] + ldp x22,x23,[x26,#16*11] + ldr x24,[x26,#16*12] + b Loop_absorb + +.align 4 +Loop_absorb: + subs x26,x28,x30 // len - bsz + blo Labsorbed + + str x26,[sp,#48] // save len - bsz + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x0,x0,x26 + cmp x30,#8*(0+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x1,x1,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x2,x2,x26 + cmp x30,#8*(2+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x3,x3,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x4,x4,x26 + cmp x30,#8*(4+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x5,x5,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x6,x6,x26 + cmp x30,#8*(6+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x7,x7,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x8,x8,x26 + cmp x30,#8*(8+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x9,x9,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x10,x10,x26 + cmp x30,#8*(10+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x11,x11,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x12,x12,x26 + cmp x30,#8*(12+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x13,x13,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x14,x14,x26 + cmp x30,#8*(14+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x15,x15,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x16,x16,x26 + cmp x30,#8*(16+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x17,x17,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x25,x25,x26 + cmp x30,#8*(18+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x19,x19,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x20,x20,x26 + cmp x30,#8*(20+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x21,x21,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x22,x22,x26 + cmp x30,#8*(22+2) + blo Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x23,x23,x26 + beq Lprocess_block + ldr x26,[x27],#8 // *inp++ +#ifdef __AARCH64EB__ + rev x26,x26 +#endif + eor x24,x24,x26 + +Lprocess_block: + str x27,[sp,#40] // save inp + + bl KeccakF1600_int + + ldr x27,[sp,#40] // restore arguments + ldp x28,x30,[sp,#48] + b Loop_absorb + +.align 4 +Labsorbed: + ldr x27,[sp,#32] + stp x0,x1,[x27,#16*0] + stp x2,x3,[x27,#16*1] + stp x4,x5,[x27,#16*2] + stp x6,x7,[x27,#16*3] + stp x8,x9,[x27,#16*4] + stp x10,x11,[x27,#16*5] + stp x12,x13,[x27,#16*6] + stp x14,x15,[x27,#16*7] + stp x16,x17,[x27,#16*8] + stp x25,x19,[x27,#16*9] + stp x20,x21,[x27,#16*10] + stp x22,x23,[x27,#16*11] + str x24,[x27,#16*12] + + mov x0,x28 // return value + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 +.long 0xd50323bf // autiasp + ret + +.globl _SHA3_squeeze + +.align 5 +_SHA3_squeeze: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + mov x19,x0 // put aside arguments + mov x20,x1 + mov x21,x2 + mov x22,x3 + +Loop_squeeze: + ldr x4,[x0],#8 + cmp x21,#8 + blo Lsqueeze_tail +#ifdef __AARCH64EB__ + rev x4,x4 +#endif + str x4,[x20],#8 + subs x21,x21,#8 + beq Lsqueeze_done + + subs x3,x3,#8 + bhi Loop_squeeze + + mov x0,x19 + bl KeccakF1600 + mov x0,x19 + mov x3,x22 + b Loop_squeeze + +.align 4 +Lsqueeze_tail: + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + lsr x4,x4,#8 + subs x21,x21,#1 + beq Lsqueeze_done + strb w4,[x20],#1 + +Lsqueeze_done: + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x29,x30,[sp],#48 +.long 0xd50323bf // autiasp + ret + + +.align 5 +KeccakF1600_ce: + mov x9,#12 + adr x10,iotas + b Loop_ce +.align 4 +Loop_ce: + ////////////////////////////////////////////////// Theta +.long 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b +.long 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b +.long 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b +.long 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b +.long 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b +.long 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b +.long 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b +.long 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b +.long 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b +.long 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b + +.long 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1] +.long 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2] +.long 0xce7d8f7b //rax1 v27.16b,v27.16b,v29.16b // D[3] +.long 0xce798f9c //rax1 v28.16b,v28.16b,v25.16b // D[4] +.long 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi +.long 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1] +.long 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20 +.long 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61 +.long 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39 +.long 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18 + +.long 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62 + +.long 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43 +.long 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25 +.long 0xce9ce26d //xar v13.16b,v19.16b,v28.16b,#64-8 +.long 0xce9b22f3 //xar v19.16b,v23.16b,v27.16b,#64-56 +.long 0xce9d5df7 //xar v23.16b,v15.16b,v29.16b,#64-41 + +.long 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27 + + eor v0.16b,v0.16b,v29.16b + ldr x11,[x10],#8 + +.long 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3] +.long 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15 +.long 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10 +.long 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6 +.long 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3 + +.long 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // * + +.long 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14 +.long 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2 +.long 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55 +.long 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45 +.long 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36 + +.long 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0] + + ////////////////////////////////////////////////// Chi+Iota + dup v31.2d,x11 // borrow C[6] +.long 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // * +.long 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // * +.long 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b +.long 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b +.long 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b + +.long 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // * +.long 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // * +.long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b +.long 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b +.long 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b + + eor v0.16b,v28.16b,v31.16b // Iota + +.long 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // * +.long 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // * +.long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b +.long 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b +.long 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b + +.long 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // * +.long 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // * +.long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b +.long 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b +.long 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b + +.long 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // * +.long 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // * +.long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b +.long 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b +.long 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b + ////////////////////////////////////////////////// Theta +.long 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b +.long 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b +.long 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b +.long 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b +.long 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b +.long 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b +.long 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b +.long 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b +.long 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b +.long 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b + +.long 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1] +.long 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2] +.long 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3] +.long 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4] +.long 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi +.long 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1] +.long 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20 +.long 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61 +.long 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39 +.long 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18 + +.long 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62 + +.long 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43 +.long 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25 +.long 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8 +.long 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56 +.long 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41 + +.long 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27 + + eor v0.16b,v0.16b,v16.16b + ldr x11,[x10],#8 + +.long 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3] +.long 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15 +.long 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10 +.long 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6 +.long 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3 + +.long 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // * + +.long 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14 +.long 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2 +.long 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55 +.long 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45 +.long 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36 + +.long 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0] + + ////////////////////////////////////////////////// Chi+Iota + dup v21.2d,x11 // borrow C[6] +.long 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // * +.long 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // * +.long 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b +.long 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b +.long 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b + +.long 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // * +.long 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // * +.long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b +.long 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b +.long 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b + + eor v0.16b,v15.16b,v21.16b // Iota + +.long 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // * +.long 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // * +.long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b +.long 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b +.long 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b + +.long 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // * +.long 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // * +.long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b +.long 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b +.long 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b + +.long 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // * +.long 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // * +.long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b +.long 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b +.long 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b + subs x9,x9,#1 + bne Loop_ce + + ret + + + +.align 5 +KeccakF1600_cext: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + bl KeccakF1600_ce + ldr x30,[sp,#8] + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 +.long 0xd50323bf // autiasp + ret + +.globl _SHA3_absorb_cext + +.align 5 +_SHA3_absorb_cext: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + ldp d0,d1,[x0,#8*0] + ldp d2,d3,[x0,#8*2] + ldp d4,d5,[x0,#8*4] + ldp d6,d7,[x0,#8*6] + ldp d8,d9,[x0,#8*8] + ldp d10,d11,[x0,#8*10] + ldp d12,d13,[x0,#8*12] + ldp d14,d15,[x0,#8*14] + ldp d16,d17,[x0,#8*16] + ldp d18,d19,[x0,#8*18] + ldp d20,d21,[x0,#8*20] + ldp d22,d23,[x0,#8*22] + ldr d24,[x0,#8*24] + b Loop_absorb_ce + +.align 4 +Loop_absorb_ce: + subs x2,x2,x3 // len - bsz + blo Labsorbed_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v0.16b,v0.16b,v31.16b + cmp x3,#8*(0+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v1.16b,v1.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v2.16b,v2.16b,v31.16b + cmp x3,#8*(2+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v3.16b,v3.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v4.16b,v4.16b,v31.16b + cmp x3,#8*(4+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v5.16b,v5.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v6.16b,v6.16b,v31.16b + cmp x3,#8*(6+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v7.16b,v7.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v8.16b,v8.16b,v31.16b + cmp x3,#8*(8+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v9.16b,v9.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v10.16b,v10.16b,v31.16b + cmp x3,#8*(10+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v11.16b,v11.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v12.16b,v12.16b,v31.16b + cmp x3,#8*(12+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v13.16b,v13.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v14.16b,v14.16b,v31.16b + cmp x3,#8*(14+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v15.16b,v15.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v16.16b,v16.16b,v31.16b + cmp x3,#8*(16+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v17.16b,v17.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v18.16b,v18.16b,v31.16b + cmp x3,#8*(18+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v19.16b,v19.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v20.16b,v20.16b,v31.16b + cmp x3,#8*(20+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v21.16b,v21.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v22.16b,v22.16b,v31.16b + cmp x3,#8*(22+2) + blo Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v23.16b,v23.16b,v31.16b + beq Lprocess_block_ce + ldr d31,[x1],#8 // *inp++ +#ifdef __AARCH64EB__ + rev64 v31.16b,v31.16b +#endif + eor v24.16b,v24.16b,v31.16b + +Lprocess_block_ce: + + bl KeccakF1600_ce + + b Loop_absorb_ce + +.align 4 +Labsorbed_ce: + stp d0,d1,[x0,#8*0] + stp d2,d3,[x0,#8*2] + stp d4,d5,[x0,#8*4] + stp d6,d7,[x0,#8*6] + stp d8,d9,[x0,#8*8] + stp d10,d11,[x0,#8*10] + stp d12,d13,[x0,#8*12] + stp d14,d15,[x0,#8*14] + stp d16,d17,[x0,#8*16] + stp d18,d19,[x0,#8*18] + stp d20,d21,[x0,#8*20] + stp d22,d23,[x0,#8*22] + str d24,[x0,#8*24] + add x0,x2,x3 // return value + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldp x29,x30,[sp],#80 +.long 0xd50323bf // autiasp + ret + +.globl _SHA3_squeeze_cext + +.align 5 +_SHA3_squeeze_cext: +.long 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x9,x0 + mov x10,x3 + +Loop_squeeze_ce: + ldr x4,[x9],#8 + cmp x2,#8 + blo Lsqueeze_tail_ce +#ifdef __AARCH64EB__ + rev x4,x4 +#endif + str x4,[x1],#8 + beq Lsqueeze_done_ce + + sub x2,x2,#8 + subs x10,x10,#8 + bhi Loop_squeeze_ce + + bl KeccakF1600_cext + ldr x30,[sp,#8] + mov x9,x0 + mov x10,x3 + b Loop_squeeze_ce + +.align 4 +Lsqueeze_tail_ce: + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + lsr x4,x4,#8 + subs x2,x2,#1 + beq Lsqueeze_done_ce + strb w4,[x1],#1 + +Lsqueeze_done_ce: + ldr x29,[sp],#16 +.long 0xd50323bf // autiasp + ret + +.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 |