about | summary | refs | log | tree | commit | diff
path: root/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S
diff options
context:
space:
mode:
Diffstat (limited to 'android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S')
-rw-r--r-- android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S 1095
1 files changed, 1095 insertions, 0 deletions
diff --git a/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S b/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S
new file mode 100644
index 0000000000..ecb1151893
--- /dev/null
+++ b/android/gen-darwin_arm64/crypto/sha/keccak1600-armv8.S
@@ -0,0 +1,1095 @@
+.text
+
+// Round-constant table for the Keccak permutation code in this file.
+// Layout is load-bearing: the table start is aligned per ".align 8", then
+// 8 zero quads (64 bytes) of padding are emitted, then the 24 constants
+// (192 bytes). 64 + 192 = 256, so the address just past the last constant
+// has its low 8 bits clear. KeccakF1600_int relies on that: it walks the
+// table with a post-incremented pointer and stops when "tst ptr,#255"
+// reports zero. Do not add, remove or reorder entries without revisiting
+// that test.
+.align 8 // strategic alignment and padding that allows to use
+ // address value as loop termination condition...
+.quad 0,0,0,0,0,0,0,0
+
+// 24 constants; each round XORs one of them into the first state lane.
+iotas:
+.quad 0x0000000000000001
+.quad 0x0000000000008082
+.quad 0x800000000000808a
+.quad 0x8000000080008000
+.quad 0x000000000000808b
+.quad 0x0000000080000001
+.quad 0x8000000080008081
+.quad 0x8000000000008009
+.quad 0x000000000000008a
+.quad 0x0000000000000088
+.quad 0x0000000080008009
+.quad 0x000000008000000a
+.quad 0x000000008000808b
+.quad 0x800000000000008b
+.quad 0x8000000000008089
+.quad 0x8000000000008003
+.quad 0x8000000000008002
+.quad 0x8000000000000080
+.quad 0x000000000000800a
+.quad 0x800000008000000a
+.quad 0x8000000080008081
+.quad 0x8000000000008080
+.quad 0x0000000080000001
+.quad 0x8000000080008008
+
+
+// KeccakF1600_int: run the full permutation on a state held in registers.
+//
+// State layout (25 64-bit lanes, row-major, x18 is deliberately skipped):
+//   row 0: x0  x1  x2  x3  x4
+//   row 1: x5  x6  x7  x8  x9
+//   row 2: x10 x11 x12 x13 x14
+//   row 3: x15 x16 x17 x25 x19
+//   row 4: x20 x21 x22 x23 x24
+// In/Out:  the 25 lane registers above (updated in place).
+// Scratch: x26, x27, x28, x30, condition flags.
+// Stack:   the caller must have 32 writable bytes at [sp,#0..#31]:
+//            [sp,#0]  temporary home for x4/x9 during Theta
+//            [sp,#16] current pointer into the iotas table
+//            [sp,#24] saved (signed) return address
+// The loop runs once per round and ends when the iotas pointer, after its
+// post-increment, has its low 8 bits clear (see the padding before iotas).
+.align 5
+KeccakF1600_int:
+ adr x28,iotas
+.long 0xd503233f // paciasp
+ stp x28,x30,[sp,#16] // 32 bytes on top are mine
+ b Loop
+.align 4
+Loop:
+ ////////////////////////////////////////// Theta
+ // Column parities are accumulated in x26, x27, x28, x30 and x4.
+ // x4 and x9 are state lanes, so they are parked on the stack while
+ // they double as temporaries.
+ eor x26,x0,x5
+ stp x4,x9,[sp,#0] // offload pair...
+ eor x27,x1,x6
+ eor x28,x2,x7
+ eor x30,x3,x8
+ eor x4,x4,x9
+ eor x26,x26,x10
+ eor x27,x27,x11
+ eor x28,x28,x12
+ eor x30,x30,x13
+ eor x4,x4,x14
+ eor x26,x26,x15
+ eor x27,x27,x16
+ eor x28,x28,x17
+ eor x30,x30,x25
+ eor x4,x4,x19
+ eor x26,x26,x20
+ eor x28,x28,x22
+ eor x27,x27,x21
+ eor x30,x30,x23
+ eor x4,x4,x24
+
+ // "ror#63" is a rotate-left by one. Each combined value below is
+ // XORed into all five lanes of one column.
+ eor x9,x26,x28,ror#63
+
+ eor x1,x1,x9
+ eor x6,x6,x9
+ eor x11,x11,x9
+ eor x16,x16,x9
+ eor x21,x21,x9
+
+ eor x9,x27,x30,ror#63
+ eor x28,x28,x4,ror#63
+ eor x30,x30,x26,ror#63
+ eor x4,x4,x27,ror#63
+
+ // For the first-row lanes x2, x3 and x4 the result is written to a
+ // scratch register instead of in place; this folds in the copy that
+ // Rho+Pi would otherwise need (see the commented-out movs there).
+ eor x27, x2,x9 // mov x27,x2
+ eor x7,x7,x9
+ eor x12,x12,x9
+ eor x17,x17,x9
+ eor x22,x22,x9
+
+ eor x0,x0,x4
+ eor x5,x5,x4
+ eor x10,x10,x4
+ eor x15,x15,x4
+ eor x20,x20,x4
+ ldp x4,x9,[sp,#0] // re-load offloaded data
+ eor x26, x3,x28 // mov x26,x3
+ eor x8,x8,x28
+ eor x13,x13,x28
+ eor x25,x25,x28
+ eor x23,x23,x28
+
+ eor x28, x4,x30 // mov x28,x4
+ eor x9,x9,x30
+ eor x14,x14,x30
+ eor x19,x19,x30
+ eor x24,x24,x30
+
+ ////////////////////////////////////////// Rho+Pi
+ // Each ror both rotates a lane (left by the amount after "64-") and
+ // moves it to its new position. The order matters: every destination
+ // is a register whose old value was consumed by an earlier line or
+ // saved in x30/x26/x27/x28.
+ mov x30,x1
+ ror x1,x6,#64-44
+ //mov x27,x2
+ ror x2,x12,#64-43
+ //mov x26,x3
+ ror x3,x25,#64-21
+ //mov x28,x4
+ ror x4,x24,#64-14
+
+ ror x6,x9,#64-20
+ ror x12,x13,#64-25
+ ror x25,x17,#64-15
+ ror x24,x21,#64-2
+
+ ror x9,x22,#64-61
+ ror x13,x19,#64-8
+ ror x17,x11,#64-10
+ ror x21,x8,#64-55
+
+ ror x22,x14,#64-39
+ ror x19,x23,#64-56
+ ror x11,x7,#64-6
+ ror x8,x16,#64-45
+
+ ror x14,x20,#64-18
+ ror x23,x15,#64-41
+ ror x7,x10,#64-3
+ ror x16,x5,#64-36
+
+ ror x5,x26,#64-28
+ ror x10,x30,#64-1
+ ror x15,x28,#64-27
+ ror x20,x27,#64-62
+
+ ////////////////////////////////////////// Chi+Iota
+ // Per row: lane ^= (next-but-one lane AND NOT next lane), computed
+ // with bic into scratch first so that every term uses pre-Chi values.
+ bic x26,x2,x1
+ bic x27,x3,x2
+ bic x28,x0,x4
+ bic x30,x1,x0
+ eor x0,x0,x26
+ bic x26,x4,x3
+ eor x1,x1,x27
+ ldr x27,[sp,#16]
+ eor x3,x3,x28
+ eor x4,x4,x30
+ eor x2,x2,x26
+ ldr x30,[x27],#8 // Iota[i++]
+
+ bic x26,x7,x6
+ // The flags set by this tst are consumed by the "bne Loop" at the
+ // bottom; nothing in between (bic/eor/str) modifies them.
+ tst x27,#255 // are we done?
+ str x27,[sp,#16]
+ bic x27,x8,x7
+ bic x28,x5,x9
+ eor x0,x0,x30 // A[0][0] ^= Iota
+ bic x30,x6,x5
+ eor x5,x5,x26
+ bic x26,x9,x8
+ eor x6,x6,x27
+ eor x8,x8,x28
+ eor x9,x9,x30
+ eor x7,x7,x26
+
+ bic x26,x12,x11
+ bic x27,x13,x12
+ bic x28,x10,x14
+ bic x30,x11,x10
+ eor x10,x10,x26
+ bic x26,x14,x13
+ eor x11,x11,x27
+ eor x13,x13,x28
+ eor x14,x14,x30
+ eor x12,x12,x26
+
+ bic x26,x17,x16
+ bic x27,x25,x17
+ bic x28,x15,x19
+ bic x30,x16,x15
+ eor x15,x15,x26
+ bic x26,x19,x25
+ eor x16,x16,x27
+ eor x25,x25,x28
+ eor x19,x19,x30
+ eor x17,x17,x26
+
+ bic x26,x22,x21
+ bic x27,x23,x22
+ bic x28,x20,x24
+ bic x30,x21,x20
+ eor x20,x20,x26
+ bic x26,x24,x23
+ eor x21,x21,x27
+ eor x23,x23,x28
+ eor x24,x24,x30
+ eor x22,x22,x26
+
+ bne Loop
+
+ // x30 was used as scratch throughout; fetch the return address saved
+ // at entry before authenticating it and returning.
+ ldr x30,[sp,#24]
+.long 0xd50323bf // autiasp
+ ret
+
+
+
+// KeccakF1600: apply the permutation to a 25-lane state in memory.
+//   In:    x0 = pointer to the state (25 consecutive 64-bit lanes)
+//   Out:   state updated in place
+//   Saves: x19-x28, x29, x30 in a 128-byte frame; a further 48 bytes are
+//          reserved below it: [sp,#0..#31] is scratch that KeccakF1600_int
+//          claims for itself, [sp,#32] keeps the state pointer.
+.align 5
+KeccakF1600:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-128]!
+ mov x29,sp
+ stp x27,x28,[x29,#80]
+ stp x25,x26,[x29,#64]
+ stp x23,x24,[x29,#48]
+ stp x21,x22,[x29,#32]
+ stp x19,x20,[x29,#16]
+ sub sp,sp,#48
+
+ mov x26,x0 // x26 = state pointer while loading
+ str x26,[sp,#32] // keep it across the call
+ // Lanes 0..24 go to x0-x17, x25, x19-x24 (x18 is not used).
+ ldr x24,[x26,#192]
+ ldp x22,x23,[x26,#176]
+ ldp x20,x21,[x26,#160]
+ ldp x25,x19,[x26,#144]
+ ldp x16,x17,[x26,#128]
+ ldp x14,x15,[x26,#112]
+ ldp x12,x13,[x26,#96]
+ ldp x10,x11,[x26,#80]
+ ldp x8,x9,[x26,#64]
+ ldp x6,x7,[x26,#48]
+ ldp x4,x5,[x26,#32]
+ ldp x2,x3,[x26,#16]
+ ldp x0,x1,[x26]
+
+ bl KeccakF1600_int
+
+ ldr x26,[sp,#32] // x26 was scratch in the callee; reload pointer
+ str x24,[x26,#192]
+ stp x22,x23,[x26,#176]
+ stp x20,x21,[x26,#160]
+ stp x25,x19,[x26,#144]
+ stp x16,x17,[x26,#128]
+ stp x14,x15,[x26,#112]
+ stp x12,x13,[x26,#96]
+ stp x10,x11,[x26,#80]
+ stp x8,x9,[x26,#64]
+ stp x6,x7,[x26,#48]
+ stp x4,x5,[x26,#32]
+ stp x2,x3,[x26,#16]
+ stp x0,x1,[x26]
+
+ add sp,sp,#48
+ ldp x27,x28,[x29,#80]
+ ldp x25,x26,[x29,#64]
+ ldp x23,x24,[x29,#48]
+ ldp x21,x22,[x29,#32]
+ ldp x19,x20,[x29,#16]
+ ldp x29,x30,[sp],#128
+.long 0xd50323bf // autiasp
+ ret
+
+
+// _SHA3_absorb: XOR whole input blocks into the state and permute after
+// each one.
+//   In:  x0 = uint64_t A[5][5] (state), x1 = const void *inp,
+//        x2 = size_t len, x3 = size_t bsz (block size in bytes)
+//   Out: x0 = number of trailing input bytes that did not fill a block
+//   Frame: 128 bytes of saved x19-x30, plus 64 bytes of locals:
+//        [sp,#0..#31] scratch owned by KeccakF1600_int
+//        [sp,#32] A   [sp,#40] inp   [sp,#48] len   [sp,#56] bsz
+//   While running: state lanes live in x0-x17, x25, x19-x24;
+//        x26 = temp, x27 = inp, x28 = len, x30 = bsz.
+// NOTE(review): the unrolled loop consumes input 8 bytes at a time and
+// stops after 25 lanes, so it presumably expects bsz to be a non-zero
+// multiple of 8 no larger than 200 - confirm against callers.
+.globl _SHA3_absorb
+
+.align 5
+_SHA3_absorb:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ stp x0,x1,[sp,#32] // offload arguments
+ stp x2,x3,[sp,#48]
+
+ mov x26,x0 // uint64_t A[5][5]
+ mov x27,x1 // const void *inp
+ mov x28,x2 // size_t len
+ mov x30,x3 // size_t bsz
+ ldp x0,x1,[x26,#16*0]
+ ldp x2,x3,[x26,#16*1]
+ ldp x4,x5,[x26,#16*2]
+ ldp x6,x7,[x26,#16*3]
+ ldp x8,x9,[x26,#16*4]
+ ldp x10,x11,[x26,#16*5]
+ ldp x12,x13,[x26,#16*6]
+ ldp x14,x15,[x26,#16*7]
+ ldp x16,x17,[x26,#16*8]
+ ldp x25,x19,[x26,#16*9]
+ ldp x20,x21,[x26,#16*10]
+ ldp x22,x23,[x26,#16*11]
+ ldr x24,[x26,#16*12]
+ b Loop_absorb
+
+.align 4
+Loop_absorb:
+ subs x26,x28,x30 // len - bsz
+ blo Labsorbed
+
+ str x26,[sp,#48] // save len - bsz
+ // Unrolled lane loop, two lanes per step. Each "cmp x30,#8*(i+2)"
+ // serves two branches: the blo right after it (block ended after the
+ // even lane) and the beq after the next load/eor (block ended after
+ // the odd lane). ldr, rev and eor leave the flags untouched.
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x0,x0,x26
+ cmp x30,#8*(0+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x1,x1,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x2,x2,x26
+ cmp x30,#8*(2+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x3,x3,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x4,x4,x26
+ cmp x30,#8*(4+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x5,x5,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x6,x6,x26
+ cmp x30,#8*(6+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x7,x7,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x8,x8,x26
+ cmp x30,#8*(8+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x9,x9,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x10,x10,x26
+ cmp x30,#8*(10+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x11,x11,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x12,x12,x26
+ cmp x30,#8*(12+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x13,x13,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x14,x14,x26
+ cmp x30,#8*(14+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x15,x15,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x16,x16,x26
+ cmp x30,#8*(16+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x17,x17,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ // Lane 18 lives in x25 (x18 is not used by this file).
+ eor x25,x25,x26
+ cmp x30,#8*(18+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x19,x19,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x20,x20,x26
+ cmp x30,#8*(20+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x21,x21,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x22,x22,x26
+ cmp x30,#8*(22+2)
+ blo Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x23,x23,x26
+ beq Lprocess_block
+ ldr x26,[x27],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev x26,x26
+#endif
+ eor x24,x24,x26
+
+Lprocess_block:
+ // KeccakF1600_int uses x26-x28 and x30 as scratch, so inp is spilled
+ // here and inp/len/bsz are reloaded from the frame afterwards.
+ str x27,[sp,#40] // save inp
+
+ bl KeccakF1600_int
+
+ ldr x27,[sp,#40] // restore arguments
+ ldp x28,x30,[sp,#48]
+ b Loop_absorb
+
+.align 4
+Labsorbed:
+ // Fewer than bsz bytes remain: write the state back and return the
+ // remaining length (x28 still holds len from before the failed subs).
+ ldr x27,[sp,#32]
+ stp x0,x1,[x27,#16*0]
+ stp x2,x3,[x27,#16*1]
+ stp x4,x5,[x27,#16*2]
+ stp x6,x7,[x27,#16*3]
+ stp x8,x9,[x27,#16*4]
+ stp x10,x11,[x27,#16*5]
+ stp x12,x13,[x27,#16*6]
+ stp x14,x15,[x27,#16*7]
+ stp x16,x17,[x27,#16*8]
+ stp x25,x19,[x27,#16*9]
+ stp x20,x21,[x27,#16*10]
+ stp x22,x23,[x27,#16*11]
+ str x24,[x27,#16*12]
+
+ mov x0,x28 // return value
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+.long 0xd50323bf // autiasp
+ ret
+
+// _SHA3_squeeze: copy output bytes out of the state, re-running the
+// permutation each time a full block has been emitted.
+//   In:  x0 = state (25 x 64-bit lanes)      -> kept in x19
+//        x1 = output pointer                 -> kept in x20 (advances)
+//        x2 = number of bytes to produce     -> kept in x21 (counts down)
+//        x3 = block size in bytes            -> kept in x22
+//   While running: x0 walks the state lanes, x3 counts the bytes left in
+//        the current block, x4 holds the lane being emitted.
+//   Out: nothing in registers; x19-x22 are restored.
+// A request for zero bytes returns immediately without touching the
+// output buffer. Without that guard the code would enter the byte tail
+// with a count of 0, wrap it to -1 on the first decrement and store 7
+// bytes past the caller's buffer.
+.globl _SHA3_squeeze
+
+.align 5
+_SHA3_squeeze:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-48]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+
+ mov x19,x0 // put aside arguments
+ mov x20,x1
+ mov x21,x2
+ mov x22,x3
+ cbz x21,Lsqueeze_done // len == 0: nothing to write
+
+Loop_squeeze:
+ ldr x4,[x0],#8
+ cmp x21,#8
+ blo Lsqueeze_tail
+#ifdef __AARCH64EB__
+ rev x4,x4
+#endif
+ str x4,[x20],#8
+ subs x21,x21,#8
+ beq Lsqueeze_done
+
+ subs x3,x3,#8
+ bhi Loop_squeeze
+
+ // Current block exhausted: permute, then restart at the first lane
+ // with a full block budget.
+ mov x0,x19
+ bl KeccakF1600
+ mov x0,x19
+ mov x3,x22
+ b Loop_squeeze
+
+.align 4
+Lsqueeze_tail:
+ // 1..7 bytes left: emit the lane in x4 one byte at a time, lowest
+ // byte first, stopping as soon as the count reaches zero.
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+ lsr x4,x4,#8
+ subs x21,x21,#1
+ beq Lsqueeze_done
+ strb w4,[x20],#1
+
+Lsqueeze_done:
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x29,x30,[sp],#48
+.long 0xd50323bf // autiasp
+ ret
+
+
+// KeccakF1600_ce: permutation on a state held in vector registers, using
+// the eor3 / rax1 / xar / bcax instructions named in the comments below.
+// Those instructions are emitted as raw .long words (presumably so the
+// file assembles with toolchains that lack the mnemonics - confirm); the
+// trailing comment on each word is the intended disassembly.
+//   In/Out:  v0-v24 = the 25 state lanes. The callers in this file load
+//            them with 64-bit d-register loads and store them the same way.
+//   Scratch: v25-v31, x9 (iteration counter), x10 (pointer into iotas),
+//            x11 (current round constant), condition flags.
+//   Leaf routine: no stack use, returns through x30 unchanged.
+// The loop body contains two rounds and runs 12 times. The first round
+// leaves several lanes in v25-v31; the second round reads them from there
+// and writes its results back so that the state is in v0-v24 again at the
+// bottom of the loop.
+.align 5
+KeccakF1600_ce:
+ mov x9,#12
+ adr x10,iotas
+ b Loop_ce
+.align 4
+Loop_ce:
+ ////////////////////////////////////////////////// Theta
+.long 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b
+.long 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b
+.long 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b
+.long 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b
+.long 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b
+.long 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b
+.long 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b
+.long 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b
+.long 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b
+.long 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b
+
+.long 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1]
+.long 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2]
+.long 0xce7d8f7b //rax1 v27.16b,v27.16b,v29.16b // D[3]
+.long 0xce798f9c //rax1 v28.16b,v28.16b,v25.16b // D[4]
+.long 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0]
+
+ ////////////////////////////////////////////////// Theta+Rho+Pi
+.long 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
+.long 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20
+.long 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61
+.long 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39
+.long 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18
+
+.long 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62
+
+.long 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43
+.long 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25
+.long 0xce9ce26d //xar v13.16b,v19.16b,v28.16b,#64-8
+.long 0xce9b22f3 //xar v19.16b,v23.16b,v27.16b,#64-56
+.long 0xce9d5df7 //xar v23.16b,v15.16b,v29.16b,#64-41
+
+.long 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27
+
+ eor v0.16b,v0.16b,v29.16b
+ // Fetch this round's constant early; it is broadcast and applied in
+ // the Chi+Iota step below.
+ ldr x11,[x10],#8
+
+.long 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
+.long 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15
+.long 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10
+.long 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6
+.long 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3
+
+.long 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // *
+
+.long 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14
+.long 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2
+.long 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55
+.long 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45
+.long 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36
+
+.long 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]
+
+ ////////////////////////////////////////////////// Chi+Iota
+ dup v31.2d,x11 // borrow C[6]
+.long 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // *
+.long 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // *
+.long 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b
+.long 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b
+.long 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b
+
+.long 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // *
+.long 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // *
+.long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
+.long 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b
+.long 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b
+
+ eor v0.16b,v28.16b,v31.16b // Iota
+
+.long 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // *
+.long 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // *
+.long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
+.long 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b
+.long 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b
+
+.long 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // *
+.long 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // *
+.long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
+.long 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b
+.long 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b
+
+.long 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // *
+.long 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // *
+.long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
+.long 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b
+.long 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b
+ // Second round of this iteration. Its inputs are taken from wherever
+ // the round above left them, so the register numbers differ from the
+ // first round even though the steps are the same.
+ ////////////////////////////////////////////////// Theta
+.long 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b
+.long 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b
+.long 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b
+.long 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b
+.long 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b
+.long 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b
+.long 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b
+.long 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b
+.long 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b
+.long 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b
+
+.long 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1]
+.long 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2]
+.long 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3]
+.long 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4]
+.long 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0]
+
+ ////////////////////////////////////////////////// Theta+Rho+Pi
+.long 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
+.long 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20
+.long 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61
+.long 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39
+.long 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18
+
+.long 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62
+
+.long 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43
+.long 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25
+.long 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8
+.long 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56
+.long 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41
+
+.long 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27
+
+ eor v0.16b,v0.16b,v16.16b
+ ldr x11,[x10],#8
+
+.long 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
+.long 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15
+.long 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10
+.long 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6
+.long 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3
+
+.long 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // *
+
+.long 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14
+.long 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2
+.long 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55
+.long 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45
+.long 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36
+
+.long 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]
+
+ ////////////////////////////////////////////////// Chi+Iota
+ dup v21.2d,x11 // borrow C[6]
+.long 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // *
+.long 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // *
+.long 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b
+.long 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b
+.long 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b
+
+.long 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // *
+.long 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // *
+.long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
+.long 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b
+.long 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b
+
+ eor v0.16b,v15.16b,v21.16b // Iota
+
+.long 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // *
+.long 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // *
+.long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
+.long 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b
+.long 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b
+
+.long 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // *
+.long 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // *
+.long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
+.long 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b
+.long 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b
+
+.long 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // *
+.long 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // *
+.long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
+.long 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b
+.long 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b
+ // 12 passes x 2 rounds each.
+ subs x9,x9,#1
+ bne Loop_ce
+
+ ret
+
+
+
+// KeccakF1600_cext: apply the vector-register permutation to a 25-lane
+// state in memory.
+//   In:    x0 = pointer to the state (25 consecutive 64-bit lanes)
+//   Out:   state updated in place; x0 is left unchanged
+//   Saves: x29, x30 and d8-d15 in an 80-byte frame.
+.align 5
+KeccakF1600_cext:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ mov x29,sp
+ // d8-d15 must be saved before they are overwritten with state lanes.
+ stp d14,d15,[x29,#64] // per ABI requirement
+ stp d12,d13,[x29,#48]
+ stp d10,d11,[x29,#32]
+ stp d8,d9,[x29,#16]
+ // Lane i goes to d<i>.
+ ldr d24,[x0,#192]
+ ldp d22,d23,[x0,#176]
+ ldp d20,d21,[x0,#160]
+ ldp d18,d19,[x0,#144]
+ ldp d16,d17,[x0,#128]
+ ldp d14,d15,[x0,#112]
+ ldp d12,d13,[x0,#96]
+ ldp d10,d11,[x0,#80]
+ ldp d8,d9,[x0,#64]
+ ldp d6,d7,[x0,#48]
+ ldp d4,d5,[x0,#32]
+ ldp d2,d3,[x0,#16]
+ ldp d0,d1,[x0]
+ bl KeccakF1600_ce
+ str d24,[x0,#192]
+ stp d22,d23,[x0,#176]
+ stp d20,d21,[x0,#160]
+ stp d18,d19,[x0,#144]
+ stp d16,d17,[x0,#128]
+ stp d14,d15,[x0,#112]
+ stp d12,d13,[x0,#96]
+ stp d10,d11,[x0,#80]
+ stp d8,d9,[x0,#64]
+ stp d6,d7,[x0,#48]
+ stp d4,d5,[x0,#32]
+ stp d2,d3,[x0,#16]
+ stp d0,d1,[x0]
+
+ ldp d14,d15,[sp,#64]
+ ldp d12,d13,[sp,#48]
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ // The bl above replaced x30; restore it together with x29.
+ ldp x29,x30,[sp],#80
+.long 0xd50323bf // autiasp
+ ret
+
+// _SHA3_absorb_cext: vector-register counterpart of _SHA3_absorb. XORs
+// whole input blocks into the state and calls KeccakF1600_ce after each.
+//   In:  x0 = state (25 x 64-bit lanes), x1 = input pointer,
+//        x2 = input length in bytes, x3 = block size in bytes
+//   Out: x0 = number of trailing input bytes that did not fill a block
+//   While running: lanes live in v0-v24 (loaded through d0-d24), v31 is
+//        the incoming word. x0-x3 stay live across the calls because
+//        KeccakF1600_ce only uses x9-x11.
+//   Saves: x29, x30 and d8-d15 in an 80-byte frame.
+// NOTE(review): as in _SHA3_absorb, the unrolled loop presumably expects
+// the block size to be a non-zero multiple of 8 no larger than 200 -
+// confirm against callers.
+.globl _SHA3_absorb_cext
+
+.align 5
+_SHA3_absorb_cext:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+ ldp d0,d1,[x0,#8*0]
+ ldp d2,d3,[x0,#8*2]
+ ldp d4,d5,[x0,#8*4]
+ ldp d6,d7,[x0,#8*6]
+ ldp d8,d9,[x0,#8*8]
+ ldp d10,d11,[x0,#8*10]
+ ldp d12,d13,[x0,#8*12]
+ ldp d14,d15,[x0,#8*14]
+ ldp d16,d17,[x0,#8*16]
+ ldp d18,d19,[x0,#8*18]
+ ldp d20,d21,[x0,#8*20]
+ ldp d22,d23,[x0,#8*22]
+ ldr d24,[x0,#8*24]
+ b Loop_absorb_ce
+
+.align 4
+Loop_absorb_ce:
+ // x2 is decremented up front; when it underflows the subtraction is
+ // undone at Labsorbed_ce to form the return value.
+ subs x2,x2,x3 // len - bsz
+ blo Labsorbed_ce
+ // Unrolled lane loop, two lanes per step. Each "cmp x3,#8*(i+2)"
+ // serves the blo right after it and the beq after the next load/eor;
+ // ldr, rev64 and the vector eor leave the flags untouched.
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v0.16b,v0.16b,v31.16b
+ cmp x3,#8*(0+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v1.16b,v1.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v2.16b,v2.16b,v31.16b
+ cmp x3,#8*(2+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v3.16b,v3.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v4.16b,v4.16b,v31.16b
+ cmp x3,#8*(4+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v5.16b,v5.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v6.16b,v6.16b,v31.16b
+ cmp x3,#8*(6+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v7.16b,v7.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v8.16b,v8.16b,v31.16b
+ cmp x3,#8*(8+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v9.16b,v9.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v10.16b,v10.16b,v31.16b
+ cmp x3,#8*(10+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v11.16b,v11.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v12.16b,v12.16b,v31.16b
+ cmp x3,#8*(12+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v13.16b,v13.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v14.16b,v14.16b,v31.16b
+ cmp x3,#8*(14+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v15.16b,v15.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v16.16b,v16.16b,v31.16b
+ cmp x3,#8*(16+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v17.16b,v17.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v18.16b,v18.16b,v31.16b
+ cmp x3,#8*(18+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v19.16b,v19.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v20.16b,v20.16b,v31.16b
+ cmp x3,#8*(20+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v21.16b,v21.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v22.16b,v22.16b,v31.16b
+ cmp x3,#8*(22+2)
+ blo Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v23.16b,v23.16b,v31.16b
+ beq Lprocess_block_ce
+ ldr d31,[x1],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor v24.16b,v24.16b,v31.16b
+
+Lprocess_block_ce:
+
+ // Overwrites x30; the original return address is reloaded from the
+ // frame in the epilogue.
+ bl KeccakF1600_ce
+
+ b Loop_absorb_ce
+
+.align 4
+Labsorbed_ce:
+ stp d0,d1,[x0,#8*0]
+ stp d2,d3,[x0,#8*2]
+ stp d4,d5,[x0,#8*4]
+ stp d6,d7,[x0,#8*6]
+ stp d8,d9,[x0,#8*8]
+ stp d10,d11,[x0,#8*10]
+ stp d12,d13,[x0,#8*12]
+ stp d14,d15,[x0,#8*14]
+ stp d16,d17,[x0,#8*16]
+ stp d18,d19,[x0,#8*18]
+ stp d20,d21,[x0,#8*20]
+ stp d22,d23,[x0,#8*22]
+ str d24,[x0,#8*24]
+ // x2 holds (remaining - bsz) after the failed subs; add bsz back.
+ add x0,x2,x3 // return value
+
+ ldp d8,d9,[sp,#16]
+ ldp d10,d11,[sp,#32]
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldp x29,x30,[sp],#80
+.long 0xd50323bf // autiasp
+ ret
+
+// _SHA3_squeeze_cext: copy output bytes out of the state, calling
+// KeccakF1600_cext each time a full block has been emitted.
+//   In:  x0 = state (25 x 64-bit lanes), x1 = output pointer (advances),
+//        x2 = number of bytes to produce (counts down),
+//        x3 = block size in bytes
+//   While running: x9 walks the state lanes, x10 counts the bytes left
+//        in the current block, x4 holds the lane being emitted.
+//        x0-x3 stay live across the call to KeccakF1600_cext.
+//   Saves: x29, x30 in a 16-byte frame.
+// A request for zero bytes returns immediately without touching the
+// output buffer. Without that guard the code would enter the byte tail
+// with a count of 0, wrap it to -1 on the first decrement and store 7
+// bytes past the caller's buffer.
+.globl _SHA3_squeeze_cext
+
+.align 5
+_SHA3_squeeze_cext:
+.long 0xd503233f // paciasp
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ cbz x2,Lsqueeze_done_ce // len == 0: nothing to write
+ mov x9,x0
+ mov x10,x3
+
+Loop_squeeze_ce:
+ ldr x4,[x9],#8
+ cmp x2,#8
+ blo Lsqueeze_tail_ce
+#ifdef __AARCH64EB__
+ rev x4,x4
+#endif
+ str x4,[x1],#8
+ // Still using the flags from "cmp x2,#8": exactly 8 bytes were left.
+ beq Lsqueeze_done_ce
+
+ sub x2,x2,#8
+ subs x10,x10,#8
+ bhi Loop_squeeze_ce
+
+ // Current block exhausted: permute, then restart at the first lane
+ // with a full block budget. The bl replaces x30, so it is reloaded
+ // from the frame right away.
+ bl KeccakF1600_cext
+ ldr x30,[sp,#8]
+ mov x9,x0
+ mov x10,x3
+ b Loop_squeeze_ce
+
+.align 4
+Lsqueeze_tail_ce:
+ // 1..7 bytes left: emit the lane in x4 one byte at a time, lowest
+ // byte first, stopping as soon as the count reaches zero.
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+ lsr x4,x4,#8
+ subs x2,x2,#1
+ beq Lsqueeze_done_ce
+ strb w4,[x1],#1
+
+Lsqueeze_done_ce:
+ // x30 already holds the original return address here, so only x29
+ // needs to be popped.
+ ldr x29,[sp],#16
+.long 0xd50323bf // autiasp
+ ret
+
+// NUL-terminated ASCII identification string embedded in the object:
+// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2