From cf708447e7959f18580e4fefaba57cea27a04bdc Mon Sep 17 00:00:00 2001
From: Bernhard Rosenkraenzer
Date: Wed, 26 Sep 2012 01:38:22 +0159
Subject: Sync with svn rev. 191738

Signed-off-by: Bernhard Rosenkraenzer
---
 gcc/ChangeLog.aarch64                 |  120 ++
 gcc/config/aarch64/aarch64-builtins.c |   85 +-
 gcc/config/aarch64/aarch64-modes.def  |   10 +
 gcc/config/aarch64/aarch64-protos.h   |    9 +-
 gcc/config/aarch64/aarch64-simd.md    |  530 ++++++++-
 gcc/config/aarch64/aarch64.c          |  183 ++-
 gcc/config/aarch64/aarch64.h          |    7 +-
 gcc/config/aarch64/aarch64.md         |   18 +-
 gcc/config/aarch64/arm_neon.h         | 2072 ++++++++++++++++++++++++++++-----
 gcc/config/aarch64/constraints.md     |   18 +
 gcc/config/aarch64/iterators.md       |   37 +
 gcc/config/aarch64/predicates.md      |   36 +
 gcc/testsuite/ChangeLog.aarch64       |    6 +
 gcc/testsuite/lib/target-supports.exp |    3 +-
 14 files changed, 2764 insertions(+), 370 deletions(-)

(limited to 'gcc')

diff --git a/gcc/ChangeLog.aarch64 b/gcc/ChangeLog.aarch64
index 6baf1523b..90fba49a2 100644
--- a/gcc/ChangeLog.aarch64
+++ b/gcc/ChangeLog.aarch64
@@ -1,3 +1,123 @@
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64.c (aarch64_shift_truncation_mask): Define.
+	(TARGET_SHIFT_TRUNCATION_MASK): Define.
+	* config/aarch64/aarch64.h (SHIFT_COUNT_TRUNCATED): Conditionalize on
+	TARGET_SIMD.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/arm_neon.h (vrshrn_high_n_s16, vrshrn_high_n_s32)
+	(vrshrn_high_n_s64, vrshrn_high_n_u16, vrshrn_high_n_u32)
+	(vrshrn_high_n_u64, vshrn_high_n_s16, vshrn_high_n_s32)
+	(vshrn_high_n_s64, vshrn_high_n_u16, vshrn_high_n_u32)
+	(vshrn_high_n_u64): Fix template to reference correct operands.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/arm_neon.h (vmovq_n_f64): Add.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/arm_neon.h (vfmaq_lane_f64): Fix prototype and
+	assembler template accordingly.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64-protos.h (aarch64_simd_imm_scalar_p): Declare.
+	* config/aarch64/aarch64.c (aarch64_simd_imm_scalar_p): New.
+	* config/aarch64/aarch64.md (*movdi_aarch64): Add alternative for
+	moving a valid scalar immediate into an Advanced SIMD D-register.
+	* config/aarch64/constraints.md (Dd): New.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64-simd.md (aarch64_cm): Tighten
+	predicate for operand 2 of the compare pattern to accept register
+	or zero.
+	* config/aarch64/predicates.md (aarch64_simd_reg_or_zero): New.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64-simd.md (*aarch64_simd_mov): Split Q-reg
+	vector value move contained in general registers.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64.c (aarch64_simd_expand_builtin): Expand binary
+	operations' constant operand only if the predicate allows it.
+
+2012-09-25  Tejas Belagod
+
+	* config/aarch64/aarch64-builtins.c (aarch64_simd_builtin_data):
+	Populate intrinsic table with struct load and store descriptors.
+	(init_aarch64_simd_builtins): Remove cruft.
+	(aarch64_simd_expand_builtin): Expand the builtins.
+	* config/aarch64/aarch64-modes.def: Define new vector modes for
+	register lists.
+	* config/aarch64/aarch64-protos.h (aarch64_simd_attr_length_move): New.
+	(aarch64_simd_mem_operand_p): New.
+	(aarch64_simd_imm_zero_p): New.
+	(aarch64_output_move_struct): New.
+	(aarch64_simd_disambiguate_copy): New.
+	* config/aarch64/aarch64-simd.md (simd_mode): Add OI, CI and XI to the
+	list.
+	(mov): Tighten predicates for simd operand.
+	(movmisalign): Likewise.
+	(*aarch64_simd_mov): Tighten predicates and constraints for simd
+	operands.
+	(*aarch64_combinez): New.
+	(vec_load_lanesoi, vec_store_lanesoi)
+	(vec_load_lanesci, vec_store_lanesci)
+	(vec_load_lanesxi)
+	(vec_store_lanesxi, mov, *aarch64_mov)
+	(aarch64_ld2_dreg, aarch64_ld3_dreg)
+	(aarch64_ld4_dreg, aarch64_ld)
+	(aarch64_ld)
+	(aarch64_get_dreg)
+	(aarch64_get_qreg, aarch64_st2_dreg)
+	(aarch64_st3_dreg, aarch64_st4_dreg)
+	(aarch64_st)
+	(aarch64_st)
+	(aarch64_set_qreg): New expanders and patterns
+	for vector struct loads and stores.
+	* config/aarch64/aarch64.c (aarch64_vect_struct_mode_p): New.
+	(aarch64_vector_mode_p): New.
+	(aarch64_array_mode_supported_p): New.
+	(aarch64_hard_regno_mode_ok): Check that reglists don't go out of
+	range and don't allocate general regs to large int modes.
+	(aarch64_classify_address): Restrict addressing modes of large int
+	modes to same as SIMD addressing modes.
+	(aarch64_print_operand): Print specifiers for register lists.
+	(aarch64_legitimize_reload_address): Treat large int modes similar to
+	SIMD modes.
+	(aarch64_class_max_nregs): Return the correct max number of registers
+	for a particular mode.
+	(aarch64_legitimate_constant_p): Do not allow large int mode
+	immediate values.
+	(aarch64_simd_imm_zero_p): New.
+	(aarch64_simd_mem_operand_p): Check if mem operand has a valid SIMD
+	addressing mode.
+	(aarch64_simd_disambiguate_copy): Copy values that span multiple
+	registers with and without overlapping.
+	(aarch64_simd_attr_length_move): Length of instruction sequence
+	depending on the mode.
+	* config/aarch64/aarch64.h (AARCH64_VALID_SIMD_QREG_MODE): New.
+	* config/aarch64/aarch64.md (UNSPEC_VSTRUCTDUMMY, UNSPEC_LD2)
+	(UNSPEC_LD3, UNSPEC_LD4, UNSPEC_ST2, UNSPEC_ST3, UNSPEC_ST4): New.
+	* config/aarch64/arm_neon.h: Remove assembler implementation of vector
+	struct loads and stores and add new C implementations.
+	* config/aarch64/constraints.md (Utv): New memory constraint for SIMD
+	memory operands.
+	(Dz): New.
+	* config/aarch64/iterators.md (VDIC, VSTRUCT, DX): New mode iterators.
+	(Vendreg, nregs, VRL2, VRL3, VRL4, VSTRUCT_DREG): New mode attributes.
+	* config/aarch64/predicates.md (aarch64_simd_struct_operand): New.
+	(aarch64_simd_general_operand): New.
+	(aarch64_simd_nonimmediate_operand): New.
+	(aarch64_simd_reg_or_zero): New.
+	(aarch64_simd_imm_zero): New.
+
 2012-09-20  Ramana Radhakrishnan
 
 	* config/aarch64/aarch64.md: Make unspec and unspecv constants

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 95ab760bb..429a0dfdb 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -179,6 +179,13 @@ typedef struct {
     CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
     CF (N, G), CF (N, H), CF (N, I), CF (N, J), CF (N, K) }, 11, 0
 
+#define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  #N, AARCH64_SIMD_##T, UP (A) | UP (B) | UP (C) | UP (D) \
+			| UP (E) | UP (F) | UP (G) \
+			| UP (H) | UP (I) | UP (J) | UP (K) | UP (L), \
+  { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
+    CF (N, G), CF (N, H), CF (N, I), CF (N, J), CF (N, K), CF (N, L) }, 12, 0
+
 /* The mode entries in the following table correspond to the "key" type of the
    instruction variant, i.e.
equivalent to that which would be specified after @@ -197,6 +204,15 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { {VAR7 (GETLANE, get_lane_unsigned, v8qi, v4hi, v2si, v16qi, v8hi, v4si, v2di)}, {VAR4 (GETLANE, get_lane, v2sf, di, v4sf, v2df)}, + {VAR6 (GETLANE, get_dregoi, v8qi, v4hi, v2si, v2sf, di, df)}, + {VAR6 (GETLANE, get_qregoi, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR6 (GETLANE, get_dregci, v8qi, v4hi, v2si, v2sf, di, df)}, + {VAR6 (GETLANE, get_qregci, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR6 (GETLANE, get_dregxi, v8qi, v4hi, v2si, v2sf, di, df)}, + {VAR6 (GETLANE, get_qregxi, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR6 (SETLANE, set_qregoi, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR6 (SETLANE, set_qregci, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR6 (SETLANE, set_qregxi, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, {VAR5 (REINTERP, reinterpretv8qi, v8qi, v4hi, v2si, v2sf, di)}, {VAR5 (REINTERP, reinterpretv4hi, v8qi, v4hi, v2si, v2sf, di)}, @@ -341,6 +357,18 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { { VAR6 (BINOP, umax, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, { VAR6 (BINOP, umin, v8qi, v4hi, v2si, v16qi, v8hi, v4si) }, { VAR3 (UNOP, sqrt, v2sf, v4sf, v2df) }, + {VAR12 (LOADSTRUCT, ld2, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR12 (LOADSTRUCT, ld3, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR12 (LOADSTRUCT, ld4, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR12 (STORESTRUCT, st2, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR12 (STORESTRUCT, st3, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, + {VAR12 (STORESTRUCT, st4, + v8qi, v4hi, v2si, v2sf, di, df, v16qi, v8hi, v4si, v4sf, v2di, v2df)}, }; #undef CF @@ -669,8 +697,8 @@ init_aarch64_simd_builtins (void) char namebuf[60]; tree ftype = NULL; enum insn_code icode; - int is_load = 0, is_struct_load = 0; - int is_store = 0, is_struct_store = 0; + int is_load = 0; + int is_store = 0; /* Skip if particular mode not supported. */ if ((d->bits & (1 << j)) == 0) @@ -683,21 +711,15 @@ init_aarch64_simd_builtins (void) case AARCH64_SIMD_LOAD1: case AARCH64_SIMD_LOAD1LANE: case AARCH64_SIMD_LOADSTRUCTLANE: - is_load = 1; - /* Fall through. */ case AARCH64_SIMD_LOADSTRUCT: - if (!is_load) - is_struct_load = 1; + is_load = 1; /* Fall through. */ case AARCH64_SIMD_STORE1: case AARCH64_SIMD_STORE1LANE: case AARCH64_SIMD_STORESTRUCTLANE: - if (!is_load && !is_struct_load) - is_store = 1; - /* Fall through. */ case AARCH64_SIMD_STORESTRUCT: - if (!is_load && !is_struct_load && !is_store) - is_struct_store = 1; + if (!is_load) + is_store = 1; /* Fall through. */ case AARCH64_SIMD_UNOP: case AARCH64_SIMD_BINOP: @@ -742,7 +764,7 @@ init_aarch64_simd_builtins (void) if (k == 2 && d->itype == AARCH64_SIMD_SPLIT) continue; - if (is_struct_load || (is_load && k == 1)) + if (is_load && k == 1) { /* AdvSIMD load patterns always have the memory operand (a DImode pointer) in the operand 1 position. We @@ -778,6 +800,7 @@ init_aarch64_simd_builtins (void) eltype = const_intDI_pointer_node; break; + case T_DF: case T_V2DF: eltype = const_double_pointer_node; break; @@ -786,7 +809,7 @@ init_aarch64_simd_builtins (void) gcc_unreachable (); } } - else if (is_struct_store || (is_store && k == 0)) + else if (is_store && k == 0) { /* Similarly, AdvSIMD store patterns use operand 0 as the memory location to store to (a DImode pointer). 
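For context: the LOADSTRUCT/STORESTRUCT descriptors and the get_dreg/get_qreg
lane builtins registered above are the plumbing behind the arm_neon.h
vldN/vstN intrinsics that this patch reimplements in C further down.  A
minimal user-level sketch of what they enable (not part of the patch; the
intrinsics are standard arm_neon.h names, the deinterleave_stereo function is
a hypothetical example):

#include <arm_neon.h>

/* Split interleaved stereo samples (L R L R ...) into separate channel
   buffers.  vld2q_s16 maps onto the ld2 builtin registered above: it
   fills an OImode register pair with a single LD2 instruction and
   extracts the two int16x8_t halves via the get_qregoi builtins.  */
void
deinterleave_stereo (const int16_t *in, int16_t *left, int16_t *right, int n)
{
  int i;
  for (i = 0; i + 8 <= n; i += 8)
    {
      int16x8x2_t s = vld2q_s16 (in + 2 * i);
      vst1q_s16 (left + i, s.val[0]);
      vst1q_s16 (right + i, s.val[1]);
    }
}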
@@ -822,6 +845,7 @@ init_aarch64_simd_builtins (void) eltype = intDI_pointer_node; break; + case T_DF: case T_V2DF: eltype = double_pointer_node; break; @@ -908,8 +932,7 @@ init_aarch64_simd_builtins (void) } } - if (k == 0 && !is_store && !is_struct_load - && !is_struct_store) + if (k == 0 && !is_store) return_type = eltype; else args = tree_cons (NULL_TREE, eltype, args); @@ -1214,13 +1237,17 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) case AARCH64_SIMD_BINOP: { - bool op1_const_int_p - = CONST_INT_P (expand_normal (CALL_EXPR_ARG (exp, 1))); - return aarch64_simd_expand_args (target, icode, 1, exp, - SIMD_ARG_COPY_TO_REG, - op1_const_int_p ? SIMD_ARG_CONSTANT - : SIMD_ARG_COPY_TO_REG, - SIMD_ARG_STOP); + rtx arg2 = expand_normal (CALL_EXPR_ARG (exp, 1)); + /* Handle constants only if the predicate allows it. */ + bool op1_const_int_p = + (CONST_INT_P (arg2) + && (*insn_data[icode].operand[2].predicate) + (arg2, insn_data[icode].operand[2].mode)); + return aarch64_simd_expand_args + (target, icode, 1, exp, + SIMD_ARG_COPY_TO_REG, + op1_const_int_p ? SIMD_ARG_CONSTANT : SIMD_ARG_COPY_TO_REG, + SIMD_ARG_STOP); } case AARCH64_SIMD_TERNOP: @@ -1238,9 +1265,15 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP); case AARCH64_SIMD_LOAD1: + case AARCH64_SIMD_LOADSTRUCT: return aarch64_simd_expand_args (target, icode, 1, exp, SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP); + case AARCH64_SIMD_STORESTRUCT: + return aarch64_simd_expand_args (target, icode, 0, exp, + SIMD_ARG_COPY_TO_REG, + SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP); + case AARCH64_SIMD_REINTERP: return aarch64_simd_expand_args (target, icode, 1, exp, SIMD_ARG_COPY_TO_REG, SIMD_ARG_STOP); @@ -1259,6 +1292,14 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) SIMD_ARG_COPY_TO_REG, SIMD_ARG_CONSTANT, SIMD_ARG_STOP); + + case AARCH64_SIMD_SETLANE: + return aarch64_simd_expand_args (target, icode, 1, exp, + SIMD_ARG_COPY_TO_REG, + SIMD_ARG_COPY_TO_REG, + SIMD_ARG_CONSTANT, + SIMD_ARG_STOP); + case AARCH64_SIMD_SHIFTIMM: return aarch64_simd_expand_args (target, icode, 1, exp, SIMD_ARG_COPY_TO_REG, diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index 689fb79c8..ac05881f9 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -40,5 +40,15 @@ INT_MODE (EI, 24); INT_MODE (CI, 48); INT_MODE (XI, 64); +/* Vector modes for register lists. */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI. */ +VECTOR_MODES (FLOAT, 32); /* V8SF V4DF. */ + +VECTOR_MODES (INT, 48); /* V32QI V16HI V8SI V4DI. */ +VECTOR_MODES (FLOAT, 48); /* V8SF V4DF. */ + +VECTOR_MODES (INT, 64); /* V32QI V16HI V8SI V4DI. */ +VECTOR_MODES (FLOAT, 64); /* V8SF V4DF. */ + /* Quad float: 128-bit floating mode for long doubles. 
*/ FLOAT_MODE (TF, 16, ieee_quad_format); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 0902b06df..548d50e5d 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -178,6 +178,8 @@ bool aarch64_pad_arg_upward (enum machine_mode, const_tree); bool aarch64_pad_reg_upward (enum machine_mode, const_tree, bool); bool aarch64_regno_ok_for_base_p (int, bool); bool aarch64_regno_ok_for_index_p (int, bool); +bool aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode); +bool aarch64_simd_imm_zero_p (rtx, enum machine_mode); bool aarch64_simd_shift_imm_p (rtx, enum machine_mode, bool); bool aarch64_symbolic_address_p (rtx); bool aarch64_symbolic_constant_p (rtx, enum aarch64_symbol_context, @@ -193,15 +195,17 @@ enum reg_class aarch64_regno_regclass (unsigned); int aarch64_asm_preferred_eh_data_format (int, int); int aarch64_hard_regno_mode_ok (unsigned, enum machine_mode); int aarch64_hard_regno_nregs (unsigned, enum machine_mode); +int aarch64_simd_attr_length_move (rtx); int aarch64_simd_immediate_valid_for_move (rtx, enum machine_mode, rtx *, int *, unsigned char *, int *, int *); int aarch64_uxt_size (int, HOST_WIDE_INT); rtx aarch64_final_eh_return_addr (void); -rtx aarch64_legitimize_reload_address (rtx *, enum machine_mode, int, int, - int); +rtx aarch64_legitimize_reload_address (rtx *, enum machine_mode, int, int, int); +const char *aarch64_output_move_struct (rtx *operands); rtx aarch64_return_addr (int, rtx); rtx aarch64_simd_gen_const_vector_dup (enum machine_mode, int); +bool aarch64_simd_mem_operand_p (rtx); rtx aarch64_simd_vect_par_cnst_half (enum machine_mode, bool); rtx aarch64_tls_get_addr (void); unsigned aarch64_dbx_register_number (unsigned); @@ -225,6 +229,7 @@ void aarch64_print_operand_address (FILE *, rtx); void init_aarch64_simd_builtins (void); void aarch64_simd_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); +void aarch64_simd_disambiguate_copy (rtx *, rtx *, rtx *, unsigned int); /* Emit code to place a AdvSIMD pair result in memory locations (with equal registers). 
*/ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d3f8ef27a..a7ddfb1c1 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -21,7 +21,7 @@ ; Main data types used by the insntructions -(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,DI,DF,SI,HI,QI" +(define_attr "simd_mode" "unknown,none,V8QI,V16QI,V4HI,V8HI,V2SI,V4SI,V2DI,V2SF,V4SF,V2DF,OI,CI,XI,DI,DF,SI,HI,QI" (const_string "unknown")) @@ -309,8 +309,8 @@ (define_expand "mov" - [(set (match_operand:VALL 0 "nonimmediate_operand" "") - (match_operand:VALL 1 "general_operand" ""))] + [(set (match_operand:VALL 0 "aarch64_simd_nonimmediate_operand" "") + (match_operand:VALL 1 "aarch64_simd_general_operand" ""))] "TARGET_SIMD" " if (GET_CODE (operands[0]) == MEM) @@ -319,8 +319,8 @@ ) (define_expand "movmisalign" - [(set (match_operand:VALL 0 "nonimmediate_operand" "") - (match_operand:VALL 1 "general_operand" ""))] + [(set (match_operand:VALL 0 "aarch64_simd_nonimmediate_operand" "") + (match_operand:VALL 1 "aarch64_simd_general_operand" ""))] "TARGET_SIMD" { /* This pattern is not permitted to fail during expansion: if both arguments @@ -375,10 +375,10 @@ ) (define_insn "*aarch64_simd_mov" - [(set (match_operand:VD 0 "nonimmediate_operand" - "=w, m, w, ?r, ?w, ?r, w") - (match_operand:VD 1 "general_operand" - "m, w, w, w, r, r, Dn"))] + [(set (match_operand:VD 0 "aarch64_simd_nonimmediate_operand" + "=w, Utv, w, ?r, ?w, ?r, w") + (match_operand:VD 1 "aarch64_simd_general_operand" + "Utv, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" @@ -428,10 +428,10 @@ ) (define_insn "*aarch64_simd_mov" - [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, m, w, ?r, ?w, ?r, w") - (match_operand:VQ 1 "general_operand" - "m, w, w, w, r, r, Dn"))] + [(set (match_operand:VQ 0 "aarch64_simd_nonimmediate_operand" + "=w, Utv, w, ?r, ?w, ?r, w") + (match_operand:VQ 1 "aarch64_simd_general_operand" + "Utv, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" @@ -443,7 +443,7 @@ case 2: return "orr\t%0., %1., %1."; case 3: return "umov\t%0, %1.d[0]\;umov\t%H0, %1.d[1]"; case 4: return "ins\t%0.d[0], %1\;ins\t%0.d[1], %H1"; - case 5: return "mov\t%0, %1;mov\t%H0, %H1"; + case 5: return "#"; case 6: { int is_valid; @@ -475,6 +475,27 @@ (set_attr "length" "4,4,4,8,8,8,4")] ) +(define_split + [(set (match_operand:VQ 0 "register_operand" "") + (match_operand:VQ 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[2], src[2]; + + dest[0] = gen_rtx_REG (DImode, rdest); + src[0] = gen_rtx_REG (DImode, rsrc); + dest[1] = gen_rtx_REG (DImode, rdest + 1); + src[1] = gen_rtx_REG (DImode, rsrc + 1); + + aarch64_simd_disambiguate_copy (operands, dest, src, 2); +}) + (define_insn "orn3" [(set (match_operand:VDQ 0 "register_operand" "=w") (ior:VDQ (not:VDQ (match_operand:VDQ 1 "register_operand" "w")) @@ -1606,6 +1627,17 @@ ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
+(define_insn "*aarch64_combinez" + [(set (match_operand: 0 "register_operand" "=&w") + (vec_concat: + (match_operand:VDIC 1 "register_operand" "w") + (match_operand:VDIC 2 "aarch64_simd_imm_zero" "Dz")))] + "TARGET_SIMD" + "mov\\t%0.8b, %1.8b" + [(set_attr "simd_type" "simd_move") + (set_attr "simd_mode" "")] +) + (define_insn "aarch64_combine" [(set (match_operand: 0 "register_operand" "=&w") (vec_concat: (match_operand:VDC 1 "register_operand" "w") @@ -2670,7 +2702,7 @@ [(set (match_operand: 0 "register_operand" "=w,w") (unspec: [(match_operand:VSDQ_I_DI 1 "register_operand" "w,w") - (match_operand:VSDQ_I_DI 2 "nonmemory_operand" "w,Z")] + (match_operand:VSDQ_I_DI 2 "aarch64_simd_reg_or_zero" "w,Z")] VCMP_S))] "TARGET_SIMD" "@ @@ -2762,3 +2794,471 @@ emit_insn (gen_sqrt2 (operands[0], operands[1])); DONE; }) + + +;; Patterns for vector struct loads and stores. + +(define_insn "vec_load_lanesoi" + [(set (match_operand:OI 0 "register_operand" "=w") + (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD2))] + "TARGET_SIMD" + "ld2\\t{%S0. - %T0.}, %1" + [(set_attr "simd_type" "simd_load2") + (set_attr "simd_mode" "")]) + +(define_insn "vec_store_lanesoi" + [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:OI [(match_operand:OI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST2))] + "TARGET_SIMD" + "st2\\t{%S1. - %T1.}, %0" + [(set_attr "simd_type" "simd_store2") + (set_attr "simd_mode" "")]) + +(define_insn "vec_load_lanesci" + [(set (match_operand:CI 0 "register_operand" "=w") + (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD3))] + "TARGET_SIMD" + "ld3\\t{%S0. - %U0.}, %1" + [(set_attr "simd_type" "simd_load3") + (set_attr "simd_mode" "")]) + +(define_insn "vec_store_lanesci" + [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:CI [(match_operand:CI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST3))] + "TARGET_SIMD" + "st3\\t{%S1. - %U1.}, %0" + [(set_attr "simd_type" "simd_store3") + (set_attr "simd_mode" "")]) + +(define_insn "vec_load_lanesxi" + [(set (match_operand:XI 0 "register_operand" "=w") + (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD4))] + "TARGET_SIMD" + "ld4\\t{%S0. - %V0.}, %1" + [(set_attr "simd_type" "simd_load4") + (set_attr "simd_mode" "")]) + +(define_insn "vec_store_lanesxi" + [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:XI [(match_operand:XI 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST4))] + "TARGET_SIMD" + "st4\\t{%S1. - %V1.}, %0" + [(set_attr "simd_type" "simd_store4") + (set_attr "simd_mode" "")]) + +;; Reload patterns for AdvSIMD register list operands. 
+ +(define_expand "mov" + [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "") + (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" ""))] + "TARGET_SIMD" +{ + if (can_create_pseudo_p ()) + { + if (GET_CODE (operands[0]) != REG) + operands[1] = force_reg (mode, operands[1]); + } +}) + +(define_insn "*aarch64_mov" + [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w") + (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))] + "TARGET_SIMD + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" + +{ + switch (which_alternative) + { + case 0: return "#"; + case 1: return "st1\\t{%S1.16b - %1.16b}, %0"; + case 2: return "ld1\\t{%S0.16b - %0.16b}, %1"; + default: gcc_unreachable (); + } +} + [(set_attr "simd_type" "simd_move,simd_store,simd_load") + (set (attr "length") (symbol_ref "aarch64_simd_attr_length_move (insn)")) + (set_attr "simd_mode" "")]) + +(define_split + [(set (match_operand:OI 0 "register_operand" "") + (match_operand:OI 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[2], src[2]; + + dest[0] = gen_rtx_REG (TFmode, rdest); + src[0] = gen_rtx_REG (TFmode, rsrc); + dest[1] = gen_rtx_REG (TFmode, rdest + 1); + src[1] = gen_rtx_REG (TFmode, rsrc + 1); + + aarch64_simd_disambiguate_copy (operands, dest, src, 2); +}) + +(define_split + [(set (match_operand:CI 0 "register_operand" "") + (match_operand:CI 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[3], src[3]; + + dest[0] = gen_rtx_REG (TFmode, rdest); + src[0] = gen_rtx_REG (TFmode, rsrc); + dest[1] = gen_rtx_REG (TFmode, rdest + 1); + src[1] = gen_rtx_REG (TFmode, rsrc + 1); + dest[2] = gen_rtx_REG (TFmode, rdest + 2); + src[2] = gen_rtx_REG (TFmode, rsrc + 2); + + aarch64_simd_disambiguate_copy (operands, dest, src, 3); +}) + +(define_split + [(set (match_operand:XI 0 "register_operand" "") + (match_operand:XI 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed" + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 5)) + (set (match_dup 6) (match_dup 7))] +{ + int rdest = REGNO (operands[0]); + int rsrc = REGNO (operands[1]); + rtx dest[4], src[4]; + + dest[0] = gen_rtx_REG (TFmode, rdest); + src[0] = gen_rtx_REG (TFmode, rsrc); + dest[1] = gen_rtx_REG (TFmode, rdest + 1); + src[1] = gen_rtx_REG (TFmode, rsrc + 1); + dest[2] = gen_rtx_REG (TFmode, rdest + 2); + src[2] = gen_rtx_REG (TFmode, rsrc + 2); + dest[3] = gen_rtx_REG (TFmode, rdest + 3); + src[3] = gen_rtx_REG (TFmode, rsrc + 3); + + aarch64_simd_disambiguate_copy (operands, dest, src, 4); +}) + +(define_insn "aarch64_ld2_dreg" + [(set (match_operand:OI 0 "register_operand" "=w") + (subreg:OI + (vec_concat: + (vec_concat: + (unspec:VD [(match_operand:TI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD2) + (vec_duplicate:VD (const_int 0))) + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD2) + (vec_duplicate:VD (const_int 0)))) 0))] + "TARGET_SIMD" + "ld2\\t{%S0. 
- %T0.}, %1" + [(set_attr "simd_type" "simd_load2") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_ld2_dreg" + [(set (match_operand:OI 0 "register_operand" "=w") + (subreg:OI + (vec_concat: + (vec_concat: + (unspec:DX [(match_operand:TI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD2) + (const_int 0)) + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD2) + (const_int 0))) 0))] + "TARGET_SIMD" + "ld1\\t{%S0.1d - %T0.1d}, %1" + [(set_attr "simd_type" "simd_load2") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_ld3_dreg" + [(set (match_operand:CI 0 "register_operand" "=w") + (subreg:CI + (vec_concat: + (vec_concat: + (vec_concat: + (unspec:VD [(match_operand:EI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD3) + (vec_duplicate:VD (const_int 0))) + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD3) + (vec_duplicate:VD (const_int 0)))) + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD3) + (vec_duplicate:VD (const_int 0)))) 0))] + "TARGET_SIMD" + "ld3\\t{%S0. - %U0.}, %1" + [(set_attr "simd_type" "simd_load3") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_ld3_dreg" + [(set (match_operand:CI 0 "register_operand" "=w") + (subreg:CI + (vec_concat: + (vec_concat: + (vec_concat: + (unspec:DX [(match_operand:EI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD3) + (const_int 0)) + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD3) + (const_int 0))) + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD3) + (const_int 0))) 0))] + "TARGET_SIMD" + "ld1\\t{%S0.1d - %U0.1d}, %1" + [(set_attr "simd_type" "simd_load3") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_ld4_dreg" + [(set (match_operand:XI 0 "register_operand" "=w") + (subreg:XI + (vec_concat: + (vec_concat: + (vec_concat: + (unspec:VD [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD4) + (vec_duplicate:VD (const_int 0))) + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD4) + (vec_duplicate:VD (const_int 0)))) + (vec_concat: + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD4) + (vec_duplicate:VD (const_int 0))) + (vec_concat: + (unspec:VD [(match_dup 1)] + UNSPEC_LD4) + (vec_duplicate:VD (const_int 0))))) 0))] + "TARGET_SIMD" + "ld4\\t{%S0. 
- %V0.}, %1" + [(set_attr "simd_type" "simd_load4") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_ld4_dreg" + [(set (match_operand:XI 0 "register_operand" "=w") + (subreg:XI + (vec_concat: + (vec_concat: + (vec_concat: + (unspec:DX [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")] + UNSPEC_LD4) + (const_int 0)) + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD4) + (const_int 0))) + (vec_concat: + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD4) + (const_int 0)) + (vec_concat: + (unspec:DX [(match_dup 1)] + UNSPEC_LD4) + (const_int 0)))) 0))] + "TARGET_SIMD" + "ld1\\t{%S0.1d - %V0.1d}, %1" + [(set_attr "simd_type" "simd_load4") + (set_attr "simd_mode" "")]) + +(define_expand "aarch64_ld" + [(match_operand:VSTRUCT 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + enum machine_mode mode = mode; + rtx mem = gen_rtx_MEM (mode, operands[1]); + + emit_insn (gen_aarch64_ld_dreg (operands[0], mem)); + DONE; +}) + +(define_expand "aarch64_ld" + [(match_operand:VSTRUCT 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + enum machine_mode mode = mode; + rtx mem = gen_rtx_MEM (mode, operands[1]); + + emit_insn (gen_vec_load_lanes (operands[0], mem)); + DONE; +}) + +;; Expanders for builtins to extract vector registers from large +;; opaque integer modes. + +;; D-register list. + +(define_expand "aarch64_get_dreg" + [(match_operand:VDC 0 "register_operand" "=w") + (match_operand:VSTRUCT 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_SIMD" +{ + int part = INTVAL (operands[2]); + rtx temp = gen_reg_rtx (mode); + int offset = part * 16; + + emit_move_insn (temp, gen_rtx_SUBREG (mode, operands[1], offset)); + emit_move_insn (operands[0], gen_lowpart (mode, temp)); + DONE; +}) + +;; Q-register list. + +(define_expand "aarch64_get_qreg" + [(match_operand:VQ 0 "register_operand" "=w") + (match_operand:VSTRUCT 1 "register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + "TARGET_SIMD" +{ + int part = INTVAL (operands[2]); + int offset = part * 16; + + emit_move_insn (operands[0], + gen_rtx_SUBREG (mode, operands[1], offset)); + DONE; +}) + +;; Permuted-store expanders for neon intrinsics. + +(define_insn "aarch64_st2_dreg" + [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:TI [(match_operand:OI 1 "register_operand" "w") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST2))] + "TARGET_SIMD" + "st2\\t{%S1. - %T1.}, %0" + [(set_attr "simd_type" "simd_store2") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_st2_dreg" + [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:TI [(match_operand:OI 1 "register_operand" "w") + (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST2))] + "TARGET_SIMD" + "st1\\t{%S1.1d - %T1.1d}, %0" + [(set_attr "simd_type" "simd_store2") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_st3_dreg" + [(set (match_operand:EI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:EI [(match_operand:CI 1 "register_operand" "w") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST3))] + "TARGET_SIMD" + "st3\\t{%S1. 
- %U1.}, %0" + [(set_attr "simd_type" "simd_store3") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_st3_dreg" + [(set (match_operand:EI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:EI [(match_operand:CI 1 "register_operand" "w") + (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST3))] + "TARGET_SIMD" + "st1\\t{%S1.1d - %U1.1d}, %0" + [(set_attr "simd_type" "simd_store3") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_st4_dreg" + [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:OI [(match_operand:XI 1 "register_operand" "w") + (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST4))] + "TARGET_SIMD" + "st4\\t{%S1. - %V1.}, %0" + [(set_attr "simd_type" "simd_store4") + (set_attr "simd_mode" "")]) + +(define_insn "aarch64_st4_dreg" + [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:OI [(match_operand:XI 1 "register_operand" "w") + (unspec:DX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST4))] + "TARGET_SIMD" + "st1\\t{%S1.1d - %V1.1d}, %0" + [(set_attr "simd_type" "simd_store4") + (set_attr "simd_mode" "")]) + +(define_expand "aarch64_st" + [(match_operand:DI 0 "register_operand" "r") + (match_operand:VSTRUCT 1 "register_operand" "w") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + enum machine_mode mode = mode; + rtx mem = gen_rtx_MEM (mode, operands[0]); + + emit_insn (gen_aarch64_st_dreg (mem, operands[1])); + DONE; +}) + +(define_expand "aarch64_st" + [(match_operand:DI 0 "register_operand" "r") + (match_operand:VSTRUCT 1 "register_operand" "w") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + enum machine_mode mode = mode; + rtx mem = gen_rtx_MEM (mode, operands[0]); + + emit_insn (gen_vec_store_lanes (mem, operands[1])); + DONE; +}) + +;; Expander for builtins to insert vector registers into large +;; opaque integer modes. + +;; Q-register list. We don't need a D-reg inserter as we zero +;; extend them in arm_neon.h and insert the resulting Q-regs. + +(define_expand "aarch64_set_qreg" + [(match_operand:VSTRUCT 0 "register_operand" "+w") + (match_operand:VSTRUCT 1 "register_operand" "0") + (match_operand:VQ 2 "register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + int part = INTVAL (operands[3]); + int offset = part * 16; + + emit_move_insn (operands[0], operands[1]); + emit_move_insn (gen_rtx_SUBREG (mode, operands[0], offset), + operands[2]); + DONE; +}) + diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b2867ef3b..fbee64dde 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -295,6 +295,34 @@ aarch64_dbx_register_number (unsigned regno) return DWARF_FRAME_REGISTERS; } +/* Return TRUE if MODE is any of the large INT modes. */ +static bool +aarch64_vect_struct_mode_p (enum machine_mode mode) +{ + return mode == OImode || mode == CImode || mode == XImode; +} + +/* Return TRUE if MODE is any of the vector modes. */ +static bool +aarch64_vector_mode_p (enum machine_mode mode) +{ + return aarch64_vector_mode_supported_p (mode) + || aarch64_vect_struct_mode_p (mode); +} + +/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */ +static bool +aarch64_array_mode_supported_p (enum machine_mode mode, + unsigned HOST_WIDE_INT nelems) +{ + if (TARGET_SIMD + && AARCH64_VALID_SIMD_QREG_MODE (mode) + && (nelems >= 2 && nelems <= 4)) + return true; + + return false; +} + /* Implement HARD_REGNO_NREGS. 
*/ int @@ -323,11 +351,17 @@ aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode) || regno == ARG_POINTER_REGNUM) return mode == Pmode; - if (GP_REGNUM_P (regno)) + if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode)) return 1; if (FP_REGNUM_P (regno)) - return 1; + { + if (aarch64_vect_struct_mode_p (mode)) + return + (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM; + else + return 1; + } return 0; } @@ -2729,7 +2763,7 @@ aarch64_classify_address (struct aarch64_address_info *info, /* Don't support anything other than POST_INC or REG addressing for AdvSIMD. */ - if (aarch64_vector_mode_supported_p (mode) + if (aarch64_vector_mode_p (mode) && (code != POST_INC && code != REG)) return false; @@ -3252,6 +3286,20 @@ aarch64_print_operand (FILE *f, rtx x, char code) asm_fprintf (f, "%s%c%d", REGISTER_PREFIX, code, REGNO (x) - V0_REGNUM); break; + case 'S': + case 'T': + case 'U': + case 'V': + /* Print the first FP/SIMD register name in a list. */ + if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) + { + output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); + return; + } + asm_fprintf (f, "%sv%d", REGISTER_PREFIX, + REGNO (x) - V0_REGNUM + (code - 'S')); + break; + case 'w': case 'x': /* Print a general register name or the zero register (32-bit or @@ -3560,7 +3608,7 @@ aarch64_legitimize_reload_address (rtx *x_p, rtx x = *x_p; /* Do not allow mem (plus (reg, const)) if vector mode. */ - if (aarch64_vector_mode_supported_p (mode) + if (aarch64_vector_mode_p (mode) && GET_CODE (x) == PLUS && REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) @@ -3834,8 +3882,9 @@ aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode) case ALL_REGS: case FP_REGS: case FP_LO_REGS: - return (GET_MODE_SIZE (mode) + 7) / 8; - + return + aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 : + (GET_MODE_SIZE (mode) + 7) / 8; case STACK_REG: return 1; @@ -4895,6 +4944,11 @@ aarch64_legitimate_pic_operand_p (rtx x) static bool aarch64_legitimate_constant_p (enum machine_mode mode, rtx x) { + /* Do not allow vector struct mode constants. We could support + 0 and -1 easily, but they need support in aarch64-simd.md. */ + if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode)) + return false; + /* This could probably go away because we now decompose CONST_INTs according to expand_mov_immediate. */ if ((GET_CODE (x) == CONST_VECTOR @@ -6440,6 +6494,41 @@ aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left) return aarch64_const_vec_all_same_int_p (x, 1, bit_width); } +bool +aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode) +{ + int nunits; + int i; + + if (GET_CODE (x) != CONST_VECTOR) + return false; + + nunits = GET_MODE_NUNITS (mode); + + for (i = 0; i < nunits; i++) + if (INTVAL (CONST_VECTOR_ELT (x, i)) != 0) + return false; + + return true; +} + +bool +aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED) +{ + HOST_WIDE_INT imm = INTVAL (x); + int i; + + for (i = 0; i < 8; i++) + { + unsigned int byte = imm & 0xff; + if (byte != 0xff && byte != 0) + return false; + imm >>= 8; + } + + return true; +} + /* Return a const_int vector of VAL. */ rtx aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val) @@ -6520,6 +6609,82 @@ aarch64_simd_emit_pair_result_insn (enum machine_mode mode, emit_move_insn (mem, tmp2); } +/* Return TRUE if OP is a valid vector addressing mode. 
*/ +bool +aarch64_simd_mem_operand_p (rtx op) +{ + return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC + || GET_CODE (XEXP (op, 0)) == REG); +} + +/* Set up OPERANDS for a register copy from SRC to DEST, taking care + not to early-clobber SRC registers in the process. + + We assume that the operands described by SRC and DEST represent a + decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the + number of components into which the copy has been decomposed. */ +void +aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest, + rtx *src, unsigned int count) +{ + unsigned int i; + + if (!reg_overlap_mentioned_p (operands[0], operands[1]) + || REGNO (operands[0]) < REGNO (operands[1])) + { + for (i = 0; i < count; i++) + { + operands[2 * i] = dest[i]; + operands[2 * i + 1] = src[i]; + } + } + else + { + for (i = 0; i < count; i++) + { + operands[2 * i] = dest[count - i - 1]; + operands[2 * i + 1] = src[count - i - 1]; + } + } +} + +/* Compute and return the length of aarch64_simd_mov, where is + one of VSTRUCT modes: OI, CI or XI. */ +int +aarch64_simd_attr_length_move (rtx insn) +{ + rtx reg, mem, addr; + int load; + enum machine_mode mode; + + extract_insn_cached (insn); + + if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1])) + { + mode = GET_MODE (recog_data.operand[0]); + switch (mode) + { + case OImode: + return 8; + case CImode: + return 12; + case XImode: + return 16; + default: + gcc_unreachable (); + } + } + return 4; +} + +static unsigned HOST_WIDE_INT +aarch64_shift_truncation_mask (enum machine_mode mode) +{ + return + (aarch64_vector_mode_supported_p (mode) + || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1); +} + #ifndef TLS_SECTION_ASM_FLAG #define TLS_SECTION_ASM_FLAG 'T' #endif @@ -6773,6 +6938,9 @@ aarch64_c_mode_for_suffix (char suffix) #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload +#undef TARGET_SHIFT_TRUNCATION_MASK +#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask + #undef TARGET_SETUP_INCOMING_VARARGS #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs @@ -6800,6 +6968,9 @@ aarch64_c_mode_for_suffix (char suffix) #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p +#undef TARGET_ARRAY_MODE_SUPPORTED_P +#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p + #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index b4d3d7dd3..a069a4139 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -788,7 +788,7 @@ enum aarch64_builtins : 0) -#define SHIFT_COUNT_TRUNCATED 1 +#define SHIFT_COUNT_TRUNCATED !TARGET_SIMD /* Callee only saves lower 64-bits of a 128-bit register. Tell the compiler the callee clobbers the top 64-bits when restoring the @@ -811,4 +811,9 @@ extern enum aarch64_code_model aarch64_cmodel; (aarch64_cmodel == AARCH64_CMODEL_TINY \ || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) +/* Modes valid for AdvSIMD Q registers. 
*/ +#define AARCH64_VALID_SIMD_QREG_MODE(MODE) \ + ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \ + || (MODE) == V4SFmode || (MODE) == V2DImode || mode == V2DFmode) + #endif /* GCC_AARCH64_H */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index e6086a91d..38cbfcdff 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -76,12 +76,19 @@ UNSPEC_FRINTZ UNSPEC_GOTSMALLPIC UNSPEC_GOTSMALLTLS + UNSPEC_LD2 + UNSPEC_LD3 + UNSPEC_LD4 UNSPEC_MB UNSPEC_NOP UNSPEC_PRLG_STK UNSPEC_RBIT + UNSPEC_ST2 + UNSPEC_ST3 + UNSPEC_ST4 UNSPEC_TLS UNSPEC_TLSDESC + UNSPEC_VSTRUCTDUMMY ]) (define_c_enum "unspecv" [ @@ -864,8 +871,8 @@ ) (define_insn "*movdi_aarch64" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,m, r, r, *w, r,*w") - (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,m,rZ,Usa,Ush,rZ,*w,*w"))] + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,m, r, r, *w, r,*w,w") + (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,m,rZ,Usa,Ush,rZ,*w,*w,Dd"))] "(register_operand (operands[0], DImode) || aarch64_reg_or_zero (operands[1], DImode))" "@ @@ -879,10 +886,11 @@ adrp\\t%x0, %A1 fmov\\t%d0, %x1 fmov\\t%x0, %d1 - fmov\\t%d0, %d1" - [(set_attr "v8type" "move,move,move,alu,load1,store1,adr,adr,fmov,fmov,fmov") + fmov\\t%d0, %d1 + movi\\t%d0, %1" + [(set_attr "v8type" "move,move,move,alu,load1,store1,adr,adr,fmov,fmov,fmov,fmov") (set_attr "mode" "DI") - (set_attr "fp" "*,*,*,*,*,*,*,*,yes,yes,yes")] + (set_attr "fp" "*,*,*,*,*,*,*,*,yes,yes,yes,yes")] ) (define_insn "insv_imm" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 46abaf6f4..a092dfff9 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -7859,15 +7859,16 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) result; \ }) -#define vfmaq_lane_f64(a, b, c) \ +#define vfmaq_lane_f64(a, b, c, d) \ __extension__ \ ({ \ + float64x2_t c_ = (c); \ float64x2_t b_ = (b); \ float64x2_t a_ = (a); \ float64x2_t result; \ - __asm__ ("fmla %0.2d,%1.2d,%2.d[%3]" \ + __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) @@ -11752,6 +11753,12 @@ vmovq_n_f32 (float32_t a) return result; } +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmovq_n_f64 (float64_t a) +{ + return (float64x2_t) {a, a}; +} + __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) vmovq_n_p8 (uint32_t a) { @@ -15334,7 +15341,7 @@ vrndqp_f64 (float64x2_t a) int8x8_t a_ = (a); \ int8x16_t result = vcombine_s8 \ (a_, vcreate_s8 (UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.16b,%2.8h,#%3" \ + __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -15348,7 +15355,7 @@ vrndqp_f64 (float64x2_t a) int16x4_t a_ = (a); \ int16x8_t result = vcombine_s16 \ (a_, vcreate_s16 (UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.8h,%2.4s,#%3" \ + __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -15362,7 +15369,7 @@ vrndqp_f64 (float64x2_t a) int32x2_t a_ = (a); \ int32x4_t result = vcombine_s32 \ (a_, vcreate_s32 (UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.4s,%2.2d,#%3" \ + __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -15376,7 +15383,7 @@ vrndqp_f64 (float64x2_t a) uint8x8_t a_ = (a); \ uint8x16_t result = vcombine_u8 \ (a_, vcreate_u8 (UINT64_C 
(0x0))); \ - __asm__ ("rshrn2 %0.16b,%2.8h,#%3" \ + __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -15390,7 +15397,7 @@ vrndqp_f64 (float64x2_t a) uint16x4_t a_ = (a); \ uint16x8_t result = vcombine_u16 \ (a_, vcreate_u16 (UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.8h,%2.4s,#%3" \ + __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -15404,7 +15411,7 @@ vrndqp_f64 (float64x2_t a) uint32x2_t a_ = (a); \ uint32x4_t result = vcombine_u32 \ (a_, vcreate_u32 (UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.4s,%2.2d,#%3" \ + __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16088,7 +16095,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) int8x8_t a_ = (a); \ int8x16_t result = vcombine_s8 \ (a_, vcreate_s8 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.16b,%2.8h,#%3" \ + __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16102,7 +16109,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) int16x4_t a_ = (a); \ int16x8_t result = vcombine_s16 \ (a_, vcreate_s16 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.8h,%2.4s,#%3" \ + __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16116,7 +16123,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) int32x2_t a_ = (a); \ int32x4_t result = vcombine_s32 \ (a_, vcreate_s32 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.4s,%2.2d,#%3" \ + __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16130,7 +16137,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) uint8x8_t a_ = (a); \ uint8x16_t result = vcombine_u8 \ (a_, vcreate_u8 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.16b,%2.8h,#%3" \ + __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16144,7 +16151,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) uint16x4_t a_ = (a); \ uint16x8_t result = vcombine_u16 \ (a_, vcreate_u16 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.8h,%2.4s,#%3" \ + __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -16158,7 +16165,7 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) uint32x2_t a_ = (a); \ uint32x4_t result = vcombine_u32 \ (a_, vcreate_u32 (UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.4s,%2.2d,#%3" \ + __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ : "+w"(result) \ : "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -18386,165 +18393,6 @@ vzip2q_u64 (uint64x2_t a, uint64x2_t b) /* Start of temporary inline asm for vldn, vstn and friends. */ -#define __LD2_FUNC(rettype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld2 ## Q ## _ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld2 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ - "st1 {v16." #regsuffix ", v17." 
#regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17"); \ - return result; \ - } - -#define __LD2_64x1_FUNC(rettype, ptrtype, funcsuffix) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld2_ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16.1d, v17.1d}, %1\n\t" \ - "st1 {v16.1d, v17.1d}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17"); \ - return result; \ - } - -__LD2_FUNC (float32x2x2_t, float32_t, 2s, f32,) -__LD2_64x1_FUNC (float64x1x2_t, float64_t, f64) -__LD2_FUNC (poly8x8x2_t, poly8_t, 8b, p8,) -__LD2_FUNC (poly16x4x2_t, poly16_t, 4h, p16,) -__LD2_FUNC (int8x8x2_t, int8_t, 8b, s8,) -__LD2_FUNC (int16x4x2_t, int16_t, 4h, s16,) -__LD2_FUNC (int32x2x2_t, int32_t, 2s, s32,) -__LD2_64x1_FUNC (int64x1x2_t, int64_t, s64) -__LD2_FUNC (uint8x8x2_t, uint8_t, 8b, u8,) -__LD2_FUNC (uint16x4x2_t, uint16_t, 4h, u16,) -__LD2_FUNC (uint32x2x2_t, uint32_t, 2s, u32,) -__LD2_64x1_FUNC (uint64x1x2_t, uint64_t, u64) -__LD2_FUNC (float32x4x2_t, float32_t, 4s, f32, q) -__LD2_FUNC (float64x2x2_t, float64_t, 2d, f64, q) -__LD2_FUNC (poly8x16x2_t, poly8_t, 16b, p8, q) -__LD2_FUNC (poly16x8x2_t, poly16_t, 8h, p16, q) -__LD2_FUNC (int8x16x2_t, int8_t, 16b, s8, q) -__LD2_FUNC (int16x8x2_t, int16_t, 8h, s16, q) -__LD2_FUNC (int32x4x2_t, int32_t, 4s, s32, q) -__LD2_FUNC (int64x2x2_t, int64_t, 2d, s64, q) -__LD2_FUNC (uint8x16x2_t, uint8_t, 16b, u8, q) -__LD2_FUNC (uint16x8x2_t, uint16_t, 8h, u16, q) -__LD2_FUNC (uint32x4x2_t, uint32_t, 4s, u32, q) -__LD2_FUNC (uint64x2x2_t, uint64_t, 2d, u64, q) - -#define __LD3_FUNC(rettype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld3 ## Q ## _ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld3 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "st1 {v16." #regsuffix " - v18." 
#regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17", "v18"); \ - return result; \ - } - -#define __LD3_64x1_FUNC(rettype, ptrtype, funcsuffix) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld3_ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16.1d - v18.1d}, %1\n\t" \ - "st1 {v16.1d - v18.1d}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17", "v18"); \ - return result; \ - } - -__LD3_FUNC (float32x2x3_t, float32_t, 2s, f32,) -__LD3_64x1_FUNC (float64x1x3_t, float64_t, f64) -__LD3_FUNC (poly8x8x3_t, poly8_t, 8b, p8,) -__LD3_FUNC (poly16x4x3_t, poly16_t, 4h, p16,) -__LD3_FUNC (int8x8x3_t, int8_t, 8b, s8,) -__LD3_FUNC (int16x4x3_t, int16_t, 4h, s16,) -__LD3_FUNC (int32x2x3_t, int32_t, 2s, s32,) -__LD3_64x1_FUNC (int64x1x3_t, int64_t, s64) -__LD3_FUNC (uint8x8x3_t, uint8_t, 8b, u8,) -__LD3_FUNC (uint16x4x3_t, uint16_t, 4h, u16,) -__LD3_FUNC (uint32x2x3_t, uint32_t, 2s, u32,) -__LD3_64x1_FUNC (uint64x1x3_t, uint64_t, u64) -__LD3_FUNC (float32x4x3_t, float32_t, 4s, f32, q) -__LD3_FUNC (float64x2x3_t, float64_t, 2d, f64, q) -__LD3_FUNC (poly8x16x3_t, poly8_t, 16b, p8, q) -__LD3_FUNC (poly16x8x3_t, poly16_t, 8h, p16, q) -__LD3_FUNC (int8x16x3_t, int8_t, 16b, s8, q) -__LD3_FUNC (int16x8x3_t, int16_t, 8h, s16, q) -__LD3_FUNC (int32x4x3_t, int32_t, 4s, s32, q) -__LD3_FUNC (int64x2x3_t, int64_t, 2d, s64, q) -__LD3_FUNC (uint8x16x3_t, uint8_t, 16b, u8, q) -__LD3_FUNC (uint16x8x3_t, uint16_t, 8h, u16, q) -__LD3_FUNC (uint32x4x3_t, uint32_t, 4s, u32, q) -__LD3_FUNC (uint64x2x3_t, uint64_t, 2d, u64, q) - -#define __LD4_FUNC(rettype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld4 ## Q ## _ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld4 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "st1 {v16." #regsuffix " - v19." 
#regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17", "v18", "v19"); \ - return result; \ - } - -#define __LD4_64x1_FUNC(rettype, ptrtype, funcsuffix) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld4_ ## funcsuffix (const ptrtype *ptr) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16.1d - v19.1d}, %1\n\t" \ - "st1 {v16.1d - v19.1d}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(*(const rettype *)ptr) \ - : "memory", "v16", "v17", "v18", "v19"); \ - return result; \ - } - -__LD4_FUNC (float32x2x4_t, float32_t, 2s, f32,) -__LD4_64x1_FUNC (float64x1x4_t, float64_t, f64) -__LD4_FUNC (poly8x8x4_t, poly8_t, 8b, p8,) -__LD4_FUNC (poly16x4x4_t, poly16_t, 4h, p16,) -__LD4_FUNC (int8x8x4_t, int8_t, 8b, s8,) -__LD4_FUNC (int16x4x4_t, int16_t, 4h, s16,) -__LD4_FUNC (int32x2x4_t, int32_t, 2s, s32,) -__LD4_64x1_FUNC (int64x1x4_t, int64_t, s64) -__LD4_FUNC (uint8x8x4_t, uint8_t, 8b, u8,) -__LD4_FUNC (uint16x4x4_t, uint16_t, 4h, u16,) -__LD4_FUNC (uint32x2x4_t, uint32_t, 2s, u32,) -__LD4_64x1_FUNC (uint64x1x4_t, uint64_t, u64) -__LD4_FUNC (float32x4x4_t, float32_t, 4s, f32, q) -__LD4_FUNC (float64x2x4_t, float64_t, 2d, f64, q) -__LD4_FUNC (poly8x16x4_t, poly8_t, 16b, p8, q) -__LD4_FUNC (poly16x8x4_t, poly16_t, 8h, p16, q) -__LD4_FUNC (int8x16x4_t, int8_t, 16b, s8, q) -__LD4_FUNC (int16x8x4_t, int16_t, 8h, s16, q) -__LD4_FUNC (int32x4x4_t, int32_t, 4s, s32, q) -__LD4_FUNC (int64x2x4_t, int64_t, 2d, s64, q) -__LD4_FUNC (uint8x16x4_t, uint8_t, 16b, u8, q) -__LD4_FUNC (uint16x8x4_t, uint16_t, 8h, u16, q) -__LD4_FUNC (uint32x4x4_t, uint32_t, 4s, u32, q) -__LD4_FUNC (uint64x2x4_t, uint64_t, 2d, u64, q) - /* Create struct element types for duplicating loads. Create 2 element structures of: @@ -18870,54 +18718,6 @@ __LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) __LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) __LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) -#define __ST2_FUNC(intype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst2 ## Q ## _ ## funcsuffix (ptrtype *ptr, intype b) \ - { \ - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ - "st2 {v16." #regsuffix ", v17." 
#regsuffix "}, %0\n\t" \ - :"=Q"(*(intype *)ptr) \ - :"Q"(b) \ - :"v16", "v17", "memory"); \ - } -#define __ST2_64x1_FUNC(intype, ptrtype, funcsuffix) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst2_ ## funcsuffix (ptrtype *ptr, intype b) \ - { \ - __asm__ ("ld1 {v16.1d - v17.1d}, %1\n\t" \ - "st1 {v16.1d - v17.1d}, %0\n\t" \ - :"=Q"(*(intype *)ptr) \ - :"Q"(b) \ - :"v16", "v17", "memory"); \ - } - -__ST2_FUNC (float32x2x2_t, float32_t, 2s, f32,) -__ST2_64x1_FUNC (float64x1x2_t, float64_t, f64) -__ST2_FUNC (poly8x8x2_t, poly8_t, 8b, p8,) -__ST2_FUNC (poly16x4x2_t, poly16_t, 4h, p16,) -__ST2_FUNC (int8x8x2_t, int8_t, 8b, s8,) -__ST2_FUNC (int16x4x2_t, int16_t, 4h, s16,) -__ST2_FUNC (int32x2x2_t, int32_t, 2s, s32,) -__ST2_64x1_FUNC (int64x1x2_t, int64_t, s64) -__ST2_FUNC (uint8x8x2_t, uint8_t, 8b, u8,) -__ST2_FUNC (uint16x4x2_t, uint16_t, 4h, u16,) -__ST2_FUNC (uint32x2x2_t, uint32_t, 2s, u32,) -__ST2_64x1_FUNC (uint64x1x2_t, uint64_t, u64) -__ST2_FUNC (float32x4x2_t, float32_t, 4s, f32, q) -__ST2_FUNC (float64x2x2_t, float64_t, 2d, f64, q) -__ST2_FUNC (poly8x16x2_t, poly8_t, 16b, p8, q) -__ST2_FUNC (poly16x8x2_t, poly16_t, 8h, p16, q) -__ST2_FUNC (int8x16x2_t, int8_t, 16b, s8, q) -__ST2_FUNC (int16x8x2_t, int16_t, 8h, s16, q) -__ST2_FUNC (int32x4x2_t, int32_t, 4s, s32, q) -__ST2_FUNC (int64x2x2_t, int64_t, 2d, s64, q) -__ST2_FUNC (uint8x16x2_t, uint8_t, 16b, u8, q) -__ST2_FUNC (uint16x8x2_t, uint16_t, 8h, u16, q) -__ST2_FUNC (uint32x4x2_t, uint32_t, 4s, u32, q) -__ST2_FUNC (uint64x2x2_t, uint64_t, 2d, u64, q) - #define __ST2_LANE_FUNC(intype, ptrtype, regsuffix, \ lnsuffix, funcsuffix, Q) \ __extension__ static __inline void \ @@ -18957,54 +18757,6 @@ __ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) __ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) __ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) -#define __ST3_FUNC(intype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst3 ## Q ## _ ## funcsuffix (ptrtype *ptr, intype b) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "st3 {v16." #regsuffix " - v18." 
#regsuffix "}, %0\n\t" \ - :"=Q"(*(intype *)ptr) \ - :"Q"(b) \ - :"v16", "v17", "v18", "memory"); \ - } -#define __ST3_64x1_FUNC(intype, ptrtype, funcsuffix) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst3_ ## funcsuffix (ptrtype *ptr, intype b) \ - { \ - __asm__ ("ld1 {v16.1d - v18.1d}, %1\n\t" \ - "st1 {v16.1d - v18.1d}, %0\n\t" \ - :"=Q"(*(intype *)ptr) \ - :"Q"(b) \ - :"v16", "v17", "v18", "memory"); \ - } - -__ST3_FUNC (float32x2x3_t, float32_t, 2s, f32,) -__ST3_64x1_FUNC (float64x1x3_t, float64_t, f64) -__ST3_FUNC (poly8x8x3_t, poly8_t, 8b, p8,) -__ST3_FUNC (poly16x4x3_t, poly16_t, 4h, p16,) -__ST3_FUNC (int8x8x3_t, int8_t, 8b, s8,) -__ST3_FUNC (int16x4x3_t, int16_t, 4h, s16,) -__ST3_FUNC (int32x2x3_t, int32_t, 2s, s32,) -__ST3_64x1_FUNC (int64x1x3_t, int64_t, s64) -__ST3_FUNC (uint8x8x3_t, uint8_t, 8b, u8,) -__ST3_FUNC (uint16x4x3_t, uint16_t, 4h, u16,) -__ST3_FUNC (uint32x2x3_t, uint32_t, 2s, u32,) -__ST3_64x1_FUNC (uint64x1x3_t, uint64_t, u64) -__ST3_FUNC (float32x4x3_t, float32_t, 4s, f32, q) -__ST3_FUNC (float64x2x3_t, float64_t, 2d, f64, q) -__ST3_FUNC (poly8x16x3_t, poly8_t, 16b, p8, q) -__ST3_FUNC (poly16x8x3_t, poly16_t, 8h, p16, q) -__ST3_FUNC (int8x16x3_t, int8_t, 16b, s8, q) -__ST3_FUNC (int16x8x3_t, int16_t, 8h, s16, q) -__ST3_FUNC (int32x4x3_t, int32_t, 4s, s32, q) -__ST3_FUNC (int64x2x3_t, int64_t, 2d, s64, q) -__ST3_FUNC (uint8x16x3_t, uint8_t, 16b, u8, q) -__ST3_FUNC (uint16x8x3_t, uint16_t, 8h, u16, q) -__ST3_FUNC (uint32x4x3_t, uint32_t, 4s, u32, q) -__ST3_FUNC (uint64x2x3_t, uint64_t, 2d, u64, q) - #define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ lnsuffix, funcsuffix, Q) \ __extension__ static __inline void \ @@ -19044,54 +18796,6 @@ __ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) __ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) __ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) -#define __ST4_FUNC(intype, ptrtype, regsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst4 ## Q ## _ ## funcsuffix (ptrtype *ptr, intype b) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "st4 {v16." #regsuffix " - v19." 
#regsuffix "}, %0\n\t" \
-	     :"=Q"(*(intype *)ptr) \
-	     :"Q"(b) \
-	     :"v16", "v17", "v18", "v19", "memory"); \
-  }
-#define __ST4_64x1_FUNC(intype, ptrtype, funcsuffix) \
-  __extension__ static __inline void \
-  __attribute__ ((__always_inline__)) \
-  vst4_ ## funcsuffix (ptrtype *ptr, intype b) \
-  { \
-    __asm__ ("ld1 {v16.1d - v19.1d}, %1\n\t" \
-	     "st1 {v16.1d - v19.1d}, %0\n\t" \
-	     :"=Q"(*(intype *)ptr) \
-	     :"Q"(b) \
-	     :"v16", "v17", "v18", "v19", "memory"); \
-  }
-
-__ST4_FUNC (float32x2x4_t, float32_t, 2s, f32,)
-__ST4_64x1_FUNC (float64x1x4_t, float64_t, f64)
-__ST4_FUNC (poly8x8x4_t, poly8_t, 8b, p8,)
-__ST4_FUNC (poly16x4x4_t, poly16_t, 4h, p16,)
-__ST4_FUNC (int8x8x4_t, int8_t, 8b, s8,)
-__ST4_FUNC (int16x4x4_t, int16_t, 4h, s16,)
-__ST4_FUNC (int32x2x4_t, int32_t, 2s, s32,)
-__ST4_64x1_FUNC (int64x1x4_t, int64_t, s64)
-__ST4_FUNC (uint8x8x4_t, uint8_t, 8b, u8,)
-__ST4_FUNC (uint16x4x4_t, uint16_t, 4h, u16,)
-__ST4_FUNC (uint32x2x4_t, uint32_t, 2s, u32,)
-__ST4_64x1_FUNC (uint64x1x4_t, uint64_t, u64)
-__ST4_FUNC (float32x4x4_t, float32_t, 4s, f32, q)
-__ST4_FUNC (float64x2x4_t, float64_t, 2d, f64, q)
-__ST4_FUNC (poly8x16x4_t, poly8_t, 16b, p8, q)
-__ST4_FUNC (poly16x8x4_t, poly16_t, 8h, p16, q)
-__ST4_FUNC (int8x16x4_t, int8_t, 16b, s8, q)
-__ST4_FUNC (int16x8x4_t, int16_t, 8h, s16, q)
-__ST4_FUNC (int32x4x4_t, int32_t, 4s, s32, q)
-__ST4_FUNC (int64x2x4_t, int64_t, 2d, s64, q)
-__ST4_FUNC (uint8x16x4_t, uint8_t, 16b, u8, q)
-__ST4_FUNC (uint16x8x4_t, uint16_t, 8h, u16, q)
-__ST4_FUNC (uint32x4x4_t, uint32_t, 4s, u32, q)
-__ST4_FUNC (uint64x2x4_t, uint64_t, 2d, u64, q)
-
 #define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \
 		lnsuffix, funcsuffix, Q) \
   __extension__ static __inline void \
@@ -20896,6 +20600,872 @@ vdupd_lane_u64 (uint64x2_t a, int const b)
   return (uint64x1_t) __builtin_aarch64_dup_lanedi ((int64x2_t) a, b);
 }
 
+/* vldn */
+
+__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+vld2_s64 (const int64_t * __a)
+{
+  int64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
+}
+
+__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+vld2_u64 (const uint64_t * __a)
+{
+  uint64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1);
+  return ret;
+}
+
+__extension__ static __inline float64x1x2_t __attribute__ ((__always_inline__))
+vld2_f64 (const float64_t * __a)
+{
+  float64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) __builtin_aarch64_get_dregoidf (__o, 0);
+  ret.val[1] = (float64x1_t) __builtin_aarch64_get_dregoidf (__o, 1);
+  return ret;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_s8 (const int8_t * __a)
+{
+  int8x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a);
+  ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0);
+  ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1);
+  return ret;
+}
+
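As an illustrative aside (not part of the diff): each of the new vld2 wrappers issues a single LD2 structure load into an opaque register pair (__builtin_aarch64_simd_oi, backed by the new OI mode) and then picks the two D registers apart. A minimal usage sketch, with invented function and buffer names, assuming the usual vst1_s8 contiguous store from the same header:

    /* Split 8 interleaved left/right sample pairs into two planes.
       vld2_s8 loads 16 bytes and deinterleaves even/odd elements, so
       lr.val[0] holds the 8 left samples and lr.val[1] the 8 right
       ones -- one ld2 instruction instead of loads plus uzp1/uzp2.  */
    void
    split_stereo (const int8_t *samples, int8_t *left, int8_t *right)
    {
      int8x8x2_t lr = vld2_s8 (samples);
      vst1_s8 (left, lr.val[0]);
      vst1_s8 (right, lr.val[1]);
    }

+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))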
+vld2_p8 (const poly8_t * __a) +{ + poly8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; +} + +__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) +vld2_s16 (const int16_t * __a) +{ + int16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} + +__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) +vld2_p16 (const poly16_t * __a) +{ + poly16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} + +__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) +vld2_s32 (const int32_t * __a) +{ + int32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; +} + +__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) +vld2_u8 (const uint8_t * __a) +{ + uint8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; +} + +__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) +vld2_u16 (const uint16_t * __a) +{ + uint16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} + +__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) +vld2_u32 (const uint32_t * __a) +{ + uint32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; +} + +__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) +vld2_f32 (const float32_t * __a) +{ + float32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); + return ret; +} + +__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) +vld2q_s8 (const int8_t * __a) +{ + int8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return 
ret; +} + +__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) +vld2q_p8 (const poly8_t * __a) +{ + poly8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; +} + +__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) +vld2q_s16 (const int16_t * __a) +{ + int16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) +vld2q_p16 (const poly16_t * __a) +{ + poly16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) +vld2q_s32 (const int32_t * __a) +{ + int32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__)) +vld2q_s64 (const int64_t * __a) +{ + int64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; +} + +__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) +vld2q_u8 (const uint8_t * __a) +{ + uint8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; +} + +__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) +vld2q_u16 (const uint16_t * __a) +{ + uint16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) +vld2q_u32 (const uint32_t * __a) +{ + uint32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__)) +vld2q_u64 (const uint64_t * __a) +{ + uint64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) 
__builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; +} + +__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) +vld2q_f32 (const float32_t * __a) +{ + float32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1); + return ret; +} + +__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__)) +vld2q_f64 (const float64_t * __a) +{ + float64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1); + return ret; +} + +__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__)) +vld3_s64 (const int64_t * __a) +{ + int64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return ret; +} + +__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__)) +vld3_u64 (const uint64_t * __a) +{ + uint64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return ret; +} + +__extension__ static __inline float64x1x3_t __attribute__ ((__always_inline__)) +vld3_f64 (const float64_t * __a) +{ + float64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidf (__o, 0); + ret.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidf (__o, 1); + ret.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidf (__o, 2); + return ret; +} + +__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__)) +vld3_s8 (const int8_t * __a) +{ + int8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + +__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__)) +vld3_p8 (const poly8_t * __a) +{ + poly8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + +__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__)) +vld3_s16 (const int16_t * __a) +{ + int16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); + 
ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__)) +vld3_p16 (const poly16_t * __a) +{ + poly16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__)) +vld3_s32 (const int32_t * __a) +{ + int32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; +} + +__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__)) +vld3_u8 (const uint8_t * __a) +{ + uint8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + +__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__)) +vld3_u16 (const uint16_t * __a) +{ + uint16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__)) +vld3_u32 (const uint32_t * __a) +{ + uint32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; +} + +__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) +vld3_f32 (const float32_t * __a) +{ + float32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); + return ret; +} + +__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__)) +vld3q_s8 (const int8_t * __a) +{ + int8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return 
ret; +} + +__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__)) +vld3q_p8 (const poly8_t * __a) +{ + poly8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; +} + +__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__)) +vld3q_s16 (const int16_t * __a) +{ + int16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__)) +vld3q_p16 (const poly16_t * __a) +{ + poly16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__)) +vld3q_s32 (const int32_t * __a) +{ + int32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__)) +vld3q_s64 (const int64_t * __a) +{ + int64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return ret; +} + +__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__)) +vld3q_u8 (const uint8_t * __a) +{ + uint8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; +} + +__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__)) +vld3q_u16 (const uint16_t * __a) +{ + uint16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__)) +vld3q_u32 (const uint32_t * __a) +{ + uint32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4si 
((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
+  ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
+  return ret;
+}
+
+__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
+vld3q_u64 (const uint64_t * __a)
+{
+  uint64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
+  ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
+  return ret;
+}
+
+__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+vld3q_f32 (const float32_t * __a)
+{
+  float32x4x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
+  ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
+  return ret;
+}
+
+__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
+vld3q_f64 (const float64_t * __a)
+{
+  float64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
+  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
+  return ret;
+}
+
+__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
+vld4_s64 (const int64_t * __a)
+{
+  int64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
+  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
+  ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
+  return ret;
+}
+
+__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
+vld4_u64 (const uint64_t * __a)
+{
+  uint64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1);
+  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2);
+  ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3);
+  return ret;
+}
+
+__extension__ static __inline float64x1x4_t __attribute__ ((__always_inline__))
+vld4_f64 (const float64_t * __a)
+{
+  float64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) __builtin_aarch64_get_dregxidf (__o, 0);
+  ret.val[1] = (float64x1_t) __builtin_aarch64_get_dregxidf (__o, 1);
+  ret.val[2] = (float64x1_t) __builtin_aarch64_get_dregxidf (__o, 2);
+  ret.val[3] = (float64x1_t) __builtin_aarch64_get_dregxidf (__o, 3);
+  return ret;
+}
+
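As an illustrative aside (not part of the diff): vld3 and vld4 follow the same pattern through the wider CI (three-register) and XI (four-register) opaque modes. A minimal sketch of a typical vld3/vst3 round trip, with invented names, assuming the vst3_u8 added further down in this hunk:

    /* Swap the R and B channels of 8 packed RGB pixels in place.
       vld3_u8 deinterleaves the 24 bytes into one D register per
       channel; vst3_u8 interleaves them again on the way out.  */
    void
    swap_red_blue (uint8_t *rgb)
    {
      uint8x8x3_t px = vld3_u8 (rgb);
      uint8x8_t tmp = px.val[0];
      px.val[0] = px.val[2];
      px.val[2] = tmp;
      vst3_u8 (rgb, px);
    }

+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_s8 (const int8_t * __a)
+{
+  int8x8x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v8qi ((const 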
__builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__)) +vld4_p8 (const poly8_t * __a) +{ + poly8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__)) +vld4_s16 (const int16_t * __a) +{ + int16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__)) +vld4_p16 (const poly16_t * __a) +{ + poly16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__)) +vld4_s32 (const int32_t * __a) +{ + int32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; +} + +__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__)) +vld4_u8 (const uint8_t * __a) +{ + uint8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__)) +vld4_u16 (const uint16_t * __a) +{ + uint16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (uint16x4_t) 
__builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__)) +vld4_u32 (const uint32_t * __a) +{ + uint32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; +} + +__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) +vld4_f32 (const float32_t * __a) +{ + float32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); + ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); + return ret; +} + +__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__)) +vld4q_s8 (const int8_t * __a) +{ + int8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__)) +vld4q_p8 (const poly8_t * __a) +{ + poly8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__)) +vld4q_s16 (const int16_t * __a) +{ + int16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__)) +vld4q_p16 (const poly16_t * __a) +{ + poly16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__)) +vld4q_s32 (const int32_t * __a) +{ + int32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = 
__builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__)) +vld4q_s64 (const int64_t * __a) +{ + int64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; +} + +__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__)) +vld4q_u8 (const uint8_t * __a) +{ + uint8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__)) +vld4q_u16 (const uint16_t * __a) +{ + uint16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__)) +vld4q_u32 (const uint32_t * __a) +{ + uint32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__)) +vld4q_u64 (const uint64_t * __a) +{ + uint64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; +} + +__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) +vld4q_f32 (const float32_t * __a) +{ + float32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1); + ret.val[2] = (float32x4_t) 
__builtin_aarch64_get_qregxiv4sf (__o, 2);
+  ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
+  return ret;
+}
+
+__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
+vld4q_f64 (const float64_t * __a)
+{
+  float64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
+  ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
+  ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
+  return ret;
+}
+
 /* vmax */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -23794,6 +24364,872 @@ vsrid_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
   return (uint64x1_t) __builtin_aarch64_usri_ndi (__a, __b, __c);
 }
 
+/* vstn */
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s64 (int64_t * __a, int64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u64 (uint64_t * __a, uint64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f64 (float64_t * __a, float64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s8 (int8_t * __a, int8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p8 (poly8_t * __a, poly8x8x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
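As an illustrative aside (not part of the diff): the D-register vstn variants above first widen each 64-bit input with vcombine_*/vcreate_* (zeroing the unused high half), because the OI/CI/XI tuple modes are built from Q registers; the store instruction's 64-bit arrangement then writes only the low halves to memory. A minimal usage sketch with invented names:

    /* Interleave two int32x2_t vectors: out[] becomes x0 y0 x1 y1.
       The zero high halves that vst2_s32 sets up internally never
       reach memory -- st2 {vA.2s, vB.2s} stores just the four
       low-half elements (16 bytes).  */
    void
    interleave2 (int32_t *out, int32x2_t x, int32x2_t y)
    {
      int32x2x2_t v;
      v.val[0] = x;
      v.val[1] = y;
      vst2_s32 (out, v);
    }

+__extension__ static __inline void __attribute__ 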
((__always_inline__)) +vst2_s16 (int16_t * __a, int16x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_p16 (poly16_t * __a, poly16x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s32 (int32_t * __a, int32x2x2_t val) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (INT64_C (0))); + temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u8 (uint8_t * __a, uint8x8x2_t val) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u16 (uint16_t * __a, uint16x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u32 (uint32_t * __a, uint32x2x2_t val) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_f32 (float32_t * __a, float32x2x2_t val) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4sf 
(__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s8 (int8_t * __a, int8x16x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_p8 (poly8_t * __a, poly8x16x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s16 (int16_t * __a, int16x8x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_p16 (poly16_t * __a, poly16x8x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s32 (int32_t * __a, int32x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s64 (int64_t * __a, int64x2x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u8 (uint8_t * __a, uint8x16x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u16 (uint16_t * __a, uint16x8x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_u32 (uint32_t * __a, uint32x4x2_t val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __builtin_aarch64_st2v4si 
((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f32 (float32_t * __a, float32x4x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f64 (float64_t * __a, float64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s64 (int64_t * __a, int64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int64x2x3_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u64 (uint64_t * __a, uint64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  uint64x2x3_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f64 (float64_t * __a, float64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  float64x2x3_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
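As an illustrative aside (not part of the diff): together the vldn/vstn pairs give a structure-of-arrays view of interleaved data, so one field of a packed record can be rewritten without touching the others. A minimal sketch with invented names, using the vld3_f32/vst3_f32 pair from this hunk:

    /* Scale only the y component of two interleaved xyz triples.
       vld3_f32 splits the fields into separate vectors and vst3_f32
       re-interleaves them; x and z pass through untouched.  */
    void
    scale_y (float32_t *xyz, float32x2_t s)
    {
      float32x2x3_t v = vld3_f32 (xyz);
      v.val[1] = vmul_f32 (v.val[1], s);
      vst3_f32 (xyz, v);
    }

+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s8 (int8_t * __a, int8x8x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  int8x16x3_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) 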
temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_p8 (poly8_t * __a, poly8x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + poly8x16x3_t temp; + temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (UINT64_C (0))); + temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (UINT64_C (0))); + temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s16 (int16_t * __a, int16x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + int16x8x3_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (INT64_C (0))); + temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (INT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_p16 (poly16_t * __a, poly16x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + poly16x8x3_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (UINT64_C (0))); + temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_s32 (int32_t * __a, int32x2x3_t val) +{ + __builtin_aarch64_simd_ci __o; + int32x4x3_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (INT64_C (0))); + temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (INT64_C (0))); + temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u8 (uint8_t * __a, uint8x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + uint8x16x3_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (UINT64_C (0))); + temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u16 (uint16_t * __a, uint16x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + uint16x8x3_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (UINT64_C (0))); + temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_u32 (uint32_t * __a, uint32x2x3_t val) +{ + __builtin_aarch64_simd_ci __o; + uint32x4x3_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (UINT64_C (0))); + temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_f32 (float32_t * __a, float32x2x3_t val) +{ + __builtin_aarch64_simd_ci __o; + float32x4x3_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (UINT64_C (0))); + temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s8 (int8_t * __a, int8x16x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_p8 (poly8_t * __a, poly8x16x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s16 (int16_t * __a, int16x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi 
((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_p16 (poly16_t * __a, poly16x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s32 (int32_t * __a, int32x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s64 (int64_t * __a, int64x2x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u8 (uint8_t * __a, uint8x16x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u16 (uint16_t * __a, uint16x8x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u32 (uint32_t * __a, uint32x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_u64 (uint64_t * __a, uint64x2x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_f32 (float32_t * __a, float32x4x3_t val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); + __o = 
+  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f64 (float64_t * __a, float64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
+  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s64 (int64_t * __a, int64x1x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  int64x2x4_t temp;
+  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (INT64_C (0)));
+  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (INT64_C (0)));
+  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (INT64_C (0)));
+  temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
+  __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u64 (uint64_t * __a, uint64x1x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  uint64x2x4_t temp;
+  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (UINT64_C (0)));
+  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (UINT64_C (0)));
+  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (UINT64_C (0)));
+  temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
+  __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f64 (float64_t * __a, float64x1x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  float64x2x4_t temp;
+  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (UINT64_C (0)));
+  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (UINT64_C (0)));
+  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (UINT64_C (0)));
+  temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3);
+  __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s8 (int8_t * __a, int8x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  int8x16x4_t temp;
+  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (INT64_C (0)));
+  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (INT64_C (0)));
+  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (INT64_C (0)));
+  temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
+  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p8 (poly8_t * __a, poly8x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  poly8x16x4_t temp;
+  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (UINT64_C (0)));
+  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (UINT64_C (0)));
+  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (UINT64_C (0)));
+  temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
+  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s16 (int16_t * __a, int16x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  int16x8x4_t temp;
+  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (INT64_C (0)));
+  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (INT64_C (0)));
+  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (INT64_C (0)));
+  temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
+  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p16 (poly16_t * __a, poly16x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  poly16x8x4_t temp;
+  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (UINT64_C (0)));
+  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (UINT64_C (0)));
+  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (UINT64_C (0)));
+  temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
+  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s32 (int32_t * __a, int32x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  int32x4x4_t temp;
+  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (INT64_C (0)));
+  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (INT64_C (0)));
+  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (INT64_C (0)));
+  temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (INT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
+  __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u8 (uint8_t * __a, uint8x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  uint8x16x4_t temp;
+  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (UINT64_C (0)));
+  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (UINT64_C (0)));
+  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (UINT64_C (0)));
+  temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
+  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u16 (uint16_t * __a, uint16x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  uint16x8x4_t temp;
+  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (UINT64_C (0)));
+  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (UINT64_C (0)));
+  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (UINT64_C (0)));
+  temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
+  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u32 (uint32_t * __a, uint32x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  uint32x4x4_t temp;
+  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (UINT64_C (0)));
+  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (UINT64_C (0)));
+  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (UINT64_C (0)));
+  temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
+  __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f32 (float32_t * __a, float32x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  float32x4x4_t temp;
+  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (UINT64_C (0)));
+  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (UINT64_C (0)));
+  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (UINT64_C (0)));
+  temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3);
+  __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s8 (int8_t * __a, int8x16x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s16 (int16_t * __a, int16x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s32 (int32_t * __a, int32x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s64 (int64_t * __a, int64x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
+  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
+  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
+  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f32 (float32_t * __a, float32x4x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
+  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f64 (float64_t * __a, float64x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
+  __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
+}
+
 /* vsub */
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index da50a4738..fe613070f 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -126,6 +126,13 @@
       (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
						   PARALLEL, 1)")))
 
+(define_memory_constraint "Utv"
+  "@internal
+ An address valid for loading/storing opaque structure
+ types wider than TImode."
+  (and (match_code "mem")
+       (match_test "aarch64_simd_mem_operand_p (op)")))
+
 (define_constraint "Dn"
   "@internal
  A constraint that matches vector of immediates."
@@ -147,3 +154,14 @@
  (and (match_code "const_vector")
       (match_test "aarch64_simd_shift_imm_p (op, GET_MODE (op),
						 false)")))
+
+(define_constraint "Dz"
+  "@internal
+ A constraint that matches a vector of immediate zeros."
+  (and (match_code "const_vector")
+       (match_test "aarch64_simd_imm_zero_p (op, GET_MODE (op))")))
+
+(define_constraint "Dd"
+  "@internal
+ A constraint that matches an immediate operand valid for an AdvSIMD scalar operation."
+  (and (match_code "const_int")
+       (match_test "aarch64_simd_imm_scalar_p (op, GET_MODE (op))")))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9cecfca48..bf2041e78 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -110,6 +110,9 @@
 ;; Double vector modes for combines.
 (define_mode_iterator VDC [V8QI V4HI V2SI V2SF DI DF])
 
+;; Double vector integer modes for combines.
+(define_mode_iterator VDIC [V8QI V4HI V2SI])
+
 ;; Double vector modes.
 (define_mode_iterator VD_RE [V8QI V4HI V2SI DI DF V2SF])
 
@@ -139,6 +142,12 @@
 
 (define_mode_iterator TX [TI TF])
 
+;; Opaque structure modes.
+(define_mode_iterator VSTRUCT [OI CI XI])
+
+;; Double scalar modes.
+(define_mode_iterator DX [DI DF])
+
 ;; ------------------------------------------------------------------
 ;; Unspec enumerations for Advance SIMD. These could well go into
 ;; aarch64.md but for their use in int_iterators here.
@@ -273,6 +282,7 @@
 (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
			  (V4HI "4h") (V8HI  "8h")
			  (V2SI "2s") (V4SI  "4s")
+			  (DI   "1d") (DF    "1d")
			  (V2DI "2d") (V2SF "2s")
			  (V4SF "4s") (V2DF "2d")])
 
@@ -411,6 +421,33 @@
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
		        (V2SI "w") (V4SI "w") (SI "w")])
 
+(define_mode_attr Vendreg [(OI "T") (CI "U") (XI "V")])
+
+(define_mode_attr nregs [(OI "2") (CI "3") (XI "4")])
+
+(define_mode_attr VRL2 [(V8QI "V32QI") (V4HI "V16HI")
+			(V2SI "V8SI") (V2SF "V8SF")
+			(DI "V4DI") (DF "V4DF")
+			(V16QI "V32QI") (V8HI "V16HI")
+			(V4SI "V8SI") (V4SF "V8SF")
+			(V2DI "V4DI") (V2DF "V4DF")])
+
+(define_mode_attr VRL3 [(V8QI "V48QI") (V4HI "V24HI")
+			(V2SI "V12SI") (V2SF "V12SF")
+			(DI "V6DI") (DF "V6DF")
+			(V16QI "V48QI") (V8HI "V24HI")
+			(V4SI "V12SI") (V4SF "V12SF")
+			(V2DI "V6DI") (V2DF "V6DF")])
+
+(define_mode_attr VRL4 [(V8QI "V64QI") (V4HI "V32HI")
+			(V2SI "V16SI") (V2SF "V16SF")
+			(DI "V8DI") (DF "V8DF")
+			(V16QI "V64QI") (V8HI "V32HI")
+			(V4SI "V16SI") (V4SF "V16SF")
+			(V2DI "V8DI") (V2DF "V8DF")])
+
+(define_mode_attr VSTRUCT_DREG [(OI "TI") (CI "EI") (XI "OI")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 328e5cf66..39339cc6e 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -265,3 +265,33 @@
 {
   return aarch64_simd_shift_imm_p (op, mode, false);
 })
+
+(define_predicate "aarch64_simd_reg_or_zero"
+  (and (match_code "reg,subreg,const_int,const_vector")
+       (ior (match_operand 0 "register_operand")
+	    (ior (match_test "op == const0_rtx")
+		 (match_test "aarch64_simd_imm_zero_p (op, mode)")))))
+
+(define_predicate "aarch64_simd_struct_operand"
+  (and (match_code "mem")
+       (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)")))
+
+;; Like general_operand but allow only valid SIMD addressing modes.
+(define_predicate "aarch64_simd_general_operand"
+  (and (match_operand 0 "general_operand")
+       (match_test "!MEM_P (op)
+		   || GET_CODE (XEXP (op, 0)) == POST_INC
+		   || GET_CODE (XEXP (op, 0)) == REG")))
+
+;; Like nonimmediate_operand but allow only valid SIMD addressing modes.
+(define_predicate "aarch64_simd_nonimmediate_operand"
+  (and (match_operand 0 "nonimmediate_operand")
+       (match_test "!MEM_P (op)
+		   || GET_CODE (XEXP (op, 0)) == POST_INC
+		   || GET_CODE (XEXP (op, 0)) == REG")))
+
+(define_special_predicate "aarch64_simd_imm_zero"
+  (match_code "const_vector")
+{
+  return aarch64_simd_imm_zero_p (op, mode);
+})
diff --git a/gcc/testsuite/ChangeLog.aarch64 b/gcc/testsuite/ChangeLog.aarch64
index 2a4295ada..5b577d5ba 100644
--- a/gcc/testsuite/ChangeLog.aarch64
+++ b/gcc/testsuite/ChangeLog.aarch64
@@ -1,3 +1,9 @@
+2012-09-25  Tejas Belagod
+
+	* lib/target-supports.exp (check_effective_target_vect_stridedN):
+	Enable support for strided loads and stores for aarch64.
+
 2012-09-18  Ian Bolton
 
 	* gcc.target/aarch64/clrsb.c: New test.
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index abb6cbb16..bea6b2924 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3762,7 +3762,8 @@ foreach N {2 3 4 8} {
	     && [check_effective_target_vect_extract_even_odd] } {
	    set et_vect_stridedN_saved 1
	}
-	if { [istarget arm*-*-*] && N >= 2 && N <= 4 } {
+	if { ([istarget arm*-*-*]
+	      || [istarget aarch64*-*-*]) && N >= 2 && N <= 4 } {
	    set et_vect_stridedN_saved 1
	}
    }
--
cgit v1.2.3
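
Illustration (not part of the patch): each of the vst3/vst4 intrinsics above
now builds an opaque CI/XI register-list value and hands it to a single
builtin, so one call compiles to one ST3/ST4 structure store. A minimal
sketch, assuming a compiler whose arm_neon.h contains these definitions; the
function name and the planar-RGB layout are hypothetical, and the scalar tail
(n not a multiple of 16) is omitted for brevity.

#include <arm_neon.h>
#include <stdint.h>

/* Interleave 16-byte blocks of R, G and B into r0 g0 b0 r1 g1 b1 ...  */
void
interleave_rgb (uint8_t *dst, uint8_t *r, uint8_t *g, uint8_t *b, int n)
{
  int i;
  for (i = 0; i + 16 <= n; i += 16)
    {
      uint8x16x3_t pix;
      pix.val[0] = vld1q_u8 (r + i);	/* 16 red bytes.  */
      pix.val[1] = vld1q_u8 (g + i);	/* 16 green bytes.  */
      pix.val[2] = vld1q_u8 (b + i);	/* 16 blue bytes.  */
      vst3q_u8 (dst + 3 * i, pix);	/* One ST3: 48 interleaved bytes.  */
    }
}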
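
The target-supports.exp hunk advertises strided (interleaved) accesses as
vectorizable on aarch64, so the vect_stridedN tests now run for this target.
A loop of the following shape is the kind this covers: the three stride-3
output streams form one interleave group that, with the vec_store_lanes
patterns added by this patch, can be emitted as ST3. A sketch only; the
function name is made up, and vectorization depends on the usual options
(e.g. -O3 with SIMD enabled).

#include <stddef.h>

/* Stride-3 interleaved store: expand one grayscale stream to RGB.  */
void
gray_to_rgb (unsigned char *rgb, const unsigned char *gray, size_t n)
{
  size_t i;
  for (i = 0; i < n; i++)
    {
      rgb[3 * i + 0] = gray[i];
      rgb[3 * i + 1] = gray[i];
      rgb[3 * i + 2] = gray[i];
    }
}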