diff options
author | Caio Oliveira <caio.oliveira@intel.com> | 2024-02-17 22:31:45 -0800 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2024-02-28 05:45:38 +0000 |
commit | c793644ce9f332efa6dade415d56d3c876bcb11b (patch) | |
tree | 6bed8960b1bc5320aee9b06fbeec41c8b1e27b63 | |
parent | e0d767f5fefb6d86d64d6599a674e8ec86f27d30 (diff) | |
download | mesa3d-c793644ce9f332efa6dade415d56d3c876bcb11b.tar.gz |
intel/brw: Remove Gfx8- code from SIMD lowering
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
-rw-r--r-- | src/intel/compiler/brw_fs_lower_simd_width.cpp | 249 |
1 file changed, 14 insertions, 235 deletions
diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp index f54ccba9c65..0a94e9ab3fb 100644 --- a/src/intel/compiler/brw_fs_lower_simd_width.cpp +++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp @@ -96,93 +96,13 @@ get_fpu_lowered_simd_width(const fs_visitor *shader, if (reg_count > max_reg_count) max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count)); - /* According to the IVB PRMs: - * "When destination spans two registers, the source MUST span two - * registers. The exception to the above rule: - * - * - When source is scalar, the source registers are not incremented. - * - When source is packed integer Word and destination is packed - * integer DWord, the source register is not incremented but the - * source sub register is incremented." - * - * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning - * restrictions. The code below intentionally doesn't check whether the - * destination type is integer because empirically the hardware doesn't - * seem to care what the actual type is as long as it's dword-aligned. - * - * HSW PRMs also add a note to the second exception: - * "When lower 8 channels are disabled, the sub register of source1 - * operand is not incremented. If the lower 8 channels are expected - * to be disabled, say by predication, the instruction must be split - * into pair of simd8 operations." - * - * We can't reliably know if the channels won't be disabled due to, - * for example, IMASK. So, play it safe and disallow packed-word exception - * for src1. - */ - if (devinfo->ver < 8) { - for (unsigned i = 0; i < inst->sources; i++) { - /* IVB implements DF scalars as <0;2,1> regions. 
*/ - const bool is_scalar_exception = is_uniform(inst->src[i]) && - (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8); - const bool is_packed_word_exception = i != 1 && - type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 && - type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1; - - /* We check size_read(i) against size_written instead of REG_SIZE - * because we want to properly handle SIMD32. In SIMD32, you can end - * up with writes to 4 registers and a source that reads 2 registers - * and we may still need to lower all the way to SIMD8 in that case. - */ - if (inst->size_written > REG_SIZE && - inst->size_read(i) != 0 && - inst->size_read(i) < inst->size_written && - !is_scalar_exception && !is_packed_word_exception) { - const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE); - max_width = MIN2(max_width, inst->exec_size / reg_count); - } - } - } - - if (devinfo->ver < 6) { - /* From the G45 PRM, Volume 4 Page 361: - * - * "Operand Alignment Rule: With the exceptions listed below, a - * source/destination operand in general should be aligned to even - * 256-bit physical register with a region size equal to two 256-bit - * physical registers." - * - * Normally we enforce this by allocating virtual registers to the - * even-aligned class. But we need to handle payload registers. - */ - for (unsigned i = 0; i < inst->sources; i++) { - if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) && - inst->size_read(i) > REG_SIZE) { - max_width = MIN2(max_width, 8); - } - } - } - - /* From the IVB PRMs: - * "When an instruction is SIMD32, the low 16 bits of the execution mask - * are applied for both halves of the SIMD32 instruction. If different - * execution mask channels are required, split the instruction into two - * SIMD16 instructions." - * - * There is similar text in the HSW PRMs. Gfx4-6 don't even implement - * 32-wide control flow support in hardware and will behave similarly. 
- */ - if (devinfo->ver < 8 && !inst->force_writemask_all) - max_width = MIN2(max_width, 16); - /* From the IVB PRMs (applies to HSW too): * "Instructions with condition modifiers must not use SIMD32." * * From the BDW PRMs (applies to later hardware too): * "Ternary instruction with condition modifiers must not use SIMD32." */ - if (inst->conditional_mod && (devinfo->ver < 8 || - (inst->is_3src(compiler) && devinfo->ver < 12))) + if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12) max_width = MIN2(max_width, 16); /* From the IVB PRMs (applies to other devices that don't have the @@ -193,41 +113,6 @@ get_fpu_lowered_simd_width(const fs_visitor *shader, if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src) max_width = MIN2(max_width, inst->exec_size / reg_count); - /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is - * the 8-bit quarter of the execution mask signals specified in the - * instruction control fields) for the second compressed half of any - * single-precision instruction (for double-precision instructions - * it's hardwired to use NibCtrl+1, at least on HSW), which means that - * the EU will apply the wrong execution controls for the second - * sequential GRF write if the number of channels per GRF is not exactly - * eight in single-precision mode (or four in double-float mode). - * - * In this situation we calculate the maximum size of the split - * instructions so they only ever write to a single register. - */ - if (devinfo->ver < 8 && inst->size_written > REG_SIZE && - !inst->force_writemask_all) { - const unsigned channels_per_grf = inst->exec_size / - DIV_ROUND_UP(inst->size_written, REG_SIZE); - const unsigned exec_type_size = get_exec_type_size(inst); - assert(exec_type_size); - - /* The hardware shifts exactly 8 channels per compressed half of the - * instruction in single-precision mode and exactly 4 in double-precision. - */ - if (channels_per_grf != (exec_type_size == 8 ? 
4 : 8)) - max_width = MIN2(max_width, channels_per_grf); - - /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT - * because HW applies the same channel enable signals to both halves of - * the compressed instruction which will be just wrong under - * non-uniform control flow. - */ - if (devinfo->verx10 == 70 && - (exec_type_size == 8 || type_sz(inst->dst.type) == 8)) - max_width = MIN2(max_width, 4); - } - /* From the SKL PRM, Special Restrictions for Handling Mixed Mode * Float Operations: * @@ -292,24 +177,10 @@ get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, inst->components_read(TEX_LOGICAL_SRC_MIN_LOD)) return devinfo->ver < 20 ? 8 : 16; - /* Calculate the number of coordinate components that have to be present - * assuming that additional arguments follow the texel coordinates in the - * message payload. On IVB+ there is no need for padding, on ILK-SNB we - * need to pad to four or three components depending on the message, - * pre-ILK we need to pad to at most three components. - */ - const unsigned req_coord_components = - (devinfo->ver >= 7 || - !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 : - (devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL && - inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 : - 3; - /* On Gfx9+ the LOD argument is for free if we're able to use the LZ * variant of the TXL or TXF message. */ - const bool implicit_lod = devinfo->ver >= 9 && - (inst->opcode == SHADER_OPCODE_TXL || + const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL || inst->opcode == SHADER_OPCODE_TXF) && inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); @@ -317,8 +188,7 @@ get_sampler_lowered_simd_width(const struct intel_device_info *devinfo, * to the sampler unit. 
*/ const unsigned num_payload_components = - MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE), - req_coord_components) + + inst->components_read(TEX_LOGICAL_SRC_COORDINATE) + inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) + (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) + inst->components_read(TEX_LOGICAL_SRC_LOD2) + @@ -386,32 +256,10 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) case SHADER_OPCODE_SEL_EXEC: case SHADER_OPCODE_CLUSTER_BROADCAST: case SHADER_OPCODE_MOV_RELOC_IMM: - return get_fpu_lowered_simd_width(shader, inst); - - case BRW_OPCODE_CMP: { - /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that - * when the destination is a GRF the dependency-clear bit on the flag - * register is cleared early. - * - * Suggested workarounds are to disable coissuing CMP instructions - * or to split CMP(16) instructions into two CMP(8) instructions. - * - * We choose to split into CMP(8) instructions since disabling - * coissuing would affect CMP instructions not otherwise affected by - * the errata. - */ - const unsigned max_width = (devinfo->verx10 == 70 && - !inst->dst.is_null() ? 8 : ~0); - return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst)); - } + case BRW_OPCODE_CMP: case BRW_OPCODE_BFI1: case BRW_OPCODE_BFI2: - /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we - * should - * "Force BFI instructions to be executed always in SIMD8." - */ - return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u, - get_fpu_lowered_simd_width(shader, inst)); + return get_fpu_lowered_simd_width(shader, inst); case BRW_OPCODE_IF: assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); @@ -424,11 +272,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) case SHADER_OPCODE_LOG2: case SHADER_OPCODE_SIN: case SHADER_OPCODE_COS: { - /* Unary extended math instructions are limited to SIMD8 on Gfx4 and - * Gfx6. 
Extended Math Function is limited to SIMD8 with half-float. - */ - if (devinfo->ver == 6 || devinfo->verx10 == 40) - return MIN2(8, inst->exec_size); if (inst->dst.type == BRW_REGISTER_TYPE_HF) return MIN2(8, inst->exec_size); return MIN2(16, inst->exec_size); @@ -438,8 +281,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited * to SIMD8 with half-float */ - if (devinfo->ver < 7) - return MIN2(8, inst->exec_size); if (inst->dst.type == BRW_REGISTER_TYPE_HF) return MIN2(8, inst->exec_size); return MIN2(16, inst->exec_size); @@ -460,64 +301,20 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) case FS_OPCODE_INTERPOLATE_AT_SAMPLE: case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: - return MIN2(16, inst->exec_size); - case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: - /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch - * message used to implement varying pull constant loads, so expand it - * to SIMD16. An alternative with longer message payload length but - * shorter return payload would be to use the SIMD8 sampler message that - * takes (header, u, v, r) as parameters instead of (header, u). - */ - return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size)); - case FS_OPCODE_DDX_COARSE: case FS_OPCODE_DDX_FINE: case FS_OPCODE_DDY_COARSE: case FS_OPCODE_DDY_FINE: - /* The implementation of this virtual opcode may require emitting - * compressed Align16 instructions, which are severely limited on some - * generations. - * - * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register - * Region Restrictions): - * - * "In Align16 access mode, SIMD16 is not allowed for DW operations - * and SIMD8 is not allowed for DF operations." - * - * In this context, "DW operations" means "operations acting on 32-bit - * values", so it includes operations on floats. 
- * - * Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3 - * (Instruction Compression -> Rules and Restrictions): - * - * "A compressed instruction must be in Align1 access mode. Align16 - * mode instructions cannot be compressed." - * - * Similar text exists in the g45 PRM. - * - * Empirically, compressed align16 instructions using odd register - * numbers don't appear to work on Sandybridge either. - */ - return (devinfo->ver == 4 || devinfo->ver == 6 || - (devinfo->verx10 == 70) ? - MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); + return MIN2(16, inst->exec_size); case SHADER_OPCODE_MULH: /* MULH is lowered to the MUL/MACH sequence using the accumulator, which * is 8-wide on Gfx7+. */ - return (devinfo->ver >= 20 ? 16 : - devinfo->ver >= 7 ? 8 : - get_fpu_lowered_simd_width(shader, inst)); + return devinfo->ver >= 20 ? 16 : 8; case FS_OPCODE_FB_WRITE_LOGICAL: - /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them - * here. - */ - assert(devinfo->ver != 6 || - inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || - inst->exec_size == 8); /* Dual-source FB writes are unsupported in SIMD16 mode. */ return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? 8 : MIN2(16, inst->exec_size)); @@ -539,6 +336,10 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: + case SHADER_OPCODE_TXL_LOGICAL: + case FS_OPCODE_TXB_LOGICAL: + case SHADER_OPCODE_TXF_LOGICAL: + case SHADER_OPCODE_TXS_LOGICAL: return get_sampler_lowered_simd_width(devinfo, inst); /* On gfx12 parameters are fixed to 16-bit values and therefore they all @@ -553,26 +354,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) */ return devinfo->ver < 20 ? 
8 : 16; - case SHADER_OPCODE_TXL_LOGICAL: - case FS_OPCODE_TXB_LOGICAL: - /* Only one execution size is representable pre-ILK depending on whether - * the shadow reference argument is present. - */ - if (devinfo->ver == 4) - return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8; - else - return get_sampler_lowered_simd_width(devinfo, inst); - - case SHADER_OPCODE_TXF_LOGICAL: - case SHADER_OPCODE_TXS_LOGICAL: - /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD - * messages. Use SIMD16 instead. - */ - if (devinfo->ver == 4) - return 16; - else - return get_sampler_lowered_simd_width(devinfo, inst); - case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: @@ -585,13 +366,11 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: - return MIN2(16, inst->exec_size); - case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: - return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size); + return MIN2(16, inst->exec_size); case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: @@ -624,9 +403,9 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst) * the EU decompression logic not handling VxH indirect addressing * correctly. */ - const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE; + const unsigned max_size = 2 * REG_SIZE; /* Prior to Broadwell, we only have 8 address subregisters. */ - return MIN3(devinfo->ver >= 8 ? 16 : 8, + return MIN3(16, max_size / (inst->dst.stride * type_sz(inst->dst.type)), inst->exec_size); } |