aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Caio Oliveira <caio.oliveira@intel.com> 2024-02-17 22:31:45 -0800
committer Marge Bot <emma+marge@anholt.net> 2024-02-28 05:45:38 +0000
commit c793644ce9f332efa6dade415d56d3c876bcb11b (patch)
tree 6bed8960b1bc5320aee9b06fbeec41c8b1e27b63
parent e0d767f5fefb6d86d64d6599a674e8ec86f27d30 (diff)
download mesa3d-c793644ce9f332efa6dade415d56d3c876bcb11b.tar.gz
intel/brw: Remove Gfx8- code from SIMD lowering
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
-rw-r--r-- src/intel/compiler/brw_fs_lower_simd_width.cpp 249
1 files changed, 14 insertions, 235 deletions
diff --git a/src/intel/compiler/brw_fs_lower_simd_width.cpp b/src/intel/compiler/brw_fs_lower_simd_width.cpp
index f54ccba9c65..0a94e9ab3fb 100644
--- a/src/intel/compiler/brw_fs_lower_simd_width.cpp
+++ b/src/intel/compiler/brw_fs_lower_simd_width.cpp
@@ -96,93 +96,13 @@ get_fpu_lowered_simd_width(const fs_visitor *shader,
if (reg_count > max_reg_count)
max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
- /* According to the IVB PRMs:
- * "When destination spans two registers, the source MUST span two
- * registers. The exception to the above rule:
- *
- * - When source is scalar, the source registers are not incremented.
- * - When source is packed integer Word and destination is packed
- * integer DWord, the source register is not incremented but the
- * source sub register is incremented."
- *
- * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
- * restrictions. The code below intentionally doesn't check whether the
- * destination type is integer because empirically the hardware doesn't
- * seem to care what the actual type is as long as it's dword-aligned.
- *
- * HSW PRMs also add a note to the second exception:
- * "When lower 8 channels are disabled, the sub register of source1
- * operand is not incremented. If the lower 8 channels are expected
- * to be disabled, say by predication, the instruction must be split
- * into pair of simd8 operations."
- *
- * We can't reliably know if the channels won't be disabled due to,
- * for example, IMASK. So, play it safe and disallow packed-word exception
- * for src1.
- */
- if (devinfo->ver < 8) {
- for (unsigned i = 0; i < inst->sources; i++) {
- /* IVB implements DF scalars as <0;2,1> regions. */
- const bool is_scalar_exception = is_uniform(inst->src[i]) &&
- (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
- const bool is_packed_word_exception = i != 1 &&
- type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
- type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
-
- /* We check size_read(i) against size_written instead of REG_SIZE
- * because we want to properly handle SIMD32. In SIMD32, you can end
- * up with writes to 4 registers and a source that reads 2 registers
- * and we may still need to lower all the way to SIMD8 in that case.
- */
- if (inst->size_written > REG_SIZE &&
- inst->size_read(i) != 0 &&
- inst->size_read(i) < inst->size_written &&
- !is_scalar_exception && !is_packed_word_exception) {
- const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
- max_width = MIN2(max_width, inst->exec_size / reg_count);
- }
- }
- }
-
- if (devinfo->ver < 6) {
- /* From the G45 PRM, Volume 4 Page 361:
- *
- * "Operand Alignment Rule: With the exceptions listed below, a
- * source/destination operand in general should be aligned to even
- * 256-bit physical register with a region size equal to two 256-bit
- * physical registers."
- *
- * Normally we enforce this by allocating virtual registers to the
- * even-aligned class. But we need to handle payload registers.
- */
- for (unsigned i = 0; i < inst->sources; i++) {
- if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
- inst->size_read(i) > REG_SIZE) {
- max_width = MIN2(max_width, 8);
- }
- }
- }
-
- /* From the IVB PRMs:
- * "When an instruction is SIMD32, the low 16 bits of the execution mask
- * are applied for both halves of the SIMD32 instruction. If different
- * execution mask channels are required, split the instruction into two
- * SIMD16 instructions."
- *
- * There is similar text in the HSW PRMs. Gfx4-6 don't even implement
- * 32-wide control flow support in hardware and will behave similarly.
- */
- if (devinfo->ver < 8 && !inst->force_writemask_all)
- max_width = MIN2(max_width, 16);
-
/* From the IVB PRMs (applies to HSW too):
* "Instructions with condition modifiers must not use SIMD32."
*
* From the BDW PRMs (applies to later hardware too):
* "Ternary instruction with condition modifiers must not use SIMD32."
*/
- if (inst->conditional_mod && (devinfo->ver < 8 ||
- (inst->is_3src(compiler) && devinfo->ver < 12)))
+ if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12)
max_width = MIN2(max_width, 16);
/* From the IVB PRMs (applies to other devices that don't have the
@@ -193,41 +113,6 @@ get_fpu_lowered_simd_width(const fs_visitor *shader,
if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
max_width = MIN2(max_width, inst->exec_size / reg_count);
- /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
- * the 8-bit quarter of the execution mask signals specified in the
- * instruction control fields) for the second compressed half of any
- * single-precision instruction (for double-precision instructions
- * it's hardwired to use NibCtrl+1, at least on HSW), which means that
- * the EU will apply the wrong execution controls for the second
- * sequential GRF write if the number of channels per GRF is not exactly
- * eight in single-precision mode (or four in double-float mode).
- *
- * In this situation we calculate the maximum size of the split
- * instructions so they only ever write to a single register.
- */
- if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
- !inst->force_writemask_all) {
- const unsigned channels_per_grf = inst->exec_size /
- DIV_ROUND_UP(inst->size_written, REG_SIZE);
- const unsigned exec_type_size = get_exec_type_size(inst);
- assert(exec_type_size);
-
- /* The hardware shifts exactly 8 channels per compressed half of the
- * instruction in single-precision mode and exactly 4 in double-precision.
- */
- if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
- max_width = MIN2(max_width, channels_per_grf);
-
- /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
- * because HW applies the same channel enable signals to both halves of
- * the compressed instruction which will be just wrong under
- * non-uniform control flow.
- */
- if (devinfo->verx10 == 70 &&
- (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
- max_width = MIN2(max_width, 4);
- }
-
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
* Float Operations:
*
@@ -292,24 +177,10 @@ get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
return devinfo->ver < 20 ? 8 : 16;
- /* Calculate the number of coordinate components that have to be present
- * assuming that additional arguments follow the texel coordinates in the
- * message payload. On IVB+ there is no need for padding, on ILK-SNB we
- * need to pad to four or three components depending on the message,
- * pre-ILK we need to pad to at most three components.
- */
- const unsigned req_coord_components =
- (devinfo->ver >= 7 ||
- !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
- (devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
- inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
- 3;
-
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
* variant of the TXL or TXF message.
*/
- const bool implicit_lod = devinfo->ver >= 9 &&
- (inst->opcode == SHADER_OPCODE_TXL ||
+ const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL ||
inst->opcode == SHADER_OPCODE_TXF) &&
inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
@@ -317,8 +188,7 @@ get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
* to the sampler unit.
*/
const unsigned num_payload_components =
- MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
- req_coord_components) +
+ inst->components_read(TEX_LOGICAL_SRC_COORDINATE) +
inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
@@ -386,32 +256,10 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
case SHADER_OPCODE_SEL_EXEC:
case SHADER_OPCODE_CLUSTER_BROADCAST:
case SHADER_OPCODE_MOV_RELOC_IMM:
- return get_fpu_lowered_simd_width(shader, inst);
-
- case BRW_OPCODE_CMP: {
- /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
- * when the destination is a GRF the dependency-clear bit on the flag
- * register is cleared early.
- *
- * Suggested workarounds are to disable coissuing CMP instructions
- * or to split CMP(16) instructions into two CMP(8) instructions.
- *
- * We choose to split into CMP(8) instructions since disabling
- * coissuing would affect CMP instructions not otherwise affected by
- * the errata.
- */
- const unsigned max_width = (devinfo->verx10 == 70 &&
- !inst->dst.is_null() ? 8 : ~0);
- return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
- }
+ case BRW_OPCODE_CMP:
case BRW_OPCODE_BFI1:
case BRW_OPCODE_BFI2:
- /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
- * should
- * "Force BFI instructions to be executed always in SIMD8."
- */
- return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
- get_fpu_lowered_simd_width(shader, inst));
+ return get_fpu_lowered_simd_width(shader, inst);
case BRW_OPCODE_IF:
assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
@@ -424,11 +272,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS: {
- /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
- * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
- */
- if (devinfo->ver == 6 || devinfo->verx10 == 40)
- return MIN2(8, inst->exec_size);
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
return MIN2(8, inst->exec_size);
return MIN2(16, inst->exec_size);
@@ -438,8 +281,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
/* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
* to SIMD8 with half-float
*/
- if (devinfo->ver < 7)
- return MIN2(8, inst->exec_size);
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
return MIN2(8, inst->exec_size);
return MIN2(16, inst->exec_size);
@@ -460,64 +301,20 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
- return MIN2(16, inst->exec_size);
-
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
- /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
- * message used to implement varying pull constant loads, so expand it
- * to SIMD16. An alternative with longer message payload length but
- * shorter return payload would be to use the SIMD8 sampler message that
- * takes (header, u, v, r) as parameters instead of (header, u).
- */
- return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
-
case FS_OPCODE_DDX_COARSE:
case FS_OPCODE_DDX_FINE:
case FS_OPCODE_DDY_COARSE:
case FS_OPCODE_DDY_FINE:
- /* The implementation of this virtual opcode may require emitting
- * compressed Align16 instructions, which are severely limited on some
- * generations.
- *
- * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
- * Region Restrictions):
- *
- * "In Align16 access mode, SIMD16 is not allowed for DW operations
- * and SIMD8 is not allowed for DF operations."
- *
- * In this context, "DW operations" means "operations acting on 32-bit
- * values", so it includes operations on floats.
- *
- * Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
- * (Instruction Compression -> Rules and Restrictions):
- *
- * "A compressed instruction must be in Align1 access mode. Align16
- * mode instructions cannot be compressed."
- *
- * Similar text exists in the g45 PRM.
- *
- * Empirically, compressed align16 instructions using odd register
- * numbers don't appear to work on Sandybridge either.
- */
- return (devinfo->ver == 4 || devinfo->ver == 6 ||
- (devinfo->verx10 == 70) ?
- MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
+ return MIN2(16, inst->exec_size);
case SHADER_OPCODE_MULH:
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
* is 8-wide on Gfx7+.
*/
- return (devinfo->ver >= 20 ? 16 :
- devinfo->ver >= 7 ? 8 :
- get_fpu_lowered_simd_width(shader, inst));
+ return devinfo->ver >= 20 ? 16 : 8;
case FS_OPCODE_FB_WRITE_LOGICAL:
- /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
- * here.
- */
- assert(devinfo->ver != 6 ||
- inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
- inst->exec_size == 8);
/* Dual-source FB writes are unsupported in SIMD16 mode. */
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
8 : MIN2(16, inst->exec_size));
@@ -539,6 +336,10 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
return get_sampler_lowered_simd_width(devinfo, inst);
/* On gfx12 parameters are fixed to 16-bit values and therefore they all
@@ -553,26 +354,6 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
*/
return devinfo->ver < 20 ? 8 : 16;
- case SHADER_OPCODE_TXL_LOGICAL:
- case FS_OPCODE_TXB_LOGICAL:
- /* Only one execution size is representable pre-ILK depending on whether
- * the shadow reference argument is present.
- */
- if (devinfo->ver == 4)
- return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
- else
- return get_sampler_lowered_simd_width(devinfo, inst);
-
- case SHADER_OPCODE_TXF_LOGICAL:
- case SHADER_OPCODE_TXS_LOGICAL:
- /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
- * messages. Use SIMD16 instead.
- */
- if (devinfo->ver == 4)
- return 16;
- else
- return get_sampler_lowered_simd_width(devinfo, inst);
-
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
@@ -585,13 +366,11 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
- return MIN2(16, inst->exec_size);
-
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
- return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
+ return MIN2(16, inst->exec_size);
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
@@ -624,9 +403,9 @@ brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
* the EU decompression logic not handling VxH indirect addressing
* correctly.
*/
- const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
+ const unsigned max_size = 2 * REG_SIZE;
/* Prior to Broadwell, we only have 8 address subregisters. */
- return MIN3(devinfo->ver >= 8 ? 16 : 8,
+ return MIN3(16,
max_size / (inst->dst.stride * type_sz(inst->dst.type)),
inst->exec_size);
}