diff options
-rw-r--r-- | backend/include/berberis/backend/x86_64/machine_ir_builder.h | 31 | ||||
-rw-r--r-- | backend/x86_64/context_liveness_analyzer_test.cc | 22 | ||||
-rw-r--r-- | backend/x86_64/lir_instructions.json | 12 | ||||
-rw-r--r-- | backend/x86_64/local_guest_context_optimizer_test.cc | 12 | ||||
-rw-r--r-- | backend/x86_64/loop_guest_context_optimizer_test.cc | 58 | ||||
-rw-r--r-- | backend/x86_64/machine_ir_opt_test.cc | 30 | ||||
-rw-r--r-- | guest_state/include/berberis/guest_state/guest_state_opaque.h | 8 | ||||
-rw-r--r-- | guest_state/riscv64/guest_state_arch.cc | 22 | ||||
-rw-r--r-- | guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h | 3 | ||||
-rw-r--r-- | heavy_optimizer/riscv64/frontend.cc | 287 | ||||
-rw-r--r-- | heavy_optimizer/riscv64/frontend.h | 335 | ||||
-rw-r--r-- | test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h | 28 | ||||
-rw-r--r-- | tests/run_host_tests.mk | 5 |
13 files changed, 764 insertions, 89 deletions
diff --git a/backend/include/berberis/backend/x86_64/machine_ir_builder.h b/backend/include/berberis/backend/x86_64/machine_ir_builder.h index 536e6790..f95d7606 100644 --- a/backend/include/berberis/backend/x86_64/machine_ir_builder.h +++ b/backend/include/berberis/backend/x86_64/machine_ir_builder.h @@ -23,6 +23,7 @@ #include "berberis/backend/common/machine_ir_builder.h" #include "berberis/backend/x86_64/machine_ir.h" #include "berberis/base/logging.h" +#include "berberis/guest_state/guest_addr.h" #include "berberis/guest_state/guest_state_opaque.h" namespace berberis::x86_64 { @@ -53,6 +54,14 @@ class MachineIRBuilder : public MachineIRBuilderBase<MachineIR> { x86_64::kMachineRegRBP, GetThreadStateRegOffset(guest_reg), src_reg); } + void GenGetOffset(MachineReg dst_reg, int32_t offset) { + Gen<x86_64::MovqRegMemBaseDisp>(dst_reg, x86_64::kMachineRegRBP, offset); + } + + void GenPutOffset(int32_t offset, MachineReg src_reg) { + Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, offset, src_reg); + } + void GenGetSimd(MachineReg dst_reg, int guest_reg) { int32_t offset = GetThreadStateSimdRegOffset(guest_reg); Gen<x86_64::MovdqaXRegMemBaseDisp>(dst_reg, x86_64::kMachineRegRBP, offset); @@ -63,6 +72,28 @@ class MachineIRBuilder : public MachineIRBuilderBase<MachineIR> { Gen<x86_64::MovdqaMemBaseDispXReg>(x86_64::kMachineRegRBP, offset, src_reg); } + template <size_t kSize> + void GenGetSimd(MachineReg dst_reg, int32_t offset) { + if constexpr (kSize == 8) { + Gen<x86_64::MovsdXRegMemBaseDisp>(dst_reg, x86_64::kMachineRegRBP, offset); + } else if constexpr (kSize == 16) { + Gen<x86_64::MovdqaXRegMemBaseDisp>(dst_reg, x86_64::kMachineRegRBP, offset); + } else { + static_assert(kDependentValueFalse<kSize>); + } + } + + template <size_t kSize> + void GenSetSimd(int32_t offset, MachineReg src_reg) { + if constexpr (kSize == 8) { + Gen<x86_64::MovsdMemBaseDispXReg>(x86_64::kMachineRegRBP, offset, src_reg); + } else if constexpr (kSize == 16) { + 
Gen<x86_64::MovdqaMemBaseDispXReg>(x86_64::kMachineRegRBP, offset, src_reg); + } else { + static_assert(kDependentValueFalse<kSize>); + } + } + // Please use GenCallImm instead template <typename CallImmType, typename IntegralType, diff --git a/backend/x86_64/context_liveness_analyzer_test.cc b/backend/x86_64/context_liveness_analyzer_test.cc index cba84aaa..4746ded7 100644 --- a/backend/x86_64/context_liveness_analyzer_test.cc +++ b/backend/x86_64/context_liveness_analyzer_test.cc @@ -58,7 +58,7 @@ TEST(MachineIRContextLivenessAnalyzerTest, PutKillsLiveIn) { auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); @@ -79,11 +79,11 @@ TEST(MachineIRContextLivenessAnalyzerTest, GetRevivesLiveInKilledByPut) { auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb1); - builder.GenGet(vreg, 0); + builder.GenGetOffset(vreg, GetThreadStateRegOffset(0)); builder.Gen<PseudoBranch>(bb2); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); @@ -106,13 +106,13 @@ TEST(MachineIRContextLivenessAnalyzerTest, auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb1); - builder.GenGet(vreg, 1); + builder.GenGetOffset(vreg, GetThreadStateRegOffset(1)); builder.Gen<PseudoBranch>(bb2); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg); - builder.GenPut(1, vreg); - builder.GenPut(2, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); + builder.GenPutOffset(GetThreadStateRegOffset(1), vreg); + builder.GenPutOffset(GetThreadStateRegOffset(2), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), 
x86_64::kMachineIRCheckSuccess); @@ -139,13 +139,13 @@ TEST(MachineIRContextLivenessAnalyzerTest, ContextWritesOnlyKillLiveInIfHappenIn builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, bb2, bb3, x86_64::kMachineRegFLAGS); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg); - builder.GenPut(1, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); + builder.GenPutOffset(GetThreadStateRegOffset(1), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); builder.StartBasicBlock(bb3); - builder.GenPut(0, vreg); - builder.GenPut(2, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); + builder.GenPutOffset(GetThreadStateRegOffset(2), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); diff --git a/backend/x86_64/lir_instructions.json b/backend/x86_64/lir_instructions.json index 2c0b6e7c..e242543e 100644 --- a/backend/x86_64/lir_instructions.json +++ b/backend/x86_64/lir_instructions.json @@ -23,6 +23,8 @@ "AddqRegImm", "AddqRegReg", "AddqRegMemInsns", + "AndbRegImm", + "AndbMemImmInsns", "AndlRegImm", "AndlRegReg", "AndnqRegRegReg", @@ -66,10 +68,13 @@ "SarlRegReg", "SarqRegImm", "SarqRegReg", + "ShlbRegImm", + "ShldlRegRegImm", "ShllRegImm", "ShllRegReg", "ShlqRegImm", "ShlqRegReg", + "ShrbRegImm", "ShrlRegImm", "ShrlRegReg", "ShrqRegImm", @@ -83,8 +88,10 @@ "TestqRegReg", "TestwRegImm", "TestwRegReg", + "LockCmpXchgqRegMemRegInsns", "LockCmpXchg16bRegRegRegRegMemInsns", "Mfence", + "MovbMemImmInsns", "MovbMemRegInsns", "MovdMemXRegInsns", "MovdRegXReg", @@ -111,6 +118,8 @@ "MovqRegXReg", "MovqXRegMemInsns", "MovqXRegReg", + "MovsdMemXRegInsns", + "MovsdXRegMemInsns", "MovsdXRegXReg", "MovssXRegXReg", "MovsxbqRegMemInsns", @@ -136,6 +145,9 @@ "MulsdXRegXReg", "MulssXRegXReg", "NotqReg", + "OrbMemImmInsns", + "OrbMemRegInsns", + "OrbRegReg", "OrlRegImm", "OrlRegReg", "OrqRegImm", diff --git a/backend/x86_64/local_guest_context_optimizer_test.cc 
b/backend/x86_64/local_guest_context_optimizer_test.cc index 811fcccb..65c1fc18 100644 --- a/backend/x86_64/local_guest_context_optimizer_test.cc +++ b/backend/x86_64/local_guest_context_optimizer_test.cc @@ -39,8 +39,8 @@ TEST(MachineIRLocalGuestContextOptimizer, RemoveReadAfterWrite) { builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); auto reg2 = machine_ir.AllocVReg(); - builder.GenPut(0, reg1); - builder.GenGet(reg2, 0); + builder.GenPutOffset(GetThreadStateRegOffset(0), reg1); + builder.GenGetOffset(reg2, GetThreadStateRegOffset(0)); builder.Gen<PseudoJump>(kNullGuestAddr); x86_64::RemoveLocalGuestContextAccesses(&machine_ir); @@ -71,8 +71,8 @@ TEST(MachineIRLocalGuestContextOptimizer, RemoveReadAfterRead) { builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); auto reg2 = machine_ir.AllocVReg(); - builder.GenGet(reg1, 0); - builder.GenGet(reg2, 0); + builder.GenGetOffset(reg1, GetThreadStateRegOffset(0)); + builder.GenGetOffset(reg2, GetThreadStateRegOffset(0)); builder.Gen<PseudoJump>(kNullGuestAddr); x86_64::RemoveLocalGuestContextAccesses(&machine_ir); @@ -101,8 +101,8 @@ TEST(MachineIRLocalGuestContextOptimizer, RemoveWriteAfterWrite) { builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); auto reg2 = machine_ir.AllocVReg(); - builder.GenPut(0, reg1); - builder.GenPut(0, reg2); + builder.GenPutOffset(GetThreadStateRegOffset(0), reg1); + builder.GenPutOffset(GetThreadStateRegOffset(0), reg2); builder.Gen<PseudoJump>(kNullGuestAddr); x86_64::RemoveLocalGuestContextAccesses(&machine_ir); diff --git a/backend/x86_64/loop_guest_context_optimizer_test.cc b/backend/x86_64/loop_guest_context_optimizer_test.cc index 37bc0667..291f0ae4 100644 --- a/backend/x86_64/loop_guest_context_optimizer_test.cc +++ b/backend/x86_64/loop_guest_context_optimizer_test.cc @@ -41,7 +41,7 @@ TEST(MachineIRLoopGuestContextOptimizer, ReplaceGetAndUpdateMap) { auto bb = machine_ir.NewBasicBlock(); builder.StartBasicBlock(bb); auto reg1 = 
machine_ir.AllocVReg(); - builder.GenGet(reg1, 0); + builder.GenGetOffset(reg1, GetThreadStateRegOffset(0)); builder.Gen<PseudoJump>(kNullGuestAddr); auto insn_it = bb->insn_list().begin(); @@ -66,7 +66,7 @@ TEST(MachineIRLoopGuestContextOptimizer, ReplacePutAndUpdateMap) { auto bb = machine_ir.NewBasicBlock(); builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); - builder.GenPut(1, reg1); + builder.GenPutOffset(GetThreadStateRegOffset(1), reg1); builder.Gen<PseudoJump>(kNullGuestAddr); auto insn_it = bb->insn_list().begin(); @@ -92,8 +92,8 @@ TEST(MachineIRLoopGuestContextOptimizer, ReplaceGetPutAndUpdateMap) { builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); auto reg2 = machine_ir.AllocVReg(); - builder.GenGet(reg1, 1); - builder.GenPut(1, reg2); + builder.GenGetOffset(reg1, GetThreadStateRegOffset(1)); + builder.GenPutOffset(GetThreadStateRegOffset(1), reg2); builder.Gen<PseudoJump>(kNullGuestAddr); auto insn_it = bb->insn_list().begin(); @@ -122,7 +122,7 @@ TEST(MachineIRLoopGuestContextOptimizer, ReplaceGetSimdAndUpdateMap) { auto bb = machine_ir.NewBasicBlock(); builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); - builder.GenGetSimd(reg1, 0); + builder.GenGetSimd<16>(reg1, GetThreadStateSimdRegOffset(0)); builder.Gen<PseudoJump>(kNullGuestAddr); auto insn_it = bb->insn_list().begin(); @@ -147,7 +147,7 @@ TEST(MachineIRLoopGuestContextOptimizer, ReplacePutSimdAndUpdateMap) { auto bb = machine_ir.NewBasicBlock(); builder.StartBasicBlock(bb); auto reg1 = machine_ir.AllocVReg(); - builder.GenSetSimd(0, reg1); + builder.GenSetSimd<16>(GetThreadStateSimdRegOffset(0), reg1); builder.Gen<PseudoJump>(kNullGuestAddr); auto insn_it = bb->insn_list().begin(); @@ -493,7 +493,7 @@ TEST(MachineIRLoopGuestContextOptimizer, RemovePutInSelfLoop) { builder.Gen<PseudoBranch>(body); builder.StartBasicBlock(body); - builder.GenPut(0, vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); 
builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body, afterloop, kMachineRegFLAGS); builder.StartBasicBlock(afterloop); @@ -537,7 +537,7 @@ TEST(MachineIRLoopGuestContextOptimizer, RemoveGetInSelfLoop) { builder.Gen<PseudoBranch>(body); builder.StartBasicBlock(body); - builder.GenGet(vreg1, 0); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(0)); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body, afterloop, kMachineRegFLAGS); builder.StartBasicBlock(afterloop); @@ -580,8 +580,8 @@ TEST(MachineIRLoopGuestContextOptimizer, RemoveGetPutInSelfLoop) { builder.Gen<PseudoBranch>(body); builder.StartBasicBlock(body); - builder.GenGet(vreg1, 0); - builder.GenPut(0, vreg2); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(0)); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg2); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body, afterloop, kMachineRegFLAGS); builder.StartBasicBlock(afterloop); @@ -634,7 +634,7 @@ TEST(MachineIRLoopGuestContextOptimizer, RemovePutInLoopWithMultipleExits) { builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body2, afterloop1, kMachineRegFLAGS); builder.StartBasicBlock(body2); - builder.GenPut(0, vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body1, afterloop2, kMachineRegFLAGS); builder.StartBasicBlock(afterloop1); @@ -687,14 +687,14 @@ TEST(MachineIRLoopGuestContextOptimizer, CountGuestRegAccesses) { builder.Gen<PseudoBranch>(body1); builder.StartBasicBlock(body1); - builder.GenPut(0, vreg1); - builder.GenGetSimd(vreg2, 0); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenGetSimd<16>(vreg2, GetThreadStateSimdRegOffset(0)); builder.Gen<PseudoBranch>(body2); builder.StartBasicBlock(body2); - builder.GenGet(vreg1, 1); - builder.GenPut(1, vreg1); - builder.GenSetSimd(0, vreg2); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(1)); + 
builder.GenPutOffset(GetThreadStateRegOffset(1), vreg1); + builder.GenSetSimd<16>(GetThreadStateSimdRegOffset(0), vreg2); builder.Gen<PseudoBranch>(body1); Loop loop({body1, body2}, machine_ir.arena()); @@ -723,15 +723,15 @@ TEST(MachineIRLoopGuestContextOptimizer, GetOffsetCounters) { builder.Gen<PseudoBranch>(body1); builder.StartBasicBlock(body1); - builder.GenPut(0, vreg1); - builder.GenGet(vreg1, 0); - builder.GenGet(vreg1, 1); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(0)); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(1)); builder.Gen<PseudoBranch>(body2); builder.StartBasicBlock(body2); - builder.GenGet(vreg1, 2); - builder.GenPut(2, vreg1); - builder.GenPut(0, vreg1); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(2)); + builder.GenPutOffset(GetThreadStateRegOffset(2), vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); builder.Gen<PseudoBranch>(body1); Loop loop({body1, body2}, machine_ir.arena()); @@ -769,16 +769,16 @@ TEST(MachineIRLoopGuestContextOptimizer, OptimizeLoopWithPriority) { // Regular reg 0 has 3 uses. // Regular reg 1 has 1 use. builder.StartBasicBlock(body); - builder.GenGet(vreg1, 0); - builder.GenPut(0, vreg1); - builder.GenGet(vreg1, 0); - builder.GenGet(vreg1, 1); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(0)); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(0)); + builder.GenGetOffset(vreg1, GetThreadStateRegOffset(1)); // Simd reg 0 has 2 uses. // Simd reg 1 has 1 use. 
- builder.GenGetSimd(vreg2, 0); - builder.GenSetSimd(0, vreg2); - builder.GenGetSimd(vreg2, 1); + builder.GenGetSimd<16>(vreg2, GetThreadStateSimdRegOffset(0)); + builder.GenSetSimd<16>(GetThreadStateSimdRegOffset(0), vreg2); + builder.GenGetSimd<16>(vreg2, GetThreadStateSimdRegOffset(1)); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, body, afterloop, kMachineRegFLAGS); builder.StartBasicBlock(afterloop); diff --git a/backend/x86_64/machine_ir_opt_test.cc b/backend/x86_64/machine_ir_opt_test.cc index b3ea4623..b9814469 100644 --- a/backend/x86_64/machine_ir_opt_test.cc +++ b/backend/x86_64/machine_ir_opt_test.cc @@ -402,15 +402,15 @@ TEST(MachineIR, PutsInSuccessorsKillPut) { auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb1); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, bb2, bb3, x86_64::kMachineRegFLAGS); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); builder.StartBasicBlock(bb3); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); @@ -435,11 +435,11 @@ TEST(MachineIR, PutInOneOfTwoSuccessorsDoesNotKillPut) { auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb1); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, bb2, bb3, x86_64::kMachineRegFLAGS); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); builder.StartBasicBlock(bb3); @@ -468,18 +468,18 @@ TEST(MachineIR, MultiplePutsCanBeKilled) { auto vreg1 = machine_ir.AllocVReg(); auto vreg2 = machine_ir.AllocVReg(); 
builder.StartBasicBlock(bb1); - builder.GenPut(0, vreg1); - builder.GenPut(1, vreg2); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(1), vreg2); builder.Gen<PseudoCondBranch>(CodeEmitter::Condition::kZero, bb2, bb3, x86_64::kMachineRegFLAGS); builder.StartBasicBlock(bb2); - builder.GenPut(0, vreg1); - builder.GenPut(1, vreg2); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(1), vreg2); builder.Gen<PseudoJump>(kNullGuestAddr); builder.StartBasicBlock(bb3); - builder.GenPut(0, vreg1); - builder.GenPut(1, vreg2); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg1); + builder.GenPutOffset(GetThreadStateRegOffset(1), vreg2); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); @@ -504,16 +504,16 @@ TEST(MachineIR, GetInOneOfTheSuccessorsMakesPutLive) { auto vreg = machine_ir.AllocVReg(); builder.StartBasicBlock(bb1); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoBranch>(bb2); builder.StartBasicBlock(bb2); - builder.GenGet(vreg, 0); - builder.GenPut(0, vreg); + builder.GenGetOffset(vreg, GetThreadStateRegOffset(0)); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); builder.StartBasicBlock(bb3); - builder.GenPut(0, vreg); + builder.GenPutOffset(GetThreadStateRegOffset(0), vreg); builder.Gen<PseudoJump>(kNullGuestAddr); EXPECT_EQ(x86_64::CheckMachineIR(machine_ir), x86_64::kMachineIRCheckSuccess); diff --git a/guest_state/include/berberis/guest_state/guest_state_opaque.h b/guest_state/include/berberis/guest_state/guest_state_opaque.h index 714de181..a2adf57a 100644 --- a/guest_state/include/berberis/guest_state/guest_state_opaque.h +++ b/guest_state/include/berberis/guest_state/guest_state_opaque.h @@ -92,6 +92,14 @@ void SetShadowCallStackPointer(CPUState& cpu, GuestAddr scs_sp); 
void InitFloatingPointState(); std::size_t GetThreadStateRegOffset(int reg); + +bool DoesCpuStateHaveDedicatedFpRegs(); +std::size_t GetThreadStateFRegOffset(int freg); + +bool DoesCpuStateHaveDedicatedVecRegs(); +std::size_t GetThreadStateVRegOffset(int vreg); + +bool DoesCpuStateHaveDedicatedSimdRegs(); std::size_t GetThreadStateSimdRegOffset(int simd_reg); bool IsSimdOffset(size_t offset); diff --git a/guest_state/riscv64/guest_state_arch.cc b/guest_state/riscv64/guest_state_arch.cc index 1d0fbaae..86d5ac87 100644 --- a/guest_state/riscv64/guest_state_arch.cc +++ b/guest_state/riscv64/guest_state_arch.cc @@ -72,7 +72,16 @@ std::size_t GetThreadStateRegOffset(int reg) { return offsetof(ThreadState, cpu.x[reg]); } +std::size_t GetThreadStateFRegOffset(int freg) { + return offsetof(ThreadState, cpu.f[freg]); +} + +std::size_t GetThreadStateVRegOffset(int vreg) { + return offsetof(ThreadState, cpu.v[vreg]); +} + std::size_t GetThreadStateSimdRegOffset(int simd_reg) { + // TODO(b/291126259) Switch to CHECK(false) after we switch frontend to F regs. return offsetof(ThreadState, cpu.v[simd_reg]); } @@ -93,6 +102,19 @@ bool DoesCpuStateHaveFlags() { return false; } +bool DoesCpuStateHaveDedicatedFpRegs() { + return true; +} + +bool DoesCpuStateHaveDedicatedVecRegs() { + return true; +} + +bool DoesCpuStateHaveDedicatedSimdRegs() { + // TODO(b/291126259) Return false after we switch frontend to F regs. 
+ return true; +} + std::size_t GetThreadStateFlagOffset() { // RISCV64 Does not have flags in its CPUState CHECK(false); diff --git a/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h b/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h index 59195c27..9c7f9044 100644 --- a/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h +++ b/guest_state/riscv64/include/berberis/guest_state/guest_state_arch.h @@ -20,6 +20,7 @@ #include <array> #include <atomic> #include <cstdint> +#include <type_traits> #include "berberis/base/config.h" #include "berberis/base/dependent_false.h" @@ -110,6 +111,8 @@ struct CPUState { uint8_t frm; }; +static_assert(std::is_standard_layout_v<CPUState>); + constexpr uint32_t kNumGuestRegs = std::size(CPUState{}.x); constexpr uint32_t kNumGuestFpRegs = std::size(CPUState{}.f); diff --git a/heavy_optimizer/riscv64/frontend.cc b/heavy_optimizer/riscv64/frontend.cc index cd1c0f1b..b04144cb 100644 --- a/heavy_optimizer/riscv64/frontend.cc +++ b/heavy_optimizer/riscv64/frontend.cc @@ -20,10 +20,12 @@ #include "berberis/assembler/x86_64.h" #include "berberis/backend/common/machine_ir.h" +#include "berberis/backend/x86_64/machine_ir.h" #include "berberis/base/checks.h" #include "berberis/base/config.h" #include "berberis/guest_state/guest_state_arch.h" #include "berberis/guest_state/guest_state_opaque.h" +#include "berberis/runtime_primitives/memory_region_reservation.h" #include "berberis/runtime_primitives/platform.h" namespace berberis { @@ -260,18 +262,18 @@ void HeavyOptimizerFrontend::UpdateBranchTargetsAfterSplit(GuestAddr addr, Register HeavyOptimizerFrontend::GetReg(uint8_t reg) { CHECK_LT(reg, kNumGuestRegs); Register dst = AllocTempReg(); - builder_.GenGet(dst, reg); + builder_.GenGetOffset(dst, GetThreadStateRegOffset(reg)); return dst; } void HeavyOptimizerFrontend::SetReg(uint8_t reg, Register value) { CHECK_LT(reg, kNumGuestRegs); - builder_.GenPut(reg, value); + 
builder_.GenPutOffset(GetThreadStateRegOffset(reg), value); } FpRegister HeavyOptimizerFrontend::GetFpReg(uint8_t reg) { FpRegister result = AllocTempSimdReg(); - builder_.GenGetSimd(result.machine_reg(), reg); + builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg)); return result; } @@ -733,4 +735,283 @@ void HeavyOptimizerFrontend::Finalize(GuestAddr stop_pc) { ResolveJumps(); } +Register HeavyOptimizerFrontend::LoadWithoutRecovery(Decoder::LoadOperandType operand_type, + Register base, + int32_t disp) { + auto res = AllocTempReg(); + switch (operand_type) { + case Decoder::LoadOperandType::k8bitUnsigned: + Gen<x86_64::MovzxblRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k16bitUnsigned: + Gen<x86_64::MovzxwlRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k32bitUnsigned: + Gen<x86_64::MovlRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k64bit: + Gen<x86_64::MovqRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k8bitSigned: + Gen<x86_64::MovsxbqRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k16bitSigned: + Gen<x86_64::MovsxwqRegMemBaseDisp>(res, base, disp); + break; + case Decoder::LoadOperandType::k32bitSigned: + Gen<x86_64::MovsxlqRegMemBaseDisp>(res, base, disp); + break; + default: + Unimplemented(); + return {}; + } + + return res; +} + +Register HeavyOptimizerFrontend::LoadWithoutRecovery(Decoder::LoadOperandType operand_type, + Register base, + Register index, + int32_t disp) { + auto res = AllocTempReg(); + switch (operand_type) { + case Decoder::LoadOperandType::k8bitUnsigned: + Gen<x86_64::MovzxblRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case Decoder::LoadOperandType::k16bitUnsigned: + Gen<x86_64::MovzxwlRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case 
Decoder::LoadOperandType::k32bitUnsigned: + Gen<x86_64::MovlRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case Decoder::LoadOperandType::k64bit: + Gen<x86_64::MovqRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case Decoder::LoadOperandType::k8bitSigned: + Gen<x86_64::MovsxbqRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case Decoder::LoadOperandType::k16bitSigned: + Gen<x86_64::MovsxwqRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + case Decoder::LoadOperandType::k32bitSigned: + Gen<x86_64::MovsxlqRegMemBaseIndexDisp>( + res, base, index, x86_64::MachineMemOperandScale::kOne, disp); + break; + default: + Unimplemented(); + return {}; + } + return res; +} + +Register HeavyOptimizerFrontend::UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) { + Register res = AllocTempReg(); + switch (opcode) { + case Decoder::CsrOpcode::kCsrrs: + Gen<PseudoCopy>(res, arg, 8); + Gen<x86_64::OrqRegReg>(res, csr, GetFlagsRegister()); + break; + case Decoder::CsrOpcode::kCsrrc: + if (host_platform::kHasBMI) { + Gen<x86_64::AndnqRegRegReg>(res, arg, csr, GetFlagsRegister()); + } else { + Gen<PseudoCopy>(res, arg, 8); + Gen<x86_64::NotqReg>(res); + Gen<x86_64::AndqRegReg>(res, csr, GetFlagsRegister()); + } + break; + default: + Unimplemented(); + return {}; + } + return arg; +} + +Register HeavyOptimizerFrontend::UpdateCsr(Decoder::CsrImmOpcode opcode, + uint8_t imm, + Register csr) { + Register res = AllocTempReg(); + switch (opcode) { + case Decoder::CsrImmOpcode::kCsrrwi: + Gen<x86_64::MovlRegImm>(res, imm); + break; + case Decoder::CsrImmOpcode::kCsrrsi: + Gen<x86_64::MovlRegImm>(res, imm); + Gen<x86_64::OrqRegReg>(res, csr, GetFlagsRegister()); + break; + case Decoder::CsrImmOpcode::kCsrrci: + Gen<x86_64::MovqRegImm>(res, static_cast<int8_t>(~imm)); + 
Gen<x86_64::AndqRegReg>(res, csr, GetFlagsRegister()); + break; + default: + Unimplemented(); + return {}; + } + return res; +} + +void HeavyOptimizerFrontend::StoreWithoutRecovery(Decoder::StoreOperandType operand_type, + Register base, + int32_t disp, + Register data) { + switch (operand_type) { + case Decoder::StoreOperandType::k8bit: + Gen<x86_64::MovbMemBaseDispReg>(base, disp, data); + break; + case Decoder::StoreOperandType::k16bit: + Gen<x86_64::MovwMemBaseDispReg>(base, disp, data); + break; + case Decoder::StoreOperandType::k32bit: + Gen<x86_64::MovlMemBaseDispReg>(base, disp, data); + break; + case Decoder::StoreOperandType::k64bit: + Gen<x86_64::MovqMemBaseDispReg>(base, disp, data); + break; + default: + return Unimplemented(); + } +} + +void HeavyOptimizerFrontend::StoreWithoutRecovery(Decoder::StoreOperandType operand_type, + Register base, + Register index, + int32_t disp, + Register data) { + switch (operand_type) { + case Decoder::StoreOperandType::k8bit: + Gen<x86_64::MovbMemBaseIndexDispReg>( + base, index, x86_64::MachineMemOperandScale::kOne, disp, data); + break; + case Decoder::StoreOperandType::k16bit: + Gen<x86_64::MovwMemBaseIndexDispReg>( + base, index, x86_64::MachineMemOperandScale::kOne, disp, data); + break; + case Decoder::StoreOperandType::k32bit: + Gen<x86_64::MovlMemBaseIndexDispReg>( + base, index, x86_64::MachineMemOperandScale::kOne, disp, data); + break; + case Decoder::StoreOperandType::k64bit: + Gen<x86_64::MovqMemBaseIndexDispReg>( + base, index, x86_64::MachineMemOperandScale::kOne, disp, data); + break; + default: + return Unimplemented(); + } +} + +void HeavyOptimizerFrontend::MemoryRegionReservationLoad(Register aligned_addr) { + // Store aligned_addr in CPUState. + int32_t address_offset = GetThreadStateReservationAddressOffset(); + Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, address_offset, aligned_addr); + + // MemoryRegionReservation::SetOwner(aligned_addr, &(state->cpu)). 
+ builder_.GenCallImm(bit_cast<uint64_t>(&MemoryRegionReservation::SetOwner), + GetFlagsRegister(), + std::array<x86_64::CallImm::Arg, 2>{{ + {aligned_addr, x86_64::CallImm::kIntRegType}, + {x86_64::kMachineRegRBP, x86_64::CallImm::kIntRegType}, + }}); + + // Load monitor value and store it in CPUState. + auto monitor = AllocTempSimdReg(); + MachineReg reservation_reg = monitor.machine_reg(); + Gen<x86_64::MovqRegMemBaseDisp>(reservation_reg, aligned_addr, 0); + int32_t value_offset = GetThreadStateReservationValueOffset(); + Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, value_offset, reservation_reg); +} + +Register HeavyOptimizerFrontend::MemoryRegionReservationExchange(Register aligned_addr, + Register curr_reservation_value) { + auto* ir = builder_.ir(); + auto* cur_bb = builder_.bb(); + auto* addr_match_bb = ir->NewBasicBlock(); + auto* failure_bb = ir->NewBasicBlock(); + auto* continue_bb = ir->NewBasicBlock(); + ir->AddEdge(cur_bb, addr_match_bb); + ir->AddEdge(cur_bb, failure_bb); + ir->AddEdge(failure_bb, continue_bb); + Register result = AllocTempReg(); + + // MemoryRegionReservation::Clear. + Register stored_aligned_addr = AllocTempReg(); + int32_t address_offset = GetThreadStateReservationAddressOffset(); + Gen<x86_64::MovqRegMemBaseDisp>(stored_aligned_addr, x86_64::kMachineRegRBP, address_offset); + Gen<x86_64::MovqMemBaseDispImm>(x86_64::kMachineRegRBP, address_offset, kNullGuestAddr); + // Compare aligned_addr to the one in CPUState. + Gen<x86_64::CmpqRegReg>(stored_aligned_addr, aligned_addr, GetFlagsRegister()); + Gen<PseudoCondBranch>( + x86_64::Assembler::Condition::kNotEqual, failure_bb, addr_match_bb, GetFlagsRegister()); + + builder_.StartBasicBlock(addr_match_bb); + // Load new reservation value into integer register where CmpXchgq expects it. 
+ Register new_reservation_value = AllocTempReg(); + int32_t value_offset = GetThreadStateReservationValueOffset(); + Gen<x86_64::MovqRegMemBaseDisp>(new_reservation_value, x86_64::kMachineRegRBP, value_offset); + + MemoryRegionReservationSwapWithLockedOwner( + aligned_addr, curr_reservation_value, new_reservation_value, failure_bb); + + ir->AddEdge(builder_.bb(), continue_bb); + // Pseudo-def for use-def operand of XOR to make sure data-flow is integrate. + Gen<PseudoDefReg>(result); + Gen<x86_64::XorqRegReg>(result, result, GetFlagsRegister()); + Gen<PseudoBranch>(continue_bb); + + builder_.StartBasicBlock(failure_bb); + Gen<x86_64::MovqRegImm>(result, 1); + Gen<PseudoBranch>(continue_bb); + + builder_.StartBasicBlock(continue_bb); + + return result; +} + +void HeavyOptimizerFrontend::MemoryRegionReservationSwapWithLockedOwner( + Register aligned_addr, + Register curr_reservation_value, + Register new_reservation_value, + MachineBasicBlock* failure_bb) { + auto* ir = builder_.ir(); + auto* lock_success_bb = ir->NewBasicBlock(); + auto* swap_success_bb = ir->NewBasicBlock(); + ir->AddEdge(builder_.bb(), lock_success_bb); + ir->AddEdge(builder_.bb(), failure_bb); + ir->AddEdge(lock_success_bb, swap_success_bb); + ir->AddEdge(lock_success_bb, failure_bb); + + // lock_entry = MemoryRegionReservation::TryLock(aligned_addr, &(state->cpu)). + auto* call = builder_.GenCallImm(bit_cast<uint64_t>(&MemoryRegionReservation::TryLock), + GetFlagsRegister(), + std::array<x86_64::CallImm::Arg, 2>{{ + {aligned_addr, x86_64::CallImm::kIntRegType}, + {x86_64::kMachineRegRBP, x86_64::CallImm::kIntRegType}, + }}); + Register lock_entry = AllocTempReg(); + // Limit life-time of a narrow reg-class call result. 
+ Gen<PseudoCopy>(lock_entry, call->IntResultAt(0), 8); + Gen<x86_64::TestqRegReg>(lock_entry, lock_entry, GetFlagsRegister()); + Gen<PseudoCondBranch>( + x86_64::Assembler::Condition::kZero, failure_bb, lock_success_bb, GetFlagsRegister()); + + builder_.StartBasicBlock(lock_success_bb); + auto rax = AllocTempReg(); + Gen<PseudoCopy>(rax, curr_reservation_value, 8); + Gen<x86_64::LockCmpXchgqRegMemBaseDispReg>( + rax, aligned_addr, 0, new_reservation_value, GetFlagsRegister()); + + // MemoryRegionReservation::Unlock(lock_entry) + Gen<x86_64::MovqMemBaseDispImm>(lock_entry, 0, 0); + // Zero-flag is set if CmpXchg is successful. + Gen<PseudoCondBranch>( + x86_64::Assembler::Condition::kNotZero, failure_bb, swap_success_bb, GetFlagsRegister()); + + builder_.StartBasicBlock(swap_success_bb); +} + } // namespace berberis
\ No newline at end of file diff --git a/heavy_optimizer/riscv64/frontend.h b/heavy_optimizer/riscv64/frontend.h index d88920fe..67b64fbc 100644 --- a/heavy_optimizer/riscv64/frontend.h +++ b/heavy_optimizer/riscv64/frontend.h @@ -20,13 +20,16 @@ #include "berberis/backend/x86_64/machine_ir.h" #include "berberis/backend/x86_64/machine_ir_builder.h" #include "berberis/base/arena_map.h" +#include "berberis/base/checks.h" #include "berberis/base/dependent_false.h" #include "berberis/decoder/riscv64/decoder.h" #include "berberis/decoder/riscv64/semantics_player.h" #include "berberis/guest_state/guest_addr.h" #include "berberis/guest_state/guest_state_arch.h" +#include "berberis/guest_state/guest_state_opaque.h" #include "berberis/intrinsics/intrinsics.h" #include "berberis/intrinsics/macro_assembler.h" +#include "berberis/runtime_primitives/memory_region_reservation.h" #include "berberis/runtime_primitives/platform.h" #include "call_intrinsic.h" @@ -108,20 +111,103 @@ class HeavyOptimizerFrontend { void Store(Decoder::StoreOperandType operand_type, Register arg, int16_t offset, Register data); Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset); + template <typename IntType> + constexpr Decoder::LoadOperandType ToLoadOperandType() { + if constexpr (std::is_same_v<IntType, int8_t>) { + return Decoder::LoadOperandType::k8bitSigned; + } else if constexpr (std::is_same_v<IntType, int16_t>) { + return Decoder::LoadOperandType::k16bitSigned; + } else if constexpr (std::is_same_v<IntType, int32_t>) { + return Decoder::LoadOperandType::k32bitSigned; + } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) { + return Decoder::LoadOperandType::k64bit; + } else if constexpr (std::is_same_v<IntType, uint8_t>) { + return Decoder::LoadOperandType::k8bitUnsigned; + } else if constexpr (std::is_same_v<IntType, uint16_t>) { + return Decoder::LoadOperandType::k16bitUnsigned; + } else if constexpr 
(std::is_same_v<IntType, uint32_t>) { + return Decoder::LoadOperandType::k32bitUnsigned; + } else { + static_assert(kDependentTypeFalse<IntType>); + } + } + + template <typename IntType> + constexpr Decoder::StoreOperandType ToStoreOperandType() { + if constexpr (std::is_same_v<IntType, int8_t> || std::is_same_v<IntType, uint8_t>) { + return Decoder::StoreOperandType::k8bit; + } else if constexpr (std::is_same_v<IntType, int16_t> || std::is_same_v<IntType, uint16_t>) { + return Decoder::StoreOperandType::k16bit; + } else if constexpr (std::is_same_v<IntType, int32_t> || std::is_same_v<IntType, uint32_t>) { + return Decoder::StoreOperandType::k32bit; + } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) { + return Decoder::StoreOperandType::k64bit; + } else { + static_assert(kDependentTypeFalse<IntType>); + } + } + + // Versions without recovery can be used to access non-guest memory (e.g. CPUState). + Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, Register base, int32_t disp); + Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, + Register base, + Register index, + int32_t disp); + void StoreWithoutRecovery(Decoder::StoreOperandType operand_type, + Register base, + int32_t disp, + Register val); + void StoreWithoutRecovery(Decoder::StoreOperandType operand_type, + Register base, + Register index, + int32_t disp, + Register val); + // // Atomic extensions. // template <typename IntType, bool aq, bool rl> - Register Lr(Register /* addr */) { - Unimplemented(); - return {}; + Register Lr(Register addr) { + Register aligned_addr = AllocTempReg(); + Gen<PseudoCopy>(aligned_addr, addr, 8); + // The immediate is sign extended to 64-bit. 
+ Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{0xf}, GetFlagsRegister()); + + MemoryRegionReservationLoad(aligned_addr); + + Register addr_offset = AllocTempReg(); + Gen<PseudoCopy>(addr_offset, addr, 8); + Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister()); + + // Load the requested part from CPUState. + return LoadWithoutRecovery(ToLoadOperandType<IntType>(), + x86_64::kMachineRegRBP, + addr_offset, + GetThreadStateReservationValueOffset()); } template <typename IntType, bool aq, bool rl> - Register Sc(Register /* addr */, Register /* data */) { - Unimplemented(); - return {}; + Register Sc(Register addr, Register data) { + // Compute aligned_addr. + auto aligned_addr = AllocTempReg(); + Gen<PseudoCopy>(aligned_addr, addr, 8); + // The immediate is sign extended to 64-bit. + Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{0xf}, GetFlagsRegister()); + + // Load current monitor value before we clobber it. + auto reservation_value = AllocTempReg(); + int32_t value_offset = GetThreadStateReservationValueOffset(); + Gen<x86_64::MovqRegMemBaseDisp>(reservation_value, x86_64::kMachineRegRBP, value_offset); + Register addr_offset = AllocTempReg(); + Gen<PseudoCopy>(addr_offset, addr, 8); + Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister()); + // It's okay to clobber reservation_value since we clear out reservation_address in + // MemoryRegionReservationExchange anyway. 
+ StoreWithoutRecovery( + ToStoreOperandType<IntType>(), x86_64::kMachineRegRBP, addr_offset, value_offset, data); + + return MemoryRegionReservationExchange(aligned_addr, reservation_value); } void Fence(Decoder::FenceOpcode /*opcode*/, @@ -149,7 +235,7 @@ class HeavyOptimizerFrontend { [[nodiscard]] FpRegister GetFRegAndUnboxNan(uint8_t reg) { CHECK_LE(reg, kNumGuestFpRegs); FpRegister result = AllocTempSimdReg(); - builder_.GenGetSimd(result.machine_reg(), reg); + builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg)); FpRegister unboxed_result = AllocTempSimdReg(); if (host_platform::kHasAVX) { builder_.Gen<x86_64::MacroUnboxNanFloat32AVX>(unboxed_result.machine_reg(), @@ -169,8 +255,7 @@ class HeavyOptimizerFrontend { } else { builder_.Gen<x86_64::MacroNanBoxFloat32>(value.machine_reg()); } - - builder_.GenSetSimd(reg, value.machine_reg()); + builder_.GenSetSimd<8>(GetThreadStateFRegOffset(reg), value.machine_reg()); } template <typename DataType> @@ -204,15 +289,8 @@ class HeavyOptimizerFrontend { // Csr // - Register UpdateCsr(Decoder::CsrOpcode /* opcode */, Register /* arg */, Register /* csr */) { - Unimplemented(); - return {}; - } - - Register UpdateCsr(Decoder::CsrImmOpcode /* opcode */, uint8_t /* imm */, Register /* csr */) { - Unimplemented(); - return {}; - } + Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr); + Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr); [[nodiscard]] bool success() const { return success_; } @@ -240,18 +318,48 @@ class HeavyOptimizerFrontend { template <CsrName kName> [[nodiscard]] Register GetCsr() { - Unimplemented(); - return {}; + auto csr_reg = AllocTempReg(); + if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) { + Gen<x86_64::MovzxblRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>); + } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) { + Gen<x86_64::MovqRegMemBaseDisp>(csr_reg, 
x86_64::kMachineRegRBP, kCsrFieldOffset<kName>); + } else { + static_assert(kDependentTypeFalse<CsrFieldType<kName>>); + } + return csr_reg; } template <CsrName kName> - void SetCsr(uint8_t /* imm */) { - Unimplemented(); + void SetCsr(uint8_t imm) { + // Note: csr immediate only has 5 bits in RISC-V encoding which guarantees that + // “imm & kCsrMask<kName>” can be used as 8-bit immediate. + if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) { + Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP, + kCsrFieldOffset<kName>, + static_cast<int8_t>(imm & kCsrMask<kName>)); + } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) { + Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP, + kCsrFieldOffset<kName>, + static_cast<int8_t>(imm & kCsrMask<kName>)); + } else { + static_assert(kDependentTypeFalse<CsrFieldType<kName>>); + } + } template <CsrName kName> - void SetCsr(Register /* arg */) { - Unimplemented(); + void SetCsr(Register arg) { + auto tmp = AllocTempReg(); + Gen<PseudoCopy>(tmp, arg, sizeof(CsrFieldType<kName>)); + if constexpr (sizeof(CsrFieldType<kName>) == 1) { + Gen<x86_64::AndbRegImm>(tmp, kCsrMask<kName>, GetFlagsRegister()); + Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp); + } else if constexpr (sizeof(CsrFieldType<kName>) == 8) { + Gen<x86_64::AndqRegImm>( + tmp, constants_pool::kConst<uint64_t{kCsrMask<kName>}>, GetFlagsRegister()); + Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp); + } else { + static_assert(kDependentTypeFalse<CsrFieldType<kName>>); + } } private: @@ -305,6 +413,13 @@ class HeavyOptimizerFrontend { return result; } + void MemoryRegionReservationLoad(Register aligned_addr); + Register MemoryRegionReservationExchange(Register aligned_addr, Register curr_reservation_value); + void MemoryRegionReservationSwapWithLockedOwner(Register aligned_addr, + Register curr_reservation_value, + Register new_reservation_value, + 
MachineBasicBlock* failure_bb); + // Syntax sugar. template <typename InsnType, typename... Args> /*may_discard*/ InsnType* Gen(Args... args) { @@ -350,6 +465,178 @@ class HeavyOptimizerFrontend { ArenaMap<GuestAddr, MachineInsnPosition> branch_targets_; }; +template <> +[[nodiscard]] inline HeavyOptimizerFrontend::Register +HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() { + auto csr_reg = AllocTempReg(); + auto tmp = AllocTempReg(); + bool inline_successful = TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>( + &builder_, GetFlagsRegister(), tmp); + CHECK(inline_successful); + Gen<x86_64::MovzxbqRegMemBaseDisp>( + csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>); + Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister()); + Gen<x86_64::OrbRegReg>(csr_reg, tmp, GetFlagsRegister()); + return csr_reg; +} + +template <> +[[nodiscard]] inline HeavyOptimizerFrontend::Register +HeavyOptimizerFrontend::GetCsr<CsrName::kFFlags>() { + return FeGetExceptions(); +} + +template <> +[[nodiscard]] inline HeavyOptimizerFrontend::Register +HeavyOptimizerFrontend::GetCsr<CsrName::kVlenb>() { + return GetImm(16); +} + +template <> +[[nodiscard]] inline HeavyOptimizerFrontend::Register +HeavyOptimizerFrontend::GetCsr<CsrName::kVxrm>() { + auto reg = AllocTempReg(); + Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>); + Gen<x86_64::AndbRegImm>(reg, 0b11, GetFlagsRegister()); + return reg; +} + +template <> +[[nodiscard]] inline HeavyOptimizerFrontend::Register +HeavyOptimizerFrontend::GetCsr<CsrName::kVxsat>() { + auto reg = AllocTempReg(); + Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>); + Gen<x86_64::ShrbRegImm>(reg, 2, GetFlagsRegister()); + return reg; +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t /* imm */) { + Unimplemented(); + // TODO(b/291126436) Figure out how to pass Mem arg to 
FeSetExceptionsAndRoundImmTranslate. + // // Note: instructions Csrrci or Csrrsi couldn't affect Frm because immediate only has five + // bits. + // // But these instruction don't pass their immediate-specified argument into `SetCsr`, they + // combine + // // it with register first. Fixing that can only be done by changing code in the semantics + // player. + // // + // // But Csrrwi may clear it. And we actually may only arrive here from Csrrwi. + // // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good + // idea + // // to rely on that: it's very subtle and it only affects code generation speed. + // Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, + // static_cast<int8_t>(imm >> 5)); bool successful = + // TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeSetExceptionsAndRoundImmTranslate>( + // &builder_, + // GetFlagsRegister(), + // x86_64::kMachineRegRBP, + // static_cast<int>(offsetof(ThreadState, intrinsics_scratch_area)), + // imm); + // CHECK(successful); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register /* arg */) { + Unimplemented(); + // TODO(b/291126436) Figure out how to pass Mem arg to FeSetExceptionsAndRoundTranslate. 
+ // auto tmp1 = AllocTempReg(); + // auto tmp2 = AllocTempReg(); + // Gen<PseudoCopy>(tmp1, arg, 1); + // Gen<x86_64::AndlRegImm>(tmp1, 0b1'1111, GetFlagsRegister()); + // Gen<x86_64::ShldlRegRegImm>(tmp2, arg, int8_t{32 - 5}, GetFlagsRegister()); + // Gen<x86_64::AndbRegImm>(tmp2, kCsrMask<CsrName::kFrm>, GetFlagsRegister()); + // Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, + // tmp2); + // bool successful = + // TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeSetExceptionsAndRoundTranslate>( + // &builder_, + // GetFlagsRegister(), + // tmp1, + // x86_64::kMachineRegRBP, + // static_cast<int>(offsetof(ThreadState, intrinsics_scratch_area)), + // tmp1); + // CHECK(successful); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(uint8_t imm) { + FeSetExceptionsImm(static_cast<int8_t>(imm & 0b1'1111)); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(Register arg) { + auto tmp = AllocTempReg(); + Gen<PseudoCopy>(tmp, arg, 1); + Gen<x86_64::AndlRegImm>(tmp, 0b1'1111, GetFlagsRegister()); + FeSetExceptions(tmp); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(uint8_t imm) { + Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP, + kCsrFieldOffset<CsrName::kFrm>, + static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>)); + FeSetRoundImm(static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>)); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(Register arg) { + // Use RCX as temporary register. We know it would be used by FeSetRound, too. 
+ auto tmp = AllocTempReg(); + Gen<PseudoCopy>(tmp, arg, 1); + Gen<x86_64::AndbRegImm>(tmp, kCsrMask<CsrName::kFrm>, GetFlagsRegister()); + Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, tmp); + FeSetRound(tmp); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(uint8_t imm) { + imm &= 0b11; + if (imm != 0b11) { + Gen<x86_64::AndbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister()); + } + if (imm != 0b00) { + Gen<x86_64::OrbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, imm, GetFlagsRegister()); + } +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(Register arg) { + Gen<x86_64::AndbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister()); + Gen<x86_64::AndbRegImm>(arg, 0b11, GetFlagsRegister()); + Gen<x86_64::OrbMemBaseDispReg>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, arg, GetFlagsRegister()); +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(uint8_t imm) { + if (imm & 0b1) { + Gen<x86_64::OrbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister()); + } else { + Gen<x86_64::AndbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister()); + } +} + +template <> +inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(Register arg) { + using Condition = x86_64::Assembler::Condition; + Gen<x86_64::AndbMemBaseDispImm>( + x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister()); + Gen<x86_64::TestbRegImm>(arg, 1, GetFlagsRegister()); + auto tmp = AllocTempReg(); + Gen<x86_64::SetccReg>(Condition::kNotZero, tmp, GetFlagsRegister()); + Gen<x86_64::MovzxbqRegReg>(tmp, tmp); + Gen<x86_64::ShlbRegImm>(tmp, int8_t{2}, GetFlagsRegister()); + Gen<x86_64::OrbMemBaseDispReg>( + x86_64::kMachineRegRBP, 
kCsrFieldOffset<CsrName::kVcsr>, tmp, GetFlagsRegister()); +} + } // namespace berberis #endif /* BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_ */ diff --git a/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h b/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h index 1e952a95..43b5e9c4 100644 --- a/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h +++ b/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h @@ -222,6 +222,10 @@ class TESTSUITE : public ::testing::Test { EXPECT_EQ(GetXReg<2>(state_.cpu), expected_fflags); } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || \ + defined(TESTING_HEAVY_OPTIMIZER) + void TestFrm(uint32_t insn_bytes, uint8_t frm_to_set, uint8_t expected_rm) { auto code_start = ToGuestAddr(&insn_bytes); state_.cpu.insn_addr = code_start; @@ -362,6 +366,10 @@ class TESTSUITE : public ::testing::Test { } } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || \ + defined(TESTING_HEAVY_OPTIMIZER) + void TestAmo(uint32_t insn_bytes, uint64_t arg1, uint64_t arg2, @@ -390,6 +398,10 @@ class TESTSUITE : public ::testing::Test { expected_memory); } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || + // defined(TESTING_HEAVY_OPTIMIZER) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) + template <typename... Types> void TestFmvFloatToInteger(uint32_t insn_bytes, std::initializer_list<std::tuple<Types...>> args) { @@ -1135,6 +1147,10 @@ TEST_F(TESTSUITE, CJalr) { // Tests for Non-Compressed Instructions. 
+#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || \ + defined(TESTING_HEAVY_OPTIMIZER) + TEST_F(TESTSUITE, CsrInstructions) { ScopedRoundingMode scoped_rounding_mode; // Csrrw x2, frm, 2 @@ -1145,6 +1161,10 @@ TEST_F(TESTSUITE, CsrInstructions) { TestFrm(0x0020f173, 0, 0); } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || + // defined(TESTING_HEAVY_OPTIMIZER) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) + TEST_F(TESTSUITE, FCsrRegister) { fenv_t saved_environment; EXPECT_EQ(fegetenv(&saved_environment), 0); @@ -1759,6 +1779,10 @@ TEST_F(TESTSUITE, FmaInstructions) { TestFma(0x223170cf, {std::tuple{1.0, 2.0, 3.0, -5.0}}); } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || \ + defined(TESTING_HEAVY_OPTIMIZER) + TEST_F(TESTSUITE, AmoInstructions) { // Verifying that all aq and rl combinations work for Amoswap, but only test relaxed one for most // other instructions for brevity. @@ -1800,6 +1824,10 @@ TEST_F(TESTSUITE, AmoInstructions) { TestAmo(0xe03120af, 0xe03130af, 0xffff'eeee'dddd'ccccULL); } +#endif // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) || + // defined(TESTING_HEAVY_OPTIMIZER) +#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR) + TEST_F(TESTSUITE, OpFpSingleInputInstructions) { // FSqrt.S TestOpFpSingleInput(0x580170d3, {std::tuple{4.0f, 2.0f}}); diff --git a/tests/run_host_tests.mk b/tests/run_host_tests.mk index e8e6a2b9..812ebc99 100644 --- a/tests/run_host_tests.mk +++ b/tests/run_host_tests.mk @@ -29,7 +29,10 @@ .PHONY: berberis_run_host_tests -berberis_all: berberis_host_tests_result +# TODO(b/295236834): Add berberis_host_tests_result to berberis_all once the tests pass in +# post-submit. 
They are currently failing due to unimplemented bit manipulation instructions in +# stock builds. +# berberis_all: berberis_host_tests_result test_dir := $(call intermediates-dir-for,PACKAGING,berberis_tests) |