From c1a367e7cf18df4f18a91246e25b439ec1457a44 Mon Sep 17 00:00:00 2001
From: Aman <amankumar2198@gmail.com>
Date: Tue, 18 Apr 2023 19:35:45 +0530
Subject: interp: added FAdd instruction.

Bug: 265372622

Test: berberis_host_tests/berberis_host_tests

Change-Id: I546086534d44fc09892d752dd1ed52008822d418
---
 decoder/include/berberis/decoder/riscv64/decoder.h |  45 +++++++++
 .../berberis/decoder/riscv64/semantics_player.h    |   7 ++
 interpreter/riscv64/fp_regs.h                      |  64 +++++++++++++
 interpreter/riscv64/interpreter.cc                 |  37 +++++++-
 interpreter/riscv64/interpreter_test.cc            |  22 +++++
 .../include/berberis/intrinsics/intrinsics_float.h |  14 +--
 .../riscv64_to_x86_64/intrinsics_float.h           | 101 +++++++++++++++++++++
 .../include/berberis/intrinsics/type_traits.h      |   6 ++
 8 files changed, 284 insertions(+), 12 deletions(-)
 create mode 100644 interpreter/riscv64/fp_regs.h
 create mode 100644 intrinsics/include/berberis/intrinsics/riscv64_to_x86_64/intrinsics_float.h
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h
index e91ee18f..1fda1316 100644
--- a/decoder/include/berberis/decoder/riscv64/decoder.h
+++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -194,6 +194,17 @@ class Decoder {
     kMaxAmoOpcode = 0b11111'111,
   };
 
+  enum class OpFpOpcode {
+    // Bit #2 = 1 means rm is an opcode extension.
+    // Bit #3 = 1 means rs2 is an opcode extension.
+    // Bits #4, #1, and #0 - actual opcode.
+    kFAdd = 0b0'0'0'00,
+    kFSub = 0b0'0'0'01,
+    kFMul = 0b0'0'0'10,
+    kFDiv = 0b0'0'0'11,
+    kMaxOpFpOpcode = 0b1'1'1'11,
+  };
+
   enum class LoadOpcode {
     kLb = 0b000,
     kLh = 0b001,
@@ -277,6 +288,14 @@ class Decoder {
     kMaxCsrRegister = 0b11'11'1111'1111,
   };
 
+  enum class FloatSize {
+    kFloat = 0b00,
+    kDouble = 0b01,
+    kHalf = 0b10,
+    kQuad = 0b11,
+    kMaxFloatSize = 0b11,
+  };
+
   struct AmoArgs {
     AmoOpcode opcode;
     uint8_t dst;
@@ -379,6 +398,15 @@ class Decoder {
   using StoreArgs = StoreArgsTemplate<StoreOpcode>;
   using StoreFpArgs = StoreArgsTemplate<StoreFpOpcode>;
 
+  struct OpFpArgs {
+    OpFpOpcode opcode;
+    FloatSize float_size;
+    uint8_t dst;
+    uint8_t src1;
+    uint8_t src2;
+    uint8_t rm;
+  };
+
   struct BranchArgs {
     BranchOpcode opcode;
     uint8_t src1;
@@ -564,6 +592,9 @@ class Decoder {
       case BaseOpcode::kOpImm32:
         DecodeOp<OpImm32Opcode, ShiftImm32Opcode, 5>();
         break;
+      case BaseOpcode::kOpFp:
+        DecodeOpFp();
+        break;
       case BaseOpcode::kStore:
         DecodeStore<StoreOpcode>();
         break;
@@ -808,6 +839,20 @@ class Decoder {
     insn_consumer_->JumpAndLink(args);
   }
 
+  void DecodeOpFp() {
+    uint8_t float_size = GetBits<uint8_t, 25, 2>();
+    uint8_t opcode_bits = GetBits<uint8_t, 27, 5>();
+    const OpFpArgs args = {
+        .opcode = OpFpOpcode(opcode_bits),
+        .float_size = FloatSize(float_size),
+        .dst = GetBits<uint8_t, 7, 5>(),
+        .src1 = GetBits<uint8_t, 15, 5>(),
+        .src2 = GetBits<uint8_t, 20, 5>(),
+        .rm = GetBits<uint8_t, 12, 3>(),
+    };
+    insn_consumer_->OpFp(args);
+  }
+
   void DecodeSystem() {
     uint8_t low_opcode = GetBits<uint8_t, 12, 2>();
     if (low_opcode == 0b00) {
diff --git a/decoder/include/berberis/decoder/riscv64/semantics_player.h b/decoder/include/berberis/decoder/riscv64/semantics_player.h
index 40b559d3..e320b7ce 100644
--- a/decoder/include/berberis/decoder/riscv64/semantics_player.h
+++ b/decoder/include/berberis/decoder/riscv64/semantics_player.h
@@ -135,6 +135,13 @@ class SemanticsPlayer {
     SetRegOrIgnore(args.dst, result);
   };
 
+  void OpFp(const typename Decoder::OpFpArgs& args) {
+    FpRegister arg1 = GetFpReg(args.src1);
+    FpRegister arg2 = GetFpReg(args.src2);
+    FpRegister result = listener_->OpFp(args.opcode, args.float_size, args.rm, arg1, arg2);
+    SetFpReg(args.dst, result);
+  }
+
   void Store(const typename Decoder::StoreArgs& args) {
     Register arg = GetRegOrZero(args.src);
     Register data = GetRegOrZero(args.data);
diff --git a/interpreter/riscv64/fp_regs.h b/interpreter/riscv64/fp_regs.h
new file mode 100644
index 00000000..dd49f19d
--- /dev/null
+++ b/interpreter/riscv64/fp_regs.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BERBERIS_FP_REGS_H_
+#define BERBERIS_FP_REGS_H_
+
+#include <cstring>
+
+#include "berberis/base/bit_util.h"
+#include "berberis/intrinsics/intrinsics_float.h"
+
+namespace berberis {
+
+template <typename FloatType>
+inline FloatType NanUnboxFPRegToFloat(uint64_t arg);
+
+template <>
+inline intrinsics::Float32 NanUnboxFPRegToFloat(uint64_t arg) {
+  // Apart from transfer operations (e.g. loads and stores), all other floating-point operations on
+  // narrower n-bit operations, n < FLEN, check if the input operands are correctly NaN-boxed, i.e.,
+  // all upper FLEN−n bits are 1. If so, the n least-significant bits of the input are used as the
+  // input value, otherwise the input value is treated as an n-bit canonical NaN.
+  if ((arg & 0xffff'ffff'0000'0000) != 0xffff'ffff'0000'0000) {
+    return bit_cast<intrinsics::Float32>(0x7fc00000);
+  }
+  intrinsics::Float32 result;
+  memcpy(&result, &arg, sizeof(intrinsics::Float32));
+  return result;
+}
+
+template <>
+inline intrinsics::Float64 NanUnboxFPRegToFloat(uint64_t arg) {
+  return bit_cast<intrinsics::Float64>(arg);
+}
+
+template <typename FloatType>
+inline uint64_t NanBoxFloatToFPReg(FloatType arg);
+
+template <>
+inline uint64_t NanBoxFloatToFPReg(intrinsics::Float32 arg) {
+  return bit_cast<uint32_t>(arg) | 0xffff'ffff'0000'0000;
+}
+
+template <>
+inline uint64_t NanBoxFloatToFPReg(intrinsics::Float64 arg) {
+  return bit_cast<uint64_t>(arg);
+}
+
+}  // namespace berberis
+
+#endif  // BERBERIS_FP_REGS_H_
diff --git a/interpreter/riscv64/interpreter.cc b/interpreter/riscv64/interpreter.cc
index f4e82fea..a8fcbf61 100644
--- a/interpreter/riscv64/interpreter.cc
+++ b/interpreter/riscv64/interpreter.cc
@@ -28,10 +28,11 @@
 #include "berberis/decoder/riscv64/semantics_player.h"
 #include "berberis/guest_state/guest_addr.h"
 #include "berberis/guest_state/guest_state_riscv64.h"
-#include "berberis/intrinsics/riscv64/guest_fpstate.h"
+#include "berberis/intrinsics/riscv64_to_x86_64/intrinsics_float.h"
 #include "berberis/kernel_api/run_guest_syscall.h"
 
 #include "atomics.h"
+#include "fp_regs.h"
 
 namespace berberis {
 
@@ -42,6 +43,8 @@ class Interpreter {
   using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
   using Register = uint64_t;
   using FpRegister = uint64_t;
+  using Float32 = intrinsics::Float32;
+  using Float64 = intrinsics::Float64;
 
   explicit Interpreter(ThreadState* state) : state_(state), branch_taken_(false) {}
 
@@ -352,6 +355,38 @@ class Interpreter {
     return RunGuestSyscall(syscall_nr, arg0, arg1, arg2, arg3, arg4, arg5);
   }
 
+  FpRegister OpFp(Decoder::OpFpOpcode opcode,
+                  Decoder::FloatSize float_size,
+                  uint8_t rm,
+                  FpRegister arg1,
+                  FpRegister arg2) {
+    switch (float_size) {
+      case Decoder::FloatSize::kFloat:
+        return NanBoxFloatToFPReg(OpFp<Float32>(
+            opcode, rm, NanUnboxFPRegToFloat<Float32>(arg1), NanUnboxFPRegToFloat<Float32>(arg2)));
+      case Decoder::FloatSize::kDouble:
+        return NanBoxFloatToFPReg(OpFp<Float64>(
+            opcode, rm, NanUnboxFPRegToFloat<Float64>(arg1), NanUnboxFPRegToFloat<Float64>(arg2)));
+      default:
+        Unimplemented();
+        return {};
+    }
+  }
+
+  // TODO(b/278812060): switch to intrinsics when they become available and stop using
+  // ExecuteFloatOperation directly.
+  template <typename FloatType>
+  FloatType OpFp(Decoder::OpFpOpcode opcode, uint8_t rm, FloatType arg1, FloatType arg2) {
+    switch (opcode) {
+      case Decoder::OpFpOpcode::kFAdd:
+        return intrinsics::ExecuteFloatOperation<FloatType>(
+            rm, state_->cpu.frm, [](auto x, auto y) { return x + y; }, arg1, arg2);
+      default:
+        Unimplemented();
+        return {};
+    }
+  }
+
   Register ShiftImm(Decoder::ShiftImmOpcode opcode, Register arg, uint16_t imm) {
     switch (opcode) {
       case Decoder::ShiftImmOpcode::kSlli:
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index c3d2b5f1..0d5c344a 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -102,6 +102,17 @@ class Riscv64InterpreterTest : public ::testing::Test {
     }
   }
 
+  void InterpretOpFp(uint32_t insn_bytes,
+                     std::initializer_list<std::tuple<uint64_t, uint64_t, uint64_t>> args) {
+    for (auto [arg1, arg2, expected_result] : args) {
+      state_.cpu.insn_addr = ToGuestAddr(&insn_bytes);
+      SetFReg<2>(state_.cpu, arg1);
+      SetFReg<3>(state_.cpu, arg2);
+      InterpretInsn(&state_);
+      EXPECT_EQ(GetFReg<1>(state_.cpu), expected_result);
+    }
+  }
+
   void InterpretFence(uint32_t insn_bytes) {
     state_.cpu.insn_addr = ToGuestAddr(&insn_bytes);
     InterpretInsn(&state_);
@@ -609,6 +620,17 @@ TEST_F(Riscv64InterpreterTest, OpImm32Instructions) {
   InterpretOpImm(0x4001509b, {{0x0000'0000'f000'0000ULL, 12, 0xffff'ffff'ffff'0000ULL}});
 }
 
+TEST_F(Riscv64InterpreterTest, OpFpInstructions) {
+  // FAdd.S
+  InterpretOpFp(0x003100d3,
+                {{bit_cast<uint32_t>(1.0f) | 0xffff'ffff'0000'0000,
+                  bit_cast<uint32_t>(2.0f) | 0xffff'ffff'0000'0000,
+                  bit_cast<uint32_t>(3.0f) | 0xffff'ffff'0000'0000}});
+  // FAdd.D
+  InterpretOpFp(0x023100d3,
+                {{bit_cast<uint64_t>(1.0), bit_cast<uint64_t>(2.0), bit_cast<uint64_t>(3.0)}});
+}
+
 TEST_F(Riscv64InterpreterTest, LoadInstructions) {
   // Offset is always 8.
   // Lbu
diff --git a/intrinsics/include/berberis/intrinsics/intrinsics_float.h b/intrinsics/include/berberis/intrinsics/intrinsics_float.h
index 30f83e55..694afba9 100644
--- a/intrinsics/include/berberis/intrinsics/intrinsics_float.h
+++ b/intrinsics/include/berberis/intrinsics/intrinsics_float.h
@@ -64,23 +64,15 @@ class WrappedFloatType {
   explicit constexpr operator uint32_t() const { return value_; }
   explicit constexpr operator int64_t() const { return value_; }
   explicit constexpr operator uint64_t() const { return value_; }
-
-  auto BitCastToIntOfSameSize() {
-    if constexpr (std::is_same_v<BaseType, float>) {
-      return bit_cast<int32_t>(value_);
-    } else {
-      static_assert(std::is_same_v<BaseType, double>, "Only float and double BaseType supported.");
-      return bit_cast<int64_t>(value_);
-    }
-  }
-
-  // Only valid for BaseType==double. Returns the bit representation of the fp value.
   explicit constexpr operator WrappedFloatType<float>() const {
     return WrappedFloatType<float>(value_);
   }
   explicit constexpr operator WrappedFloatType<double>() const {
     return WrappedFloatType<double>(value_);
   }
+#if defined(__i386__) || defined(__x86_64__)
+  explicit constexpr operator long double() const { return value_; }
+#endif
   // Note: we don't provide unary operator-.  That's done on purpose: with floats -x and 0.-x
   // produce different results which could be surprising.  Use fneg instead of unary operator-.
   friend WrappedFloatType operator+(const WrappedFloatType& v1, const WrappedFloatType& v2);
diff --git a/intrinsics/include/berberis/intrinsics/riscv64_to_x86_64/intrinsics_float.h b/intrinsics/include/berberis/intrinsics/riscv64_to_x86_64/intrinsics_float.h
new file mode 100644
index 00000000..db8ff249
--- /dev/null
+++ b/intrinsics/include/berberis/intrinsics/riscv64_to_x86_64/intrinsics_float.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BERBERIS_INTRINSICS_RISCV64_TO_X86_64_INTRINSICS_FLOAT_H_
+#define BERBERIS_INTRINSICS_RISCV64_TO_X86_64_INTRINSICS_FLOAT_H_
+
+#include <limits>
+
+#include "berberis/base/bit_util.h"
+#include "berberis/intrinsics/intrinsics_float.h"
+#include "berberis/intrinsics/riscv64/guest_fpstate.h"  // ScopedRoundingMode
+#include "berberis/intrinsics/type_traits.h"
+
+namespace berberis::intrinsics {
+
+// The x86 architecture doesn't support RMM (aka FE_TIESAWAY), but it can be easily emulated since
+// it has support for 80-bit floats: if calculations are done with one bit (or more) of extra
+// precision in the FE_TOWARDZERO mode then we can easily adjust the fraction part and only need to
+// remember that this addition may overflow.
+template <typename FloatType, typename OperationType, typename... Args>
+inline auto ExecuteFloatOperationRmm(OperationType operation, Args... args)
+    -> std::enable_if_t<(std::is_same_v<Args, FloatType> && ...), FloatType> {
+  using Wide = typename TypeTraits<FloatType>::Wide;
+  Wide wide_result = operation(Wide(args)...);
+  if constexpr (std::is_same_v<FloatType, Float32>) {
+    // In the 32bit->64bit case everything happens almost automatically, we just need to clear low
+    // bits to ensure that we are getting ±∞ and not NaN.
+    auto int_result = bit_cast<std::make_unsigned_t<typename TypeTraits<Wide>::Int>>(wide_result);
+    if ((int_result & 0x7ff0'0000'0000'0000) == 0x7ff0'0000'0000'0000) {
+      return FloatType(wide_result);
+    }
+    int_result += 0x0000'0000'1000'0000;
+    int_result &= 0xffff'ffff'e000'0000;
+    wide_result = bit_cast<Wide>(int_result);
+  } else if constexpr (std::is_same_v<FloatType, Float64>) {
+    // In 64bit->80bit case we need to adjust significand bits to ensure we are creating ±∞ and not
+    // pseudo-infinity (supported on 8087/80287, but not on modern CPUs).
+    struct {
+      uint64_t significand;
+      uint16_t exponent;
+      uint8_t padding[sizeof(Wide) - sizeof(uint64_t) - sizeof(uint16_t)];
+    } fp80_parts;
+    static_assert(sizeof fp80_parts == sizeof(Wide));
+    memcpy(&fp80_parts, &wide_result, sizeof(wide_result));
+    // Don't try to round ±∞, NaNs and ±0 (denormals are not supported by RISC-V).
+    if ((fp80_parts.exponent & 0x7fff) == 0x7fff ||
+        (fp80_parts.significand & 0x8000'0000'0000'0000) == 0) {
+      return FloatType(wide_result);
+    }
+    fp80_parts.significand += 0x0000'0000'0000'0400;
+    fp80_parts.significand &= 0xffff'ffff'ffff'f800;
+    if (fp80_parts.significand == 0) {
+      fp80_parts.exponent++;
+      fp80_parts.significand = 0x8000'0000'0000'0000;
+    }
+    memcpy(&wide_result, &fp80_parts, sizeof(wide_result));
+  }
+  return FloatType(wide_result);
+}
+
+// Note: the first round of rm/frm verification must happen before this function is called, because
+// RISC-V specifies that an invalid rm or frm triggers an illegal-instruction exception.
+// Here we can assume both rm and frm fields are valid.
+template <typename FloatType, typename OperationType, typename... Args>
+inline auto ExecuteFloatOperation(uint8_t requested_rm,
+                                  uint8_t current_rm,
+                                  OperationType operation,
+                                  Args... args)
+    -> std::enable_if_t<(std::is_same_v<Args, FloatType> && ...), FloatType> {
+  int host_requested_rm = ToHostRoundingMode(requested_rm);
+  int host_current_rm = ToHostRoundingMode(current_rm);
+  if (requested_rm == FPFlags::DYN || host_requested_rm == host_current_rm) {
+    uint8_t rm = requested_rm == FPFlags::DYN ? current_rm : requested_rm;
+    if (rm == FPFlags::RMM) {
+      return ExecuteFloatOperationRmm<FloatType>(operation, args...);
+    }
+    return operation(args...);
+  }
+  ScopedRoundingMode scoped_rounding_mode{host_requested_rm};
+  if (requested_rm == FPFlags::RMM) {
+    return ExecuteFloatOperationRmm<FloatType>(operation, args...);
+  }
+  return operation(args...);
+}
+
+}  // namespace berberis::intrinsics
+
+#endif  // BERBERIS_INTRINSICS_RISCV64_TO_X86_64_INTRINSICS_FLOAT_H_
diff --git a/intrinsics/include/berberis/intrinsics/type_traits.h b/intrinsics/include/berberis/intrinsics/type_traits.h
index f1f6f75d..2785b92d 100644
--- a/intrinsics/include/berberis/intrinsics/type_traits.h
+++ b/intrinsics/include/berberis/intrinsics/type_traits.h
@@ -93,11 +93,17 @@ struct TypeTraits<int64_t> {
 template <>
 struct TypeTraits<intrinsics::Float32> {
   using Int = int32_t;
+  using Wide = intrinsics::Float64;
 };
 
 template <>
 struct TypeTraits<intrinsics::Float64> {
   using Int = int64_t;
+  using Narrow = intrinsics::Float32;
+#if defined(__i386__) || defined(__x86_64__)
+  static_assert(sizeof(long double) > sizeof(intrinsics::Float64));
+  using Wide = long double;
+#endif
 };
 
 #if defined(__x86_64__)
-- 
cgit v1.2.3