/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "mozilla/Casting.h"

#include "jsmath.h"

#include "jit/JitFrames.h"
#include "jit/MacroAssembler.h"
#include "js/ScalarType.h"  // js::Scalar::Type

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

// Note: this function clobbers the input register.
void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
  ScratchDoubleScope scratch(*this);
  MOZ_ASSERT(input != scratch);
  Label positive, done;

  // <= 0 or NaN --> 0
  zeroDouble(scratch);
  branchDouble(DoubleGreaterThan, input, scratch, &positive);
  {
    move32(Imm32(0), output);
    jump(&done);
  }

  bind(&positive);

  if (HasRoundInstruction(RoundingMode::NearestTiesToEven)) {
    // Round input to nearest integer.
    nearbyIntDouble(RoundingMode::NearestTiesToEven, input, input);

    // Truncate to int32 and ensure the result <= 255. This relies on the
    // processor setting output to a value > 255 for doubles outside the int32
    // range (for instance 0x80000000).
    vcvttsd2si(input, output);
    branch32(Assembler::BelowOrEqual, output, Imm32(255), &done);
    move32(Imm32(255), output);
  } else {
    Label outOfRange;

    // Truncate to int32 and ensure the result <= 255. This relies on the
    // processor setting output to a value > 255 for doubles outside the int32
    // range (for instance 0x80000000).
    vcvttsd2si(input, output);
    branch32(Assembler::AboveOrEqual, output, Imm32(255), &outOfRange);
    {
      // Check if we had a tie.
      convertInt32ToDouble(output, scratch);
      subDouble(scratch, input);

      loadConstantDouble(0.5, scratch);

      Label roundUp;
      vucomisd(scratch, input);
      j(Above, &roundUp);
      j(NotEqual, &done);

      // It was a tie. Round up if the output is odd.
      branchTest32(Zero, output, Imm32(1), &done);

      bind(&roundUp);
      add32(Imm32(1), output);
      jump(&done);
    }

    // > 255 --> 255
    bind(&outOfRange);
    move32(Imm32(255), output);
  }

  bind(&done);
}

bool MacroAssemblerX86Shared::buildOOLFakeExitFrame(void* fakeReturnAddr) {
  asMasm().PushFrameDescriptor(FrameType::IonJS);
  asMasm().Push(ImmPtr(fakeReturnAddr));
  asMasm().Push(FramePointer);
  return true;
}

void MacroAssemblerX86Shared::branchNegativeZero(FloatRegister reg,
                                                 Register scratch, Label* label,
                                                 bool maybeNonZero) {
  // Determines whether the low double contained in the XMM register reg
  // is equal to -0.0.

#if defined(JS_CODEGEN_X86)
  Label nonZero;

  // if not already compared to zero
  if (maybeNonZero) {
    ScratchDoubleScope scratchDouble(asMasm());

    // Compare to zero. Lets through {0, -0}.
    zeroDouble(scratchDouble);

    // If reg is non-zero, jump to nonZero.
    asMasm().branchDouble(DoubleNotEqual, reg, scratchDouble, &nonZero);
  }
  // Input register is either zero or negative zero. Retrieve sign of input.
  vmovmskpd(reg, scratch);

  // If reg is 1 or 3, input is negative zero.
  // If reg is 0 or 2, input is a normal zero.
  asMasm().branchTest32(NonZero, scratch, Imm32(1), label);

  bind(&nonZero);
#elif defined(JS_CODEGEN_X64)
  vmovq(reg, scratch);
  cmpq(Imm32(1), scratch);
  j(Overflow, label);
#endif
}

void MacroAssemblerX86Shared::branchNegativeZeroFloat32(FloatRegister reg,
                                                        Register scratch,
                                                        Label* label) {
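  // The bit pattern of -0.0f is 0x80000000, i.e. INT32_MIN. cmp32 with 1
  // therefore raises the overflow flag only when scratch holds exactly that
  // pattern, so the branch below fires precisely for negative zero.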
  vmovd(reg, scratch);
  cmp32(scratch, Imm32(1));
  j(Overflow, label);
}

MacroAssembler& MacroAssemblerX86Shared::asMasm() {
  return *static_cast<MacroAssembler*>(this);
}

const MacroAssembler& MacroAssemblerX86Shared::asMasm() const {
  return *static_cast<const MacroAssembler*>(this);
}

template <class T, class Map>
T* MacroAssemblerX86Shared::getConstant(const typename T::Pod& value, Map& map,
                                        Vector<T, 0, SystemAllocPolicy>& vec) {
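  // Return a pointer into the constant pool, reusing an existing entry when
  // this value has been requested before; otherwise append a new entry and
  // remember its index in the map.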
  using AddPtr = typename Map::AddPtr;
  size_t index;
  if (AddPtr p = map.lookupForAdd(value)) {
    index = p->value();
  } else {
    index = vec.length();
    enoughMemory_ &= vec.append(T(value));
    if (!enoughMemory_) {
      return nullptr;
    }
    enoughMemory_ &= map.add(p, value, index);
    if (!enoughMemory_) {
      return nullptr;
    }
  }
  return &vec[index];
}

MacroAssemblerX86Shared::Float* MacroAssemblerX86Shared::getFloat(float f) {
  return getConstant<Float, FloatMap>(f, floatMap_, floats_);
}

MacroAssemblerX86Shared::Double* MacroAssemblerX86Shared::getDouble(double d) {
  return getConstant<Double, DoubleMap>(d, doubleMap_, doubles_);
}

MacroAssemblerX86Shared::SimdData* MacroAssemblerX86Shared::getSimdData(
    const SimdConstant& v) {
  return getConstant<SimdData, SimdMap>(v, simdMap_, simds_);
}

void MacroAssemblerX86Shared::binarySimd128(
|
|
const SimdConstant& rhs, FloatRegister lhsDest,
|
|
void (MacroAssembler::*regOp)(const Operand&, FloatRegister, FloatRegister),
|
|
void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister)) {
|
|
ScratchSimd128Scope scratch(asMasm());
|
|
if (maybeInlineSimd128Int(rhs, scratch)) {
|
|
(asMasm().*regOp)(Operand(scratch), lhsDest, lhsDest);
|
|
} else {
|
|
(asMasm().*constOp)(rhs, lhsDest);
|
|
}
|
|
}
|
|
|
|
void MacroAssemblerX86Shared::binarySimd128(
|
|
FloatRegister lhs, const SimdConstant& rhs, FloatRegister dest,
|
|
void (MacroAssembler::*regOp)(const Operand&, FloatRegister, FloatRegister),
|
|
void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister,
|
|
FloatRegister)) {
|
|
ScratchSimd128Scope scratch(asMasm());
|
|
if (maybeInlineSimd128Int(rhs, scratch)) {
|
|
(asMasm().*regOp)(Operand(scratch), lhs, dest);
|
|
} else {
|
|
(asMasm().*constOp)(rhs, lhs, dest);
|
|
}
|
|
}
|
|
|
|
void MacroAssemblerX86Shared::binarySimd128(
|
|
const SimdConstant& rhs, FloatRegister lhs,
|
|
void (MacroAssembler::*regOp)(const Operand&, FloatRegister),
|
|
void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister)) {
|
|
ScratchSimd128Scope scratch(asMasm());
|
|
if (maybeInlineSimd128Int(rhs, scratch)) {
|
|
(asMasm().*regOp)(Operand(scratch), lhs);
|
|
} else {
|
|
(asMasm().*constOp)(rhs, lhs);
|
|
}
|
|
}
|
|
|
|
void MacroAssemblerX86Shared::bitwiseTestSimd128(const SimdConstant& rhs,
|
|
FloatRegister lhs) {
|
|
ScratchSimd128Scope scratch(asMasm());
|
|
if (maybeInlineSimd128Int(rhs, scratch)) {
|
|
vptest(scratch, lhs);
|
|
} else {
|
|
asMasm().vptestSimd128(rhs, lhs);
|
|
}
|
|
}
|
|
|
|
void MacroAssemblerX86Shared::minMaxDouble(FloatRegister first,
|
|
FloatRegister second, bool canBeNaN,
|
|
bool isMax) {
|
|
Label done, nan, minMaxInst;
|
|
|
|
// Do a vucomisd to catch equality and NaNs, which both require special
|
|
// handling. If the operands are ordered and inequal, we branch straight to
|
|
// the min/max instruction. If we wanted, we could also branch for less-than
|
|
// or greater-than here instead of using min/max, however these conditions
|
|
// will sometimes be hard on the branch predictor.
|
|
vucomisd(second, first);
|
|
j(Assembler::NotEqual, &minMaxInst);
|
|
if (canBeNaN) {
|
|
j(Assembler::Parity, &nan);
|
|
}
|
|
|
|
// Ordered and equal. The operands are bit-identical unless they are zero
|
|
// and negative zero. These instructions merge the sign bits in that
|
|
// case, and are no-ops otherwise.
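  // (For max, +0 AND -0 yields +0; for min, +0 OR -0 yields -0, which are
  // exactly the results required for these operand pairs.)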
|
|
if (isMax) {
|
|
vandpd(second, first, first);
|
|
} else {
|
|
vorpd(second, first, first);
|
|
}
|
|
jump(&done);
|
|
|
|
// x86's min/max are not symmetric; if either operand is a NaN, they return
|
|
// the read-only operand. We need to return a NaN if either operand is a
|
|
// NaN, so we explicitly check for a NaN in the read-write operand.
|
|
if (canBeNaN) {
|
|
bind(&nan);
|
|
vucomisd(first, first);
|
|
j(Assembler::Parity, &done);
|
|
}
|
|
|
|
// When the values are inequal, or second is NaN, x86's min and max will
|
|
// return the value we need.
|
|
bind(&minMaxInst);
|
|
if (isMax) {
|
|
vmaxsd(second, first, first);
|
|
} else {
|
|
vminsd(second, first, first);
|
|
}
|
|
|
|
bind(&done);
|
|
}
|
|
|
|
void MacroAssemblerX86Shared::minMaxFloat32(FloatRegister first,
|
|
FloatRegister second, bool canBeNaN,
|
|
bool isMax) {
|
|
Label done, nan, minMaxInst;
|
|
|
|
// Do a vucomiss to catch equality and NaNs, which both require special
|
|
// handling. If the operands are ordered and inequal, we branch straight to
|
|
// the min/max instruction. If we wanted, we could also branch for less-than
|
|
// or greater-than here instead of using min/max, however these conditions
|
|
// will sometimes be hard on the branch predictor.
|
|
vucomiss(second, first);
|
|
j(Assembler::NotEqual, &minMaxInst);
|
|
if (canBeNaN) {
|
|
j(Assembler::Parity, &nan);
|
|
}
|
|
|
|
// Ordered and equal. The operands are bit-identical unless they are zero
|
|
// and negative zero. These instructions merge the sign bits in that
|
|
// case, and are no-ops otherwise.
|
|
if (isMax) {
|
|
vandps(second, first, first);
|
|
} else {
|
|
vorps(second, first, first);
|
|
}
|
|
jump(&done);
|
|
|
|
// x86's min/max are not symmetric; if either operand is a NaN, they return
|
|
// the read-only operand. We need to return a NaN if either operand is a
|
|
// NaN, so we explicitly check for a NaN in the read-write operand.
|
|
if (canBeNaN) {
|
|
bind(&nan);
|
|
vucomiss(first, first);
|
|
j(Assembler::Parity, &done);
|
|
}
|
|
|
|
// When the values are inequal, or second is NaN, x86's min and max will
|
|
// return the value we need.
|
|
bind(&minMaxInst);
|
|
if (isMax) {
|
|
vmaxss(second, first, first);
|
|
} else {
|
|
vminss(second, first, first);
|
|
}
|
|
|
|
bind(&done);
|
|
}
|
|
|
|
#ifdef ENABLE_WASM_SIMD
|
|
bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
|
|
switch (op) {
|
|
case wasm::SimdOp::I8x16Shl:
|
|
case wasm::SimdOp::I8x16ShrU:
|
|
case wasm::SimdOp::I8x16ShrS:
|
|
*mask = 7;
|
|
break;
|
|
case wasm::SimdOp::I16x8Shl:
|
|
case wasm::SimdOp::I16x8ShrU:
|
|
case wasm::SimdOp::I16x8ShrS:
|
|
*mask = 15;
|
|
break;
|
|
case wasm::SimdOp::I32x4Shl:
|
|
case wasm::SimdOp::I32x4ShrU:
|
|
case wasm::SimdOp::I32x4ShrS:
|
|
*mask = 31;
|
|
break;
|
|
case wasm::SimdOp::I64x2Shl:
|
|
case wasm::SimdOp::I64x2ShrU:
|
|
case wasm::SimdOp::I64x2ShrS:
|
|
*mask = 63;
|
|
break;
|
|
default:
|
|
MOZ_CRASH("Unexpected shift operation");
|
|
}
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
//{{{ check_macroassembler_style
|
|
// ===============================================================
|
|
// MacroAssembler high-level usage.
|
|
|
|
void MacroAssembler::flush() {}
|
|
|
|
void MacroAssembler::comment(const char* msg) { masm.comment(msg); }
|
|
|
|
// This operation really consists of five phases, in order to enforce the
// restriction that on x86_shared, srcDest must be eax and edx will be
// clobbered.
//
//  Input: { rhs, lhsOutput }
//
//  [PUSH] Preserve registers
//  [MOVE] Generate moves to specific registers
//
//  [DIV] Input: { regForRhs, EAX }
//  [DIV] extend EAX into EDX
//  [DIV] x86 Division operator
//  [DIV] Output: { EAX, EDX }
//
//  [MOVE] Move specific registers to outputs
//  [POP] Restore registers
//
//  Output: { lhsOutput, remainderOutput }
|
|
void MacroAssembler::flexibleDivMod32(Register rhs, Register lhsOutput,
|
|
Register remOutput, bool isUnsigned,
|
|
const LiveRegisterSet&) {
|
|
// Currently this helper can't handle this situation.
|
|
MOZ_ASSERT(lhsOutput != rhs);
|
|
MOZ_ASSERT(lhsOutput != remOutput);
|
|
|
|
  // Choose a register that is neither edx nor eax to hold the rhs;
  // ebx is chosen arbitrarily, and will be preserved if necessary.
  Register regForRhs = (rhs == eax || rhs == edx) ? ebx : rhs;
|
|
|
|
// Add registers we will be clobbering as live, but
|
|
// also remove the set we do not restore.
|
|
LiveRegisterSet preserve;
|
|
preserve.add(edx);
|
|
preserve.add(eax);
|
|
preserve.add(regForRhs);
|
|
|
|
preserve.takeUnchecked(lhsOutput);
|
|
preserve.takeUnchecked(remOutput);
|
|
|
|
PushRegsInMask(preserve);
|
|
|
|
// Shuffle input into place.
|
|
moveRegPair(lhsOutput, rhs, eax, regForRhs);
|
|
|
|
// Sign extend eax into edx to make (edx:eax): idiv/udiv are 64-bit.
|
|
if (isUnsigned) {
|
|
mov(ImmWord(0), edx);
|
|
udiv(regForRhs);
|
|
} else {
|
|
cdq();
|
|
idiv(regForRhs);
|
|
}
|
|
|
|
moveRegPair(eax, edx, lhsOutput, remOutput);
|
|
|
|
PopRegsInMask(preserve);
|
|
}
|
|
|
|
void MacroAssembler::flexibleQuotient32(
|
|
Register rhs, Register srcDest, bool isUnsigned,
|
|
const LiveRegisterSet& volatileLiveRegs) {
|
|
// Choose an arbitrary register that isn't eax, edx, rhs or srcDest;
|
|
AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All());
|
|
regs.takeUnchecked(eax);
|
|
regs.takeUnchecked(edx);
|
|
regs.takeUnchecked(rhs);
|
|
regs.takeUnchecked(srcDest);
|
|
|
|
Register remOut = regs.takeAny();
|
|
push(remOut);
|
|
flexibleDivMod32(rhs, srcDest, remOut, isUnsigned, volatileLiveRegs);
|
|
pop(remOut);
|
|
}
|
|
|
|
void MacroAssembler::flexibleRemainder32(
|
|
Register rhs, Register srcDest, bool isUnsigned,
|
|
const LiveRegisterSet& volatileLiveRegs) {
|
|
// Choose an arbitrary register that isn't eax, edx, rhs or srcDest
|
|
AllocatableGeneralRegisterSet regs(GeneralRegisterSet::All());
|
|
regs.takeUnchecked(eax);
|
|
regs.takeUnchecked(edx);
|
|
regs.takeUnchecked(rhs);
|
|
regs.takeUnchecked(srcDest);
|
|
|
|
Register remOut = regs.takeAny();
|
|
push(remOut);
|
|
flexibleDivMod32(rhs, srcDest, remOut, isUnsigned, volatileLiveRegs);
|
|
mov(remOut, srcDest);
|
|
pop(remOut);
|
|
}
|
|
|
|
// ===============================================================
|
|
// Stack manipulation functions.
|
|
|
|
size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
|
|
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
|
|
return set.gprs().size() * sizeof(intptr_t) + fpuSet.getPushSizeInBytes();
|
|
}
|
|
|
|
void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
|
|
mozilla::DebugOnly<size_t> framePushedInitial = framePushed();
|
|
|
|
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
|
|
unsigned numFpu = fpuSet.size();
|
|
int32_t diffF = fpuSet.getPushSizeInBytes();
|
|
int32_t diffG = set.gprs().size() * sizeof(intptr_t);
|
|
|
|
// On x86, always use push to push the integer registers, as it's fast
|
|
// on modern hardware and it's a small instruction.
|
|
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
|
|
diffG -= sizeof(intptr_t);
|
|
Push(*iter);
|
|
}
|
|
MOZ_ASSERT(diffG == 0);
|
|
(void)diffG;
|
|
|
|
reserveStack(diffF);
|
|
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
|
|
FloatRegister reg = *iter;
|
|
diffF -= reg.size();
|
|
numFpu -= 1;
|
|
Address spillAddress(StackPointer, diffF);
|
|
if (reg.isDouble()) {
|
|
storeDouble(reg, spillAddress);
|
|
} else if (reg.isSingle()) {
|
|
storeFloat32(reg, spillAddress);
|
|
} else if (reg.isSimd128()) {
|
|
storeUnalignedSimd128(reg, spillAddress);
|
|
} else {
|
|
MOZ_CRASH("Unknown register type.");
|
|
}
|
|
}
|
|
MOZ_ASSERT(numFpu == 0);
|
|
(void)numFpu;
|
|
|
|
// x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
|
|
// GetPushSizeInBytes.
|
|
size_t alignExtra = ((size_t)diffF) % sizeof(uintptr_t);
|
|
MOZ_ASSERT_IF(sizeof(uintptr_t) == 8, alignExtra == 0 || alignExtra == 4);
|
|
MOZ_ASSERT_IF(sizeof(uintptr_t) == 4, alignExtra == 0);
|
|
diffF -= alignExtra;
|
|
MOZ_ASSERT(diffF == 0);
|
|
|
|
// The macroassembler will keep the stack sizeof(uintptr_t)-aligned, so
|
|
// we don't need to take into account `alignExtra` here.
|
|
MOZ_ASSERT(framePushed() - framePushedInitial ==
|
|
PushRegsInMaskSizeInBytes(set));
|
|
}
|
|
|
|
void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
|
|
Register) {
|
|
mozilla::DebugOnly<size_t> offsetInitial = dest.offset;
|
|
|
|
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
|
|
unsigned numFpu = fpuSet.size();
|
|
int32_t diffF = fpuSet.getPushSizeInBytes();
|
|
int32_t diffG = set.gprs().size() * sizeof(intptr_t);
|
|
|
|
MOZ_ASSERT(dest.offset >= diffG + diffF);
|
|
|
|
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
|
|
diffG -= sizeof(intptr_t);
|
|
dest.offset -= sizeof(intptr_t);
|
|
storePtr(*iter, dest);
|
|
}
|
|
MOZ_ASSERT(diffG == 0);
|
|
(void)diffG;
|
|
|
|
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
|
|
FloatRegister reg = *iter;
|
|
diffF -= reg.size();
|
|
numFpu -= 1;
|
|
dest.offset -= reg.size();
|
|
if (reg.isDouble()) {
|
|
storeDouble(reg, dest);
|
|
} else if (reg.isSingle()) {
|
|
storeFloat32(reg, dest);
|
|
} else if (reg.isSimd128()) {
|
|
storeUnalignedSimd128(reg, dest);
|
|
} else {
|
|
MOZ_CRASH("Unknown register type.");
|
|
}
|
|
}
|
|
MOZ_ASSERT(numFpu == 0);
|
|
(void)numFpu;
|
|
|
|
// x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
|
|
// GetPushSizeInBytes.
|
|
size_t alignExtra = ((size_t)diffF) % sizeof(uintptr_t);
|
|
MOZ_ASSERT_IF(sizeof(uintptr_t) == 8, alignExtra == 0 || alignExtra == 4);
|
|
MOZ_ASSERT_IF(sizeof(uintptr_t) == 4, alignExtra == 0);
|
|
diffF -= alignExtra;
|
|
MOZ_ASSERT(diffF == 0);
|
|
|
|
// What this means is: if `alignExtra` is nonzero, then the save area size
|
|
// actually used is `alignExtra` bytes smaller than what
|
|
// PushRegsInMaskSizeInBytes claims. Hence we need to compensate for that.
|
|
MOZ_ASSERT(alignExtra + offsetInitial - dest.offset ==
|
|
PushRegsInMaskSizeInBytes(set));
|
|
}
|
|
|
|
void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
|
|
LiveRegisterSet ignore) {
|
|
mozilla::DebugOnly<size_t> framePushedInitial = framePushed();
|
|
|
|
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
|
|
unsigned numFpu = fpuSet.size();
|
|
int32_t diffG = set.gprs().size() * sizeof(intptr_t);
|
|
int32_t diffF = fpuSet.getPushSizeInBytes();
|
|
const int32_t reservedG = diffG;
|
|
const int32_t reservedF = diffF;
|
|
|
|
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
|
|
FloatRegister reg = *iter;
|
|
diffF -= reg.size();
|
|
numFpu -= 1;
|
|
if (ignore.has(reg)) {
|
|
continue;
|
|
}
|
|
|
|
Address spillAddress(StackPointer, diffF);
|
|
if (reg.isDouble()) {
|
|
loadDouble(spillAddress, reg);
|
|
} else if (reg.isSingle()) {
|
|
loadFloat32(spillAddress, reg);
|
|
} else if (reg.isSimd128()) {
|
|
loadUnalignedSimd128(spillAddress, reg);
|
|
} else {
|
|
MOZ_CRASH("Unknown register type.");
|
|
}
|
|
}
|
|
freeStack(reservedF);
|
|
MOZ_ASSERT(numFpu == 0);
|
|
(void)numFpu;
|
|
  // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
  // GetPushSizeInBytes.
  diffF -= diffF % sizeof(uintptr_t);
  MOZ_ASSERT(diffF == 0);
|
|
|
|
// On x86, use pop to pop the integer registers, if we're not going to
|
|
// ignore any slots, as it's fast on modern hardware and it's a small
|
|
// instruction.
|
|
if (ignore.emptyGeneral()) {
|
|
for (GeneralRegisterForwardIterator iter(set.gprs()); iter.more(); ++iter) {
|
|
diffG -= sizeof(intptr_t);
|
|
Pop(*iter);
|
|
}
|
|
} else {
|
|
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more();
|
|
++iter) {
|
|
diffG -= sizeof(intptr_t);
|
|
if (!ignore.has(*iter)) {
|
|
loadPtr(Address(StackPointer, diffG), *iter);
|
|
}
|
|
}
|
|
freeStack(reservedG);
|
|
}
|
|
MOZ_ASSERT(diffG == 0);
|
|
|
|
MOZ_ASSERT(framePushedInitial - framePushed() ==
|
|
PushRegsInMaskSizeInBytes(set));
|
|
}
|
|
|
|
void MacroAssembler::Push(const Operand op) {
|
|
push(op);
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Push(Register reg) {
|
|
push(reg);
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Push(const Imm32 imm) {
|
|
push(imm);
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Push(const ImmWord imm) {
|
|
push(imm);
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Push(const ImmPtr imm) {
|
|
Push(ImmWord(uintptr_t(imm.value)));
|
|
}
|
|
|
|
void MacroAssembler::Push(const ImmGCPtr ptr) {
|
|
push(ptr);
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Push(FloatRegister t) {
|
|
push(t);
|
|
adjustFrame(sizeof(double));
|
|
}
|
|
|
|
void MacroAssembler::PushFlags() {
|
|
pushFlags();
|
|
adjustFrame(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Pop(const Operand op) {
|
|
pop(op);
|
|
implicitPop(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Pop(Register reg) {
|
|
pop(reg);
|
|
implicitPop(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::Pop(FloatRegister reg) {
|
|
pop(reg);
|
|
implicitPop(sizeof(double));
|
|
}
|
|
|
|
void MacroAssembler::Pop(const ValueOperand& val) {
|
|
popValue(val);
|
|
implicitPop(sizeof(Value));
|
|
}
|
|
|
|
void MacroAssembler::PopFlags() {
|
|
popFlags();
|
|
implicitPop(sizeof(intptr_t));
|
|
}
|
|
|
|
void MacroAssembler::PopStackPtr() { Pop(StackPointer); }
|
|
|
|
void MacroAssembler::freeStackTo(uint32_t framePushed) {
|
|
MOZ_ASSERT(framePushed <= framePushed_);
|
|
lea(Operand(FramePointer, -int32_t(framePushed)), StackPointer);
|
|
framePushed_ = framePushed;
|
|
}
|
|
|
|
// ===============================================================
|
|
// Simple call functions.
|
|
|
|
CodeOffset MacroAssembler::call(Register reg) { return Assembler::call(reg); }
|
|
|
|
CodeOffset MacroAssembler::call(Label* label) { return Assembler::call(label); }
|
|
|
|
void MacroAssembler::call(const Address& addr) {
|
|
Assembler::call(Operand(addr.base, addr.offset));
|
|
}
|
|
|
|
CodeOffset MacroAssembler::call(wasm::SymbolicAddress target) {
|
|
mov(target, eax);
|
|
return Assembler::call(eax);
|
|
}
|
|
|
|
void MacroAssembler::call(ImmWord target) { Assembler::call(target); }
|
|
|
|
void MacroAssembler::call(ImmPtr target) { Assembler::call(target); }
|
|
|
|
void MacroAssembler::call(JitCode* target) { Assembler::call(target); }
|
|
|
|
CodeOffset MacroAssembler::callWithPatch() {
|
|
return Assembler::callWithPatch();
|
|
}
|
|
void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
|
|
Assembler::patchCall(callerOffset, calleeOffset);
|
|
}
|
|
|
|
void MacroAssembler::callAndPushReturnAddress(Register reg) { call(reg); }
|
|
|
|
void MacroAssembler::callAndPushReturnAddress(Label* label) { call(label); }
|
|
|
|
// ===============================================================
|
|
// Patchable near/far jumps.
|
|
|
|
CodeOffset MacroAssembler::farJumpWithPatch() {
|
|
return Assembler::farJumpWithPatch();
|
|
}
|
|
|
|
void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
|
|
Assembler::patchFarJump(farJump, targetOffset);
|
|
}
|
|
|
|
void MacroAssembler::patchFarJump(uint8_t* farJump, uint8_t* target) {
|
|
Assembler::patchFarJump(farJump, target);
|
|
}
|
|
|
|
CodeOffset MacroAssembler::nopPatchableToCall() {
|
|
masm.nop_five();
|
|
return CodeOffset(currentOffset());
|
|
}
|
|
|
|
void MacroAssembler::patchNopToCall(uint8_t* callsite, uint8_t* target) {
|
|
Assembler::patchFiveByteNopToCall(callsite, target);
|
|
}
|
|
|
|
void MacroAssembler::patchCallToNop(uint8_t* callsite) {
|
|
Assembler::patchCallToFiveByteNop(callsite);
|
|
}
|
|
|
|
CodeOffset MacroAssembler::move32WithPatch(Register dest) {
|
|
movl(Imm32(-1), dest);
|
|
return CodeOffset(currentOffset());
|
|
}
|
|
|
|
void MacroAssembler::patchMove32(CodeOffset offset, Imm32 n) {
|
|
X86Encoding::SetInt32(masm.data() + offset.offset(), n.value);
|
|
}
|
|
|
|
// ===============================================================
|
|
// Jit Frames.
|
|
|
|
uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
|
|
CodeLabel cl;
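  // cl is bound immediately after the Push below, so the value pushed is the
  // code offset right after the push, i.e. it looks like the return address a
  // call would have left on the stack.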
|
|
|
|
mov(&cl, scratch);
|
|
Push(scratch);
|
|
bind(&cl);
|
|
uint32_t retAddr = currentOffset();
|
|
|
|
addCodeLabel(cl);
|
|
return retAddr;
|
|
}
|
|
|
|
// ===============================================================
|
|
// WebAssembly
|
|
|
|
FaultingCodeOffset MacroAssembler::wasmTrapInstruction() {
|
|
return FaultingCodeOffset(ud2().offset());
|
|
}
|
|
|
|
void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
|
|
Register boundsCheckLimit, Label* ok) {
|
|
cmp32(index, boundsCheckLimit);
|
|
j(cond, ok);
|
|
if (JitOptions.spectreIndexMasking) {
|
|
cmovCCl(cond, Operand(boundsCheckLimit), index);
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
|
|
Address boundsCheckLimit, Label* ok) {
|
|
cmp32(index, Operand(boundsCheckLimit));
|
|
j(cond, ok);
|
|
if (JitOptions.spectreIndexMasking) {
|
|
cmovCCl(cond, Operand(boundsCheckLimit), index);
|
|
}
|
|
}
|
|
|
|
// RAII class that generates the jumps to traps when it's destructed, to
|
|
// prevent some code duplication in the outOfLineWasmTruncateXtoY methods.
|
|
struct MOZ_RAII AutoHandleWasmTruncateToIntErrors {
|
|
MacroAssembler& masm;
|
|
Label inputIsNaN;
|
|
Label intOverflow;
|
|
wasm::BytecodeOffset off;
|
|
|
|
explicit AutoHandleWasmTruncateToIntErrors(MacroAssembler& masm,
|
|
wasm::BytecodeOffset off)
|
|
: masm(masm), off(off) {}
|
|
|
|
~AutoHandleWasmTruncateToIntErrors() {
|
|
// Handle errors. These cases are not in arbitrary order: code will
|
|
// fall through to intOverflow.
|
|
masm.bind(&intOverflow);
|
|
masm.wasmTrap(wasm::Trap::IntegerOverflow, off);
|
|
|
|
masm.bind(&inputIsNaN);
|
|
masm.wasmTrap(wasm::Trap::InvalidConversionToInteger, off);
|
|
}
|
|
};
|
|
|
|
void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input,
|
|
Register output,
|
|
bool isSaturating,
|
|
Label* oolEntry) {
|
|
vcvttsd2si(input, output);
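  // vcvttsd2si writes the "integer indefinite" value 0x80000000 (INT32_MIN)
  // when the input is NaN or outside the int32 range. Subtracting 1 from that
  // value is the only case that raises the overflow flag, so the cmp/jo pair
  // below sends every failure (and the legitimate INT32_MIN input, which the
  // OOL path re-checks) to oolEntry.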
|
|
cmp32(output, Imm32(1));
|
|
j(Assembler::Overflow, oolEntry);
|
|
}
|
|
|
|
void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input,
|
|
Register output,
|
|
bool isSaturating,
|
|
Label* oolEntry) {
|
|
vcvttss2si(input, output);
|
|
cmp32(output, Imm32(1));
|
|
j(Assembler::Overflow, oolEntry);
|
|
}
|
|
|
|
void MacroAssembler::oolWasmTruncateCheckF64ToI32(FloatRegister input,
|
|
Register output,
|
|
TruncFlags flags,
|
|
wasm::BytecodeOffset off,
|
|
Label* rejoin) {
|
|
bool isUnsigned = flags & TRUNC_UNSIGNED;
|
|
bool isSaturating = flags & TRUNC_SATURATING;
|
|
|
|
if (isSaturating) {
|
|
if (isUnsigned) {
|
|
// Negative overflow and NaN both are converted to 0, and the only
|
|
// other case is positive overflow which is converted to
|
|
// UINT32_MAX.
|
|
Label nonNegative;
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
|
|
&nonNegative);
|
|
move32(Imm32(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(&nonNegative);
|
|
move32(Imm32(UINT32_MAX), output);
|
|
} else {
|
|
// Negative overflow is already saturated to INT32_MIN, so we only
|
|
// have to handle NaN and positive overflow here.
|
|
Label notNaN;
|
|
branchDouble(Assembler::DoubleOrdered, input, input, ¬NaN);
|
|
move32(Imm32(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(¬NaN);
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleLessThan, input, fpscratch, rejoin);
|
|
sub32(Imm32(1), output);
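  // output still holds INT32_MIN (0x80000000) from the failed vcvttsd2si, so
  // subtracting 1 wraps it to INT32_MAX, the saturated result for positive
  // overflow.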
|
|
}
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
AutoHandleWasmTruncateToIntErrors traps(*this, off);
|
|
|
|
// Eagerly take care of NaNs.
|
|
branchDouble(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);
|
|
|
|
// For unsigned, fall through to intOverflow failure case.
|
|
if (isUnsigned) {
|
|
return;
|
|
}
|
|
|
|
// Handle special values.
|
|
|
|
// We've used vcvttsd2si. The only valid double values that can
|
|
// truncate to INT32_MIN are in ]INT32_MIN - 1; INT32_MIN].
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(double(INT32_MIN) - 1.0, fpscratch);
|
|
branchDouble(Assembler::DoubleLessThanOrEqual, input, fpscratch,
|
|
&traps.intOverflow);
|
|
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleGreaterThan, input, fpscratch,
|
|
&traps.intOverflow);
|
|
jump(rejoin);
|
|
}
|
|
|
|
void MacroAssembler::oolWasmTruncateCheckF32ToI32(FloatRegister input,
|
|
Register output,
|
|
TruncFlags flags,
|
|
wasm::BytecodeOffset off,
|
|
Label* rejoin) {
|
|
bool isUnsigned = flags & TRUNC_UNSIGNED;
|
|
bool isSaturating = flags & TRUNC_SATURATING;
|
|
|
|
if (isSaturating) {
|
|
if (isUnsigned) {
|
|
// Negative overflow and NaN both are converted to 0, and the only
|
|
// other case is positive overflow which is converted to
|
|
// UINT32_MAX.
|
|
Label nonNegative;
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(0.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
|
|
&nonNegative);
|
|
move32(Imm32(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(&nonNegative);
|
|
move32(Imm32(UINT32_MAX), output);
|
|
} else {
|
|
// Negative overflow is already saturated to INT32_MIN, so we only
|
|
// have to handle NaN and positive overflow here.
|
|
Label notNaN;
|
|
branchFloat(Assembler::DoubleOrdered, input, input, ¬NaN);
|
|
move32(Imm32(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(¬NaN);
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(0.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleLessThan, input, fpscratch, rejoin);
|
|
sub32(Imm32(1), output);
|
|
}
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
AutoHandleWasmTruncateToIntErrors traps(*this, off);
|
|
|
|
// Eagerly take care of NaNs.
|
|
branchFloat(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);
|
|
|
|
// For unsigned, fall through to intOverflow failure case.
|
|
if (isUnsigned) {
|
|
return;
|
|
}
|
|
|
|
// Handle special values.
|
|
|
|
  // We've used vcvttss2si. Check that the input wasn't
  // float(INT32_MIN), which is the only legitimate input that
  // would truncate to INT32_MIN.
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(float(INT32_MIN), fpscratch);
|
|
branchFloat(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
|
|
jump(rejoin);
|
|
}
|
|
|
|
void MacroAssembler::oolWasmTruncateCheckF64ToI64(FloatRegister input,
|
|
Register64 output,
|
|
TruncFlags flags,
|
|
wasm::BytecodeOffset off,
|
|
Label* rejoin) {
|
|
bool isUnsigned = flags & TRUNC_UNSIGNED;
|
|
bool isSaturating = flags & TRUNC_SATURATING;
|
|
|
|
if (isSaturating) {
|
|
if (isUnsigned) {
|
|
// Negative overflow and NaN both are converted to 0, and the only
|
|
// other case is positive overflow which is converted to
|
|
// UINT64_MAX.
|
|
Label positive;
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleGreaterThan, input, fpscratch, &positive);
|
|
move64(Imm64(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(&positive);
|
|
move64(Imm64(UINT64_MAX), output);
|
|
} else {
|
|
// Negative overflow is already saturated to INT64_MIN, so we only
|
|
// have to handle NaN and positive overflow here.
|
|
Label notNaN;
|
|
branchDouble(Assembler::DoubleOrdered, input, input, ¬NaN);
|
|
move64(Imm64(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(¬NaN);
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleLessThan, input, fpscratch, rejoin);
|
|
sub64(Imm64(1), output);
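  // As in the 32-bit case, output is expected to hold INT64_MIN here from the
  // failed conversion; subtracting 1 wraps it to INT64_MAX, the saturated
  // result for positive overflow.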
|
|
}
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
AutoHandleWasmTruncateToIntErrors traps(*this, off);
|
|
|
|
// Eagerly take care of NaNs.
|
|
branchDouble(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);
|
|
|
|
// Handle special values.
|
|
if (isUnsigned) {
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(0.0, fpscratch);
|
|
branchDouble(Assembler::DoubleGreaterThan, input, fpscratch,
|
|
&traps.intOverflow);
|
|
loadConstantDouble(-1.0, fpscratch);
|
|
branchDouble(Assembler::DoubleLessThanOrEqual, input, fpscratch,
|
|
&traps.intOverflow);
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
// We've used vcvtsd2sq. The only legit value whose i64
|
|
// truncation is INT64_MIN is double(INT64_MIN): exponent is so
|
|
// high that the highest resolution around is much more than 1.
|
|
ScratchDoubleScope fpscratch(*this);
|
|
loadConstantDouble(double(int64_t(INT64_MIN)), fpscratch);
|
|
branchDouble(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
|
|
jump(rejoin);
|
|
}
|
|
|
|
void MacroAssembler::oolWasmTruncateCheckF32ToI64(FloatRegister input,
|
|
Register64 output,
|
|
TruncFlags flags,
|
|
wasm::BytecodeOffset off,
|
|
Label* rejoin) {
|
|
bool isUnsigned = flags & TRUNC_UNSIGNED;
|
|
bool isSaturating = flags & TRUNC_SATURATING;
|
|
|
|
if (isSaturating) {
|
|
if (isUnsigned) {
|
|
// Negative overflow and NaN both are converted to 0, and the only
|
|
// other case is positive overflow which is converted to
|
|
// UINT64_MAX.
|
|
Label positive;
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(0.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleGreaterThan, input, fpscratch, &positive);
|
|
move64(Imm64(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(&positive);
|
|
move64(Imm64(UINT64_MAX), output);
|
|
} else {
|
|
// Negative overflow is already saturated to INT64_MIN, so we only
|
|
// have to handle NaN and positive overflow here.
|
|
Label notNaN;
|
|
branchFloat(Assembler::DoubleOrdered, input, input, ¬NaN);
|
|
move64(Imm64(0), output);
|
|
jump(rejoin);
|
|
|
|
bind(¬NaN);
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(0.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleLessThan, input, fpscratch, rejoin);
|
|
sub64(Imm64(1), output);
|
|
}
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
AutoHandleWasmTruncateToIntErrors traps(*this, off);
|
|
|
|
// Eagerly take care of NaNs.
|
|
branchFloat(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);
|
|
|
|
// Handle special values.
|
|
if (isUnsigned) {
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(0.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleGreaterThan, input, fpscratch,
|
|
&traps.intOverflow);
|
|
loadConstantFloat32(-1.0f, fpscratch);
|
|
branchFloat(Assembler::DoubleLessThanOrEqual, input, fpscratch,
|
|
&traps.intOverflow);
|
|
jump(rejoin);
|
|
return;
|
|
}
|
|
|
|
// We've used vcvtss2sq. See comment in outOfLineWasmTruncateDoubleToInt64.
|
|
ScratchFloat32Scope fpscratch(*this);
|
|
loadConstantFloat32(float(int64_t(INT64_MIN)), fpscratch);
|
|
branchFloat(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
|
|
jump(rejoin);
|
|
}
|
|
|
|
void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
|
|
ExitFrameType type) {
|
|
enterFakeExitFrame(cxreg, scratch, type);
|
|
}
|
|
|
|
CodeOffset MacroAssembler::sub32FromMemAndBranchIfNegativeWithPatch(
|
|
Address address, Label* label) {
|
|
// -128 is arbitrary, but makes `*address` count upwards, which may help
|
|
// to identify cases where the subsequent ::patch..() call was forgotten.
|
|
int numImmBytes = subl(Imm32(-128), Operand(address));
|
|
// This is vitally important for patching
|
|
MOZ_RELEASE_ASSERT(numImmBytes == 1);
|
|
// Points immediately after the location to patch
|
|
CodeOffset patchPoint = CodeOffset(currentOffset());
|
|
jSrc(Condition::Signed, label);
|
|
return patchPoint;
|
|
}
|
|
|
|
void MacroAssembler::patchSub32FromMemAndBranchIfNegative(CodeOffset offset,
|
|
Imm32 imm) {
|
|
int32_t val = imm.value;
|
|
// Patching it to zero would make the insn pointless
|
|
MOZ_RELEASE_ASSERT(val >= 1 && val <= 127);
|
|
uint8_t* ptr = (uint8_t*)masm.data() + offset.offset() - 1;
|
|
MOZ_RELEASE_ASSERT(*ptr == uint8_t(-128)); // as created above
|
|
*ptr = uint8_t(val) & 0x7F;
|
|
}
|
|
|
|
// ========================================================================
|
|
// Primitive atomic operations.
|
|
|
|
static void ExtendTo32(MacroAssembler& masm, Scalar::Type type, Register r) {
|
|
switch (type) {
|
|
case Scalar::Int8:
|
|
masm.movsbl(r, r);
|
|
break;
|
|
case Scalar::Uint8:
|
|
masm.movzbl(r, r);
|
|
break;
|
|
case Scalar::Int16:
|
|
masm.movswl(r, r);
|
|
break;
|
|
case Scalar::Uint16:
|
|
masm.movzwl(r, r);
|
|
break;
|
|
case Scalar::Int32:
|
|
case Scalar::Uint32:
|
|
break;
|
|
default:
|
|
MOZ_CRASH("unexpected type");
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
static inline bool IsByteReg(Register r) {
|
|
AllocatableGeneralRegisterSet byteRegs(Registers::SingleByteRegs);
|
|
return byteRegs.has(r);
|
|
}
|
|
|
|
static inline bool IsByteReg(Imm32 r) {
|
|
// Nothing
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
template <typename T>
|
|
static void CompareExchange(MacroAssembler& masm,
|
|
const wasm::MemoryAccessDesc* access,
|
|
Scalar::Type type, const T& mem, Register oldval,
|
|
Register newval, Register output) {
|
|
MOZ_ASSERT(output == eax);
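  // cmpxchg uses eax (al/ax for the narrow variants) as the implicit
  // expected-value and result register, which is why this helper requires
  // output to be eax.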
|
|
|
|
if (oldval != output) {
|
|
masm.movl(oldval, output);
|
|
}
|
|
|
|
if (access) {
|
|
masm.append(*access, wasm::TrapMachineInsn::Atomic,
|
|
FaultingCodeOffset(masm.currentOffset()));
|
|
}
|
|
|
|
// NOTE: the generated code must match the assembly code in gen_cmpxchg in
|
|
// GenerateAtomicOperations.py
|
|
switch (Scalar::byteSize(type)) {
|
|
case 1:
|
|
MOZ_ASSERT(IsByteReg(newval));
|
|
masm.lock_cmpxchgb(newval, Operand(mem));
|
|
break;
|
|
case 2:
|
|
masm.lock_cmpxchgw(newval, Operand(mem));
|
|
break;
|
|
case 4:
|
|
masm.lock_cmpxchgl(newval, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH("Invalid");
|
|
}
|
|
|
|
ExtendTo32(masm, type, output);
|
|
}
|
|
|
|
void MacroAssembler::compareExchange(Scalar::Type type, Synchronization,
|
|
const Address& mem, Register oldval,
|
|
Register newval, Register output) {
|
|
CompareExchange(*this, nullptr, type, mem, oldval, newval, output);
|
|
}
|
|
|
|
void MacroAssembler::compareExchange(Scalar::Type type, Synchronization,
|
|
const BaseIndex& mem, Register oldval,
|
|
Register newval, Register output) {
|
|
CompareExchange(*this, nullptr, type, mem, oldval, newval, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
|
|
const Address& mem, Register oldval,
|
|
Register newval, Register output) {
|
|
CompareExchange(*this, &access, access.type(), mem, oldval, newval, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
|
|
const BaseIndex& mem, Register oldval,
|
|
Register newval, Register output) {
|
|
CompareExchange(*this, &access, access.type(), mem, oldval, newval, output);
|
|
}
|
|
|
|
template <typename T>
|
|
static void AtomicExchange(MacroAssembler& masm,
|
|
const wasm::MemoryAccessDesc* access,
|
|
Scalar::Type type, const T& mem, Register value,
|
|
Register output)
|
|
// NOTE: the generated code must match the assembly code in gen_exchange in
|
|
// GenerateAtomicOperations.py
|
|
{
|
|
if (value != output) {
|
|
masm.movl(value, output);
|
|
}
|
|
|
|
if (access) {
|
|
masm.append(*access, wasm::TrapMachineInsn::Atomic,
|
|
FaultingCodeOffset(masm.currentOffset()));
|
|
}
|
|
|
|
switch (Scalar::byteSize(type)) {
|
|
case 1:
|
|
MOZ_ASSERT(IsByteReg(output));
|
|
masm.xchgb(output, Operand(mem));
|
|
break;
|
|
case 2:
|
|
masm.xchgw(output, Operand(mem));
|
|
break;
|
|
case 4:
|
|
masm.xchgl(output, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH("Invalid");
|
|
}
|
|
ExtendTo32(masm, type, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization,
|
|
const Address& mem, Register value,
|
|
Register output) {
|
|
AtomicExchange(*this, nullptr, type, mem, value, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization,
|
|
const BaseIndex& mem, Register value,
|
|
Register output) {
|
|
AtomicExchange(*this, nullptr, type, mem, value, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
|
|
const Address& mem, Register value,
|
|
Register output) {
|
|
AtomicExchange(*this, &access, access.type(), mem, value, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
|
|
const BaseIndex& mem, Register value,
|
|
Register output) {
|
|
AtomicExchange(*this, &access, access.type(), mem, value, output);
|
|
}
|
|
|
|
static void SetupValue(MacroAssembler& masm, AtomicOp op, Imm32 src,
|
|
Register output) {
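  // For Sub, load the negated value so that the caller's single lock xadd
  // computes mem + (-value), i.e. an atomic subtraction.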
|
|
if (op == AtomicOp::Sub) {
|
|
masm.movl(Imm32(-src.value), output);
|
|
} else {
|
|
masm.movl(src, output);
|
|
}
|
|
}
|
|
|
|
static void SetupValue(MacroAssembler& masm, AtomicOp op, Register src,
|
|
Register output) {
|
|
if (src != output) {
|
|
masm.movl(src, output);
|
|
}
|
|
if (op == AtomicOp::Sub) {
|
|
masm.negl(output);
|
|
}
|
|
}
|
|
|
|
static auto WasmTrapMachineInsn(Scalar::Type arrayType, AtomicOp op) {
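  // Add and Sub compile to a single lock xadd, an atomic read-modify-write,
  // while And/Or/Xor use a CAS loop whose first memory access is a plain load
  // of the operand size; the trap handler therefore sees a load instruction
  // for the bitwise operations.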
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
case AtomicOp::Sub:
|
|
return wasm::TrapMachineInsn::Atomic;
|
|
case AtomicOp::And:
|
|
case AtomicOp::Or:
|
|
case AtomicOp::Xor:
|
|
switch (arrayType) {
|
|
case Scalar::Int8:
|
|
case Scalar::Uint8:
|
|
return wasm::TrapMachineInsn::Load8;
|
|
case Scalar::Int16:
|
|
case Scalar::Uint16:
|
|
return wasm::TrapMachineInsn::Load16;
|
|
case Scalar::Int32:
|
|
case Scalar::Uint32:
|
|
return wasm::TrapMachineInsn::Load32;
|
|
default:
|
|
break;
|
|
}
|
|
[[fallthrough]];
|
|
default:
|
|
break;
|
|
}
|
|
MOZ_CRASH();
|
|
}
|
|
|
|
template <typename T, typename V>
|
|
static void AtomicFetchOp(MacroAssembler& masm,
|
|
const wasm::MemoryAccessDesc* access,
|
|
Scalar::Type arrayType, AtomicOp op, V value,
|
|
const T& mem, Register temp, Register output) {
|
|
// Note value can be an Imm or a Register.
|
|
|
|
// NOTE: the generated code must match the assembly code in gen_fetchop in
|
|
// GenerateAtomicOperations.py
|
|
|
|
// Setup the output register.
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
case AtomicOp::Sub:
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
MOZ_ASSERT_IF(Scalar::byteSize(arrayType) == 1,
|
|
IsByteReg(output) && IsByteReg(value));
|
|
|
|
SetupValue(masm, op, value, output);
|
|
break;
|
|
case AtomicOp::And:
|
|
case AtomicOp::Or:
|
|
case AtomicOp::Xor:
|
|
MOZ_ASSERT(output != temp && output == eax);
|
|
MOZ_ASSERT_IF(Scalar::byteSize(arrayType) == 1,
|
|
IsByteReg(output) && IsByteReg(temp));
|
|
|
|
// Bitwise operations don't require any additional setup.
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
|
|
auto lock_xadd = [&]() {
|
|
switch (arrayType) {
|
|
case Scalar::Int8:
|
|
case Scalar::Uint8:
|
|
masm.lock_xaddb(output, Operand(mem));
|
|
break;
|
|
case Scalar::Int16:
|
|
case Scalar::Uint16:
|
|
masm.lock_xaddw(output, Operand(mem));
|
|
break;
|
|
case Scalar::Int32:
|
|
case Scalar::Uint32:
|
|
masm.lock_xaddl(output, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
};
|
|
|
|
auto load = [&]() {
|
|
switch (arrayType) {
|
|
case Scalar::Int8:
|
|
case Scalar::Uint8:
|
|
masm.movzbl(Operand(mem), eax);
|
|
break;
|
|
case Scalar::Int16:
|
|
case Scalar::Uint16:
|
|
masm.movzwl(Operand(mem), eax);
|
|
break;
|
|
case Scalar::Int32:
|
|
case Scalar::Uint32:
|
|
masm.movl(Operand(mem), eax);
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
};
|
|
|
|
auto bitwiseOp = [&]() {
|
|
switch (op) {
|
|
case AtomicOp::And:
|
|
masm.andl(value, temp);
|
|
break;
|
|
case AtomicOp::Or:
|
|
masm.orl(value, temp);
|
|
break;
|
|
case AtomicOp::Xor:
|
|
masm.xorl(value, temp);
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
};
|
|
|
|
auto lock_cmpxchg = [&]() {
|
|
switch (arrayType) {
|
|
case Scalar::Int8:
|
|
case Scalar::Uint8:
|
|
masm.lock_cmpxchgb(temp, Operand(mem));
|
|
break;
|
|
case Scalar::Int16:
|
|
case Scalar::Uint16:
|
|
masm.lock_cmpxchgw(temp, Operand(mem));
|
|
break;
|
|
case Scalar::Int32:
|
|
case Scalar::Uint32:
|
|
masm.lock_cmpxchgl(temp, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
};
|
|
|
|
// Add trap instruction directly before the load.
|
|
if (access) {
|
|
masm.append(*access, WasmTrapMachineInsn(arrayType, op),
|
|
FaultingCodeOffset(masm.currentOffset()));
|
|
}
|
|
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
case AtomicOp::Sub:
|
|
// `add` and `sub` operations can be optimized with XADD.
|
|
lock_xadd();
|
|
|
|
ExtendTo32(masm, arrayType, output);
|
|
break;
|
|
|
|
case AtomicOp::And:
|
|
case AtomicOp::Or:
|
|
case AtomicOp::Xor: {
|
|
// Bitwise operations need a CAS loop.
|
|
|
|
// Load memory into eax.
|
|
load();
|
|
|
|
// Loop.
|
|
Label again;
|
|
masm.bind(&again);
|
|
masm.movl(eax, temp);
|
|
|
|
// temp = temp <op> value.
|
|
bitwiseOp();
|
|
|
|
// Compare and swap `temp` with memory.
|
|
lock_cmpxchg();
|
|
|
|
// Repeat if the comparison failed.
|
|
masm.j(MacroAssembler::NonZero, &again);
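  // lock cmpxchg compares eax against memory and, when the compare fails,
  // reloads eax with the value it observed, so the loop can retry without an
  // explicit re-load.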
|
|
|
|
// Sign-extend the zero-extended load.
|
|
if (Scalar::isSignedIntType(arrayType)) {
|
|
ExtendTo32(masm, arrayType, eax);
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Register value,
|
|
const BaseIndex& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Register value,
|
|
const Address& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Imm32 value,
|
|
const BaseIndex& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Imm32 value, const Address& mem,
|
|
Register temp, Register output) {
|
|
AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Register value,
|
|
const Address& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Imm32 value,
|
|
const Address& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Register value,
|
|
const BaseIndex& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Imm32 value,
|
|
const BaseIndex& mem, Register temp,
|
|
Register output) {
|
|
AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
|
|
}
|
|
|
|
template <typename T, typename V>
|
|
static void AtomicEffectOp(MacroAssembler& masm,
|
|
const wasm::MemoryAccessDesc* access,
|
|
Scalar::Type arrayType, AtomicOp op, V value,
|
|
const T& mem) {
|
|
if (access) {
|
|
masm.append(*access, wasm::TrapMachineInsn::Atomic,
|
|
FaultingCodeOffset(masm.currentOffset()));
|
|
}
|
|
|
|
switch (Scalar::byteSize(arrayType)) {
|
|
case 1:
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
masm.lock_addb(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Sub:
|
|
masm.lock_subb(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::And:
|
|
masm.lock_andb(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Or:
|
|
masm.lock_orb(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Xor:
|
|
masm.lock_xorb(value, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
break;
|
|
case 2:
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
masm.lock_addw(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Sub:
|
|
masm.lock_subw(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::And:
|
|
masm.lock_andw(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Or:
|
|
masm.lock_orw(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Xor:
|
|
masm.lock_xorw(value, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
break;
|
|
case 4:
|
|
switch (op) {
|
|
case AtomicOp::Add:
|
|
masm.lock_addl(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Sub:
|
|
masm.lock_subl(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::And:
|
|
masm.lock_andl(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Or:
|
|
masm.lock_orl(value, Operand(mem));
|
|
break;
|
|
case AtomicOp::Xor:
|
|
masm.lock_xorl(value, Operand(mem));
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
break;
|
|
default:
|
|
MOZ_CRASH();
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Register value,
|
|
const Address& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, &access, access.type(), op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Imm32 value,
|
|
const Address& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, &access, access.type(), op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Register value,
|
|
const BaseIndex& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, &access, access.type(), op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
|
|
AtomicOp op, Imm32 value,
|
|
const BaseIndex& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, &access, access.type(), op, value, mem);
|
|
}
|
|
|
|
// ========================================================================
|
|
// JS atomic operations.
|
|
|
|
template <typename T>
|
|
static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
|
|
Synchronization sync, const T& mem,
|
|
Register oldval, Register newval, Register temp,
|
|
AnyRegister output) {
|
|
if (arrayType == Scalar::Uint32) {
|
|
masm.compareExchange(arrayType, sync, mem, oldval, newval, temp);
|
|
masm.convertUInt32ToDouble(temp, output.fpu());
|
|
} else {
|
|
masm.compareExchange(arrayType, sync, mem, oldval, newval, output.gpr());
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
|
|
Synchronization sync, const Address& mem,
|
|
Register oldval, Register newval,
|
|
Register temp, AnyRegister output) {
|
|
CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
|
|
Synchronization sync,
|
|
const BaseIndex& mem, Register oldval,
|
|
Register newval, Register temp,
|
|
AnyRegister output) {
|
|
CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
|
|
}
|
|
|
|
template <typename T>
|
|
static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
|
|
Synchronization sync, const T& mem, Register value,
|
|
Register temp, AnyRegister output) {
|
|
if (arrayType == Scalar::Uint32) {
|
|
masm.atomicExchange(arrayType, sync, mem, value, temp);
|
|
masm.convertUInt32ToDouble(temp, output.fpu());
|
|
} else {
|
|
masm.atomicExchange(arrayType, sync, mem, value, output.gpr());
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
|
|
Synchronization sync, const Address& mem,
|
|
Register value, Register temp,
|
|
AnyRegister output) {
|
|
AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
|
|
Synchronization sync,
|
|
const BaseIndex& mem, Register value,
|
|
Register temp, AnyRegister output) {
|
|
AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
|
|
}
|
|
|
|
template <typename T>
|
|
static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op, Register value,
|
|
const T& mem, Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
if (arrayType == Scalar::Uint32) {
|
|
masm.atomicFetchOp(arrayType, sync, op, value, mem, temp2, temp1);
|
|
masm.convertUInt32ToDouble(temp1, output.fpu());
|
|
} else {
|
|
masm.atomicFetchOp(arrayType, sync, op, value, mem, temp1, output.gpr());
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op,
|
|
Register value, const Address& mem,
|
|
Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op,
|
|
Register value, const BaseIndex& mem,
|
|
Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Register value,
|
|
const BaseIndex& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Register value,
|
|
const Address& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
|
|
AtomicOp op, Imm32 value,
|
|
const Address& mem, Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
|
|
}
|
|
|
|
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op,
|
|
Imm32 value, const BaseIndex& mem,
|
|
Register temp) {
|
|
MOZ_ASSERT(temp == InvalidReg);
|
|
AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
|
|
}
|
|
|
|
template <typename T>
|
|
static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op, Imm32 value,
|
|
const T& mem, Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
if (arrayType == Scalar::Uint32) {
|
|
masm.atomicFetchOp(arrayType, sync, op, value, mem, temp2, temp1);
|
|
masm.convertUInt32ToDouble(temp1, output.fpu());
|
|
} else {
|
|
masm.atomicFetchOp(arrayType, sync, op, value, mem, temp1, output.gpr());
|
|
}
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op,
|
|
Imm32 value, const Address& mem,
|
|
Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
|
|
Synchronization sync, AtomicOp op,
|
|
Imm32 value, const BaseIndex& mem,
|
|
Register temp1, Register temp2,
|
|
AnyRegister output) {
|
|
AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
|
|
}
|
|
|
|
void MacroAssembler::atomicPause() { masm.pause(); }
|
|
|
|
// ========================================================================
|
|
// Spectre Mitigations.
|
|
|
|
void MacroAssembler::speculationBarrier() {
|
|
  // Spectre mitigations recommended by Intel and AMD suggest using lfence as
  // a way to force all speculative execution of instructions to end.
|
|
MOZ_ASSERT(HasSSE2());
|
|
masm.lfence();
|
|
}
|
|
|
|
void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
                                         Label* fail) {
  if (HasSSE41()) {
    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Round toward -Infinity.
    {
      ScratchFloat32Scope scratch(*this);
      vroundss(X86Encoding::RoundDown, src, scratch);
      truncateFloat32ToInt32(scratch, dest, fail);
    }
  } else {
    Label negative, end;

    // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
    {
      ScratchFloat32Scope scratch(*this);
      zeroFloat32(scratch);
      branchFloat(Assembler::DoubleLessThan, src, scratch, &negative);
    }

    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Input is non-negative, so truncation correctly rounds.
    truncateFloat32ToInt32(src, dest, fail);
    jump(&end);

    // Input is negative, but isn't -0.
    // Negative values go on a comparatively expensive path, since no
    // native rounding mode matches JS semantics. Still better than callVM.
    bind(&negative);
    {
      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttss2si instead of truncateFloat32ToInt32 because we
      // want to perform failure handling ourselves.
      vcvttss2si(src, dest);

      // Test whether the input float was integer-valued.
      {
        ScratchFloat32Scope scratch(*this);
        convertInt32ToFloat32(dest, scratch);
        branchFloat(Assembler::DoubleEqualOrUnordered, src, scratch, &end);
      }

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttss2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }

    bind(&end);
  }
}

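// The failure cases of the double flavour below mirror the float32 version:
// -0.0 fails because it cannot be represented as an int32, NaN and values
// outside the int32 range fail inside the truncation helpers (vcvttsd2si
// returns the indefinite value INT_MIN for them), and a subtraction that
// overflows on the negative slow path also reports failure.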
void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  if (HasSSE41()) {
    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail);

    // Round toward -Infinity.
    {
      ScratchDoubleScope scratch(*this);
      vroundsd(X86Encoding::RoundDown, src, scratch);
      truncateDoubleToInt32(scratch, dest, fail);
    }
  } else {
    Label negative, end;

    // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
    {
      ScratchDoubleScope scratch(*this);
      zeroDouble(scratch);
      branchDouble(Assembler::DoubleLessThan, src, scratch, &negative);
    }

    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail);

    // Input is non-negative, so truncation correctly rounds.
    truncateDoubleToInt32(src, dest, fail);
    jump(&end);

    // Input is negative, but isn't -0.
    // Negative values go on a comparatively expensive path, since no
    // native rounding mode matches JS semantics. Still better than callVM.
    bind(&negative);
    {
      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttsd2si instead of truncateDoubleToInt32 because we
      // want to perform failure handling ourselves.
      vcvttsd2si(src, dest);

      // Test whether the input double was integer-valued.
      {
        ScratchDoubleScope scratch(*this);
        convertInt32ToDouble(dest, scratch);
        branchDouble(Assembler::DoubleEqualOrUnordered, src, scratch, &end);
      }

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttsd2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }

    bind(&end);
  }
}

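// Worked example for the non-SSE4.1 path below: ceil(2.3f) truncates to 2,
// the convert-back comparison shows 2.3 was not integer-valued, and adding 1
// gives 3 == ceil(2.3). For ceil(-2.3f) we take the lessThanOrEqualMinusOne
// path, where truncation toward zero already yields -2 == ceil(-2.3).
// Negative inputs in ]-1, -0] would produce -0 and are rejected up front via
// the sign bit extracted by vmovmskps.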
void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  ScratchFloat32Scope scratch(*this);

  Label lessThanOrEqualMinusOne;

  // If x is in ]-1,0], ceil(x) is -0, which cannot be represented as an int32.
  // Fail if x > -1 and the sign bit is set.
  loadConstantFloat32(-1.f, scratch);
  branchFloat(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
              &lessThanOrEqualMinusOne);
  vmovmskps(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  if (HasSSE41()) {
    // x <= -1 or x > -0
    bind(&lessThanOrEqualMinusOne);
    // Round toward +Infinity.
    vroundss(X86Encoding::RoundUp, src, scratch);
    truncateFloat32ToInt32(scratch, dest, fail);
    return;
  }

  // No SSE4.1
  Label end;

  // x >= 0 and x is not -0.0. We can truncate integer values, and truncate and
  // add 1 to non-integer values. This will also work for values >= INT_MAX + 1,
  // as the truncate operation will return INT_MIN and we'll fail.
  truncateFloat32ToInt32(src, dest, fail);
  convertInt32ToFloat32(dest, scratch);
  branchFloat(Assembler::DoubleEqualOrUnordered, src, scratch, &end);

  // Input is not integer-valued, add 1 to obtain the ceiling value.
  // If input > INT_MAX, output == INT_MAX so adding 1 will overflow.
  branchAdd32(Assembler::Overflow, Imm32(1), dest, fail);
  jump(&end);

  // x <= -1, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateFloat32ToInt32(src, dest, fail);

  bind(&end);
}

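// vmovmskpd copies the sign bits of both double lanes of the source into the
// low bits of the destination GPR; testing bit 0 below therefore checks the
// sign of the scalar (low) lane, which is how the ]-1, -0] inputs that would
// ceil to -0 are detected and rejected.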
void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
                                       Label* fail) {
  ScratchDoubleScope scratch(*this);

  Label lessThanOrEqualMinusOne;

  // If x is in ]-1,0], ceil(x) is -0, which cannot be represented as an int32.
  // Fail if x > -1 and the sign bit is set.
  loadConstantDouble(-1.0, scratch);
  branchDouble(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
               &lessThanOrEqualMinusOne);
  vmovmskpd(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  if (HasSSE41()) {
    // x <= -1 or x > -0
    bind(&lessThanOrEqualMinusOne);
    // Round toward +Infinity.
    vroundsd(X86Encoding::RoundUp, src, scratch);
    truncateDoubleToInt32(scratch, dest, fail);
    return;
  }

  // No SSE4.1
  Label end;

  // x >= 0 and x is not -0.0. We can truncate integer values, and truncate and
  // add 1 to non-integer values. This will also work for values >= INT_MAX + 1,
  // as the truncate operation will return INT_MIN and we'll fail.
  truncateDoubleToInt32(src, dest, fail);
  convertInt32ToDouble(dest, scratch);
  branchDouble(Assembler::DoubleEqualOrUnordered, src, scratch, &end);

  // Input is not integer-valued, add 1 to obtain the ceiling value.
  // If input > INT_MAX, output == INT_MAX so adding 1 will overflow.
  branchAdd32(Assembler::Overflow, Imm32(1), dest, fail);
  jump(&end);

  // x <= -1, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateDoubleToInt32(src, dest, fail);

  bind(&end);
}

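// Worked example for the truncation below: trunc(-0.3) would be -0, which an
// int32 cannot represent, so inputs in ]-1, -0] fail via the sign-bit test;
// trunc(-3.9) takes the lessThanOrEqualMinusOne path and vcvttsd2si's
// truncation toward zero yields -3 directly.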
void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  Label lessThanOrEqualMinusOne;

  // Bail on ]-1; -0] range
  {
    ScratchDoubleScope scratch(*this);
    loadConstantDouble(-1, scratch);
    branchDouble(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
                 &lessThanOrEqualMinusOne);
  }

  // Test for remaining values with the sign bit set, i.e. ]-1; -0]
  vmovmskpd(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  // x <= -1 or x >= +0, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateDoubleToInt32(src, dest, fail);
}

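// NaN inputs compare as unordered, so the DoubleLessThanOrEqualOrUnordered
// branch below sends them to the lessThanOrEqualMinusOne path, where the
// truncation helper rejects them (the conversion produces the failure value
// INT_MIN).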
void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
                                         Label* fail) {
  Label lessThanOrEqualMinusOne;

  // Bail on ]-1; -0] range
  {
    ScratchFloat32Scope scratch(*this);
    loadConstantFloat32(-1.f, scratch);
    branchFloat(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
                &lessThanOrEqualMinusOne);
  }

  // Test for remaining values with the sign bit set, i.e. ]-1; -0]
  vmovmskps(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  // x <= -1 or x >= +0, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateFloat32ToInt32(src, dest, fail);
}

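// Worked example for the positive path below: round(2.7f). We add the biggest
// float less than 0.5 (about 0.49999997f), giving about 3.1999999f, and
// truncation toward zero yields 3 == Math.round(2.7). Adding exactly 0.5 would
// be wrong for the input 0.49999997f itself, which must round to 0, not 1;
// that is why GetBiggestNumberLessThan(0.5f) is used instead.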
void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
                                         FloatRegister temp, Label* fail) {
  ScratchFloat32Scope scratch(*this);

  Label negativeOrZero, negative, end;

  // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
  zeroFloat32(scratch);
  loadConstantFloat32(GetBiggestNumberLessThan(0.5f), temp);
  branchFloat(Assembler::DoubleLessThanOrEqual, src, scratch, &negativeOrZero);
  {
    // Input is strictly positive or NaN. Add the biggest float less than 0.5
    // and truncate, rounding down (because if the input is the biggest float
    // less than 0.5, adding 0.5 would undesirably round up to 1). Note that we
    // have to add the input to the temp register because we're not allowed to
    // modify the input register.
    addFloat32(src, temp);
    truncateFloat32ToInt32(temp, dest, fail);
    jump(&end);
  }

  // Input is negative, +0 or -0.
  bind(&negativeOrZero);
  {
    // Branch on negative input.
    j(Assembler::NotEqual, &negative);

    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Input is +0.
    xor32(dest, dest);
    jump(&end);
  }

  // Input is negative.
  bind(&negative);
  {
    // Inputs in [-0.5, 0) are rounded to -0. Fail.
    loadConstantFloat32(-0.5f, scratch);
    branchFloat(Assembler::DoubleGreaterThanOrEqual, src, scratch, fail);

    // Other negative inputs need the biggest float less than 0.5 added.
    //
    // The result is stored in the temp register (currently contains the
    // biggest float less than 0.5).
    addFloat32(src, temp);

    if (HasSSE41()) {
      // Round toward -Infinity.
      vroundss(X86Encoding::RoundDown, temp, scratch);

      // Truncate.
      truncateFloat32ToInt32(scratch, dest, fail);
    } else {
      // Round toward -Infinity without the benefit of ROUNDSS.

      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttss2si instead of truncateFloat32ToInt32 because we
      // want to perform failure handling ourselves.
      vcvttss2si(temp, dest);

      // Test whether the truncated float was integer-valued.
      convertInt32ToFloat32(dest, scratch);
      branchFloat(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttss2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }
  }

  bind(&end);
}

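// Worked example for the negative path below: round(-2.6). Adding the biggest
// double less than 0.5 gives roughly -2.1, and rounding that toward -Infinity
// yields -3 == Math.round(-2.6); for -2.4 the sum is roughly -1.9 and the
// result is -2. Inputs in [-0.5, 0) are rejected because they would round to
// -0, which an int32 cannot represent.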
void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
                                        FloatRegister temp, Label* fail) {
  ScratchDoubleScope scratch(*this);

  Label negativeOrZero, negative, end;

  // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
  zeroDouble(scratch);
  loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
  branchDouble(Assembler::DoubleLessThanOrEqual, src, scratch, &negativeOrZero);
  {
    // Input is strictly positive or NaN. Add the biggest double less than 0.5
    // and truncate, rounding down (because if the input is the biggest double
    // less than 0.5, adding 0.5 would undesirably round up to 1). Note that we
    // have to add the input to the temp register because we're not allowed to
    // modify the input register.
    addDouble(src, temp);
    truncateDoubleToInt32(temp, dest, fail);
    jump(&end);
  }

  // Input is negative, +0 or -0.
  bind(&negativeOrZero);
  {
    // Branch on negative input.
    j(Assembler::NotEqual, &negative);

    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail, /* maybeNonZero = */ false);

    // Input is +0
    xor32(dest, dest);
    jump(&end);
  }

  // Input is negative.
  bind(&negative);
  {
    // Inputs in [-0.5, 0) are rounded to -0. Fail.
    loadConstantDouble(-0.5, scratch);
    branchDouble(Assembler::DoubleGreaterThanOrEqual, src, scratch, fail);

    // Other negative inputs need the biggest double less than 0.5 added.
    //
    // The result is stored in the temp register (currently contains the
    // biggest double less than 0.5).
    addDouble(src, temp);

    if (HasSSE41()) {
      // Round toward -Infinity.
      vroundsd(X86Encoding::RoundDown, temp, scratch);

      // Truncate.
      truncateDoubleToInt32(scratch, dest, fail);
    } else {
      // Round toward -Infinity without the benefit of ROUNDSD.

      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttsd2si instead of truncateDoubleToInt32 because we
      // want to perform failure handling ourselves.
      vcvttsd2si(temp, dest);

      // Test whether the truncated double was integer-valued.
      convertInt32ToDouble(dest, scratch);
      branchDouble(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttsd2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }
  }

  bind(&end);
}

void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
                                     FloatRegister dest) {
  MOZ_ASSERT(HasRoundInstruction(mode));
  vroundsd(Assembler::ToX86RoundingMode(mode), src, dest);
}

void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
                                      FloatRegister dest) {
  MOZ_ASSERT(HasRoundInstruction(mode));
  vroundss(Assembler::ToX86RoundingMode(mode), src, dest);
}

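// copySign is implemented with bit masks: BitwiseCast<double>(INT64_MIN) is
// 0x8000000000000000 (only the sign bit) and BitwiseCast<double>(INT64_MAX) is
// 0x7fffffffffffffff (everything but the sign bit). ANDing rhs with the former
// keeps just its sign, ANDing lhs with the latter keeps just its magnitude,
// and ORing the two produces copySign(lhs, rhs); for example
// copySign(1.5, -0.0) == -1.5.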
void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
                                    FloatRegister output) {
  ScratchDoubleScope scratch(*this);

  // TODO Support AVX2
  if (rhs == output) {
    MOZ_ASSERT(lhs != rhs);
    double keepSignMask = mozilla::BitwiseCast<double>(INT64_MIN);
    loadConstantDouble(keepSignMask, scratch);
    vandpd(scratch, rhs, output);

    double clearSignMask = mozilla::BitwiseCast<double>(INT64_MAX);
    loadConstantDouble(clearSignMask, scratch);
    vandpd(lhs, scratch, scratch);
  } else {
    double clearSignMask = mozilla::BitwiseCast<double>(INT64_MAX);
    loadConstantDouble(clearSignMask, scratch);
    vandpd(scratch, lhs, output);

    double keepSignMask = mozilla::BitwiseCast<double>(INT64_MIN);
    loadConstantDouble(keepSignMask, scratch);
    vandpd(rhs, scratch, scratch);
  }

  vorpd(scratch, output, output);
}

void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
                                     FloatRegister output) {
  ScratchFloat32Scope scratch(*this);

  // TODO Support AVX2
  if (rhs == output) {
    MOZ_ASSERT(lhs != rhs);
    float keepSignMask = mozilla::BitwiseCast<float>(INT32_MIN);
    loadConstantFloat32(keepSignMask, scratch);
    vandps(scratch, output, output);

    float clearSignMask = mozilla::BitwiseCast<float>(INT32_MAX);
    loadConstantFloat32(clearSignMask, scratch);
    vandps(lhs, scratch, scratch);
  } else {
    float clearSignMask = mozilla::BitwiseCast<float>(INT32_MAX);
    loadConstantFloat32(clearSignMask, scratch);
    vandps(scratch, lhs, output);

    float keepSignMask = mozilla::BitwiseCast<float>(INT32_MIN);
    loadConstantFloat32(keepSignMask, scratch);
    vandps(rhs, scratch, scratch);
  }

  vorps(scratch, output, output);
}

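// When the shift fits an x86 addressing-mode scale (a shift of 0-3, i.e. a
// scale of 1, 2, 4 or 8), shiftIndex32AndAdd folds the scaled index into a
// single effective-address computation; a shift of 3, for instance,
// corresponds to the TimesEight scale. Larger shifts fall back to an explicit
// shift of the index followed by an add.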
void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
                                        Register pointer) {
  if (IsShiftInScaleRange(shift)) {
    computeEffectiveAddress(
        BaseIndex(pointer, indexTemp32, ShiftToScale(shift)), pointer);
    return;
  }
  lshift32(Imm32(shift), indexTemp32);
  addPtr(indexTemp32, pointer);
}

CodeOffset MacroAssembler::wasmMarkedSlowCall(const wasm::CallSiteDesc& desc,
                                              const Register reg) {
  CodeOffset offset = call(desc, reg);
  wasmMarkCallAsSlow();
  return offset;
}

//}}} check_macroassembler_style