package compiler // This file implements the compiler for amd64/x86_64 target. // Please refer to https://www.felixcloutier.com/x86/index.html // if unfamiliar with amd64 instructions used here. import ( "fmt" "math" "github.com/tetratelabs/wazero/internal/asm" "github.com/tetratelabs/wazero/internal/asm/amd64" "github.com/tetratelabs/wazero/internal/platform" "github.com/tetratelabs/wazero/internal/u32" "github.com/tetratelabs/wazero/internal/u64" "github.com/tetratelabs/wazero/internal/wasm" "github.com/tetratelabs/wazero/internal/wazeroir" ) var ( _minimum32BitSignedInt int32 = math.MinInt32 _maximum32BitSignedInt int32 = math.MaxInt32 _maximum32BitUnsignedInt uint32 = math.MaxUint32 _minimum64BitSignedInt int64 = math.MinInt64 _maximum64BitSignedInt int64 = math.MaxInt64 _maximum64BitUnsignedInt uint64 = math.MaxUint64 _float32SignBitMask uint32 = 1 << 31 _float32RestBitMask = ^_float32SignBitMask _float64SignBitMask uint64 = 1 << 63 _float64RestBitMask = ^_float64SignBitMask _float32ForMinimumSigned32bitInteger = uint32(0xCF00_0000) _float64ForMinimumSigned32bitInteger = uint64(0xC1E0_0000_0020_0000) _float32ForMinimumSigned64bitInteger = uint32(0xDF00_0000) _float64ForMinimumSigned64bitInteger = uint64(0xC3E0_0000_0000_0000) _float32ForMaximumSigned32bitIntPlusOne = uint32(0x4F00_0000) _float64ForMaximumSigned32bitIntPlusOne = uint64(0x41E0_0000_0000_0000) _float32ForMaximumSigned64bitIntPlusOne = uint32(0x5F00_0000) _float64ForMaximumSigned64bitIntPlusOne = uint64(0x43E0_0000_0000_0000) ) var ( // amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr) amd64ReservedRegisterForCallEngine = amd64.RegR13 // amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call. amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14 // amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr). amd64ReservedRegisterForMemory = amd64.RegR15 ) var ( amd64UnreservedVectorRegisters = []asm.Register{ //nolint amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3, amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7, amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11, amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15, } // Note that we never invoke "call" instruction, // so we don't need to care about the calling convention. // TODO: Maybe it is safe just save rbp, rsp somewhere // in Go-allocated variables, and reuse these registers // in compiled functions and write them back before returns. amd64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX, amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9, amd64.RegR10, amd64.RegR11, amd64.RegR12, } ) // amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds *wasm.ModuleInstance of the // next executing function instance. The value is set and used when making function calls // or function returns in the ModuleContextInitialization. See compileModuleContextInitialization. var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12 func (c *amd64Compiler) String() string { return c.locationStack.String() } // compileNOP implements compiler.compileNOP for the amd64 architecture. 
func (c *amd64Compiler) compileNOP() asm.Node { return c.assembler.CompileStandAlone(amd64.NOP) } type amd64Compiler struct { assembler amd64.Assembler ir *wazeroir.CompilationResult cpuFeatures platform.CpuFeatureFlags // locationStack holds the state of wazeroir virtual stack. // and each item is either placed in register or the actual memory stack. locationStack *runtimeValueLocationStack // labels hold per wazeroir label specific information in this function. labels [wazeroir.LabelKindNum][]amd64LabelInfo // stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation. stackPointerCeil uint64 // assignStackPointerCeilNeeded holds an asm.Node whose AssignDestinationConstant must be called with the determined stack pointer ceiling. assignStackPointerCeilNeeded asm.Node compiledTrapTargets [nativeCallStatusModuleClosed]asm.Node withListener bool typ *wasm.FunctionType // locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack, // we cache it here, and reset and set to .locationStack in the Init method. locationStackForEntrypoint runtimeValueLocationStack // frameIDMax tracks the maximum value of frame id per function. frameIDMax int brTableTmp []runtimeValueLocation fourZeros, eightZeros, minimum32BitSignedInt, maximum32BitSignedInt, maximum32BitUnsignedInt, minimum64BitSignedInt, maximum64BitSignedInt, maximum64BitUnsignedInt, float32SignBitMask, float32RestBitMask, float64SignBitMask, float64RestBitMask, float32ForMinimumSigned32bitInteger, float64ForMinimumSigned32bitInteger, float32ForMinimumSigned64bitInteger, float64ForMinimumSigned64bitInteger, float32ForMaximumSigned32bitIntPlusOne, float64ForMaximumSigned32bitIntPlusOne, float32ForMaximumSigned64bitIntPlusOne, float64ForMaximumSigned64bitIntPlusOne *asm.StaticConst } func newAmd64Compiler() compiler { c := &amd64Compiler{ assembler: amd64.NewAssembler(), locationStackForEntrypoint: newRuntimeValueLocationStack(), cpuFeatures: platform.CpuFeatures, } c.fourZeros = asm.NewStaticConst([]byte{0, 0, 0, 0}) c.eightZeros = asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}) c.minimum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_minimum32BitSignedInt))) c.maximum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_maximum32BitSignedInt))) c.maximum32BitUnsignedInt = asm.NewStaticConst(u32.LeBytes(_maximum32BitUnsignedInt)) c.minimum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_minimum64BitSignedInt))) c.maximum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_maximum64BitSignedInt))) c.maximum64BitUnsignedInt = asm.NewStaticConst(u64.LeBytes(_maximum64BitUnsignedInt)) c.float32SignBitMask = asm.NewStaticConst(u32.LeBytes(_float32SignBitMask)) c.float32RestBitMask = asm.NewStaticConst(u32.LeBytes(_float32RestBitMask)) c.float64SignBitMask = asm.NewStaticConst(u64.LeBytes(_float64SignBitMask)) c.float64RestBitMask = asm.NewStaticConst(u64.LeBytes(_float64RestBitMask)) c.float32ForMinimumSigned32bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned32bitInteger)) c.float64ForMinimumSigned32bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned32bitInteger)) c.float32ForMinimumSigned64bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned64bitInteger)) c.float64ForMinimumSigned64bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned64bitInteger)) c.float32ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned32bitIntPlusOne)) 
	c.float64ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned32bitIntPlusOne))
	c.float32ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned64bitIntPlusOne))
	c.float64ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned64bitIntPlusOne))
	return c
}

// Init implements compiler.Init.
func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
	c.assembler.Reset()
	c.locationStackForEntrypoint.reset()
	c.resetLabels()
	*c = amd64Compiler{
		ir:                                     ir,
		withListener:                           withListener,
		typ:                                    typ,
		assembler:                              c.assembler,
		cpuFeatures:                            c.cpuFeatures,
		labels:                                 c.labels,
		locationStackForEntrypoint:             c.locationStackForEntrypoint,
		brTableTmp:                             c.brTableTmp,
		fourZeros:                              c.fourZeros,
		eightZeros:                             c.eightZeros,
		minimum32BitSignedInt:                  c.minimum32BitSignedInt,
		maximum32BitSignedInt:                  c.maximum32BitSignedInt,
		maximum32BitUnsignedInt:                c.maximum32BitUnsignedInt,
		minimum64BitSignedInt:                  c.minimum64BitSignedInt,
		maximum64BitSignedInt:                  c.maximum64BitSignedInt,
		maximum64BitUnsignedInt:                c.maximum64BitUnsignedInt,
		float32SignBitMask:                     c.float32SignBitMask,
		float32RestBitMask:                     c.float32RestBitMask,
		float64SignBitMask:                     c.float64SignBitMask,
		float64RestBitMask:                     c.float64RestBitMask,
		float32ForMinimumSigned32bitInteger:    c.float32ForMinimumSigned32bitInteger,
		float64ForMinimumSigned32bitInteger:    c.float64ForMinimumSigned32bitInteger,
		float32ForMinimumSigned64bitInteger:    c.float32ForMinimumSigned64bitInteger,
		float64ForMinimumSigned64bitInteger:    c.float64ForMinimumSigned64bitInteger,
		float32ForMaximumSigned32bitIntPlusOne: c.float32ForMaximumSigned32bitIntPlusOne,
		float64ForMaximumSigned32bitIntPlusOne: c.float64ForMaximumSigned32bitIntPlusOne,
		float32ForMaximumSigned64bitIntPlusOne: c.float32ForMaximumSigned64bitIntPlusOne,
		float64ForMaximumSigned64bitIntPlusOne: c.float64ForMaximumSigned64bitIntPlusOne,
	}

	// Reuses the initial location stack for the compilation of subsequent functions.
	c.locationStack = &c.locationStackForEntrypoint
}

// resetLabels resets the existing content in amd64Compiler.labels so that
// we can reuse the allocated slices and stacks in the subsequent compilations.
func (c *amd64Compiler) resetLabels() {
	for i := range c.labels {
		for j := range c.labels[i] {
			if j > c.frameIDMax {
				// Only need to reset up to the maximum frame id. This makes the compilation faster for large binaries.
				break
			}
			l := &c.labels[i][j]
			l.initialInstruction = nil
			l.stackInitialized = false
			l.initialStack.reset()
		}
	}
}

// runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
	return c.locationStack
}

// setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
// while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
// This is called when we branch into a different block.
func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
		c.stackPointerCeil = c.locationStack.stackPointerCeil
	}
	c.locationStack = newStack
}

// pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
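// In addition to pushing the location, it marks reg as used so that subsequent register
// allocations do not hand out the same register before the pushed value is consumed.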
func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
	c.locationStack.markRegisterUsed(reg)
	return
}

// pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
	c.locationStack.markRegisterUsed(reg)
	return
}

type amd64LabelInfo struct {
	// initialInstruction is the initial instruction for this label so other blocks can jump into it.
	initialInstruction asm.Node
	// initialStack is the initial value location stack from which we start compiling this label.
	initialStack runtimeValueLocationStack

	stackInitialized bool
}

func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
	kind := label.Kind()
	frames := c.labels[kind]
	frameID := label.FrameID()
	if c.frameIDMax < frameID {
		c.frameIDMax = frameID
	}
	// If the frameID is not allocated yet, expand the slice to cover it
	// so that we could reduce the allocation in the subsequent compilation.
	if diff := frameID - len(frames) + 1; diff > 0 {
		for i := 0; i < diff; i++ {
			frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
		}
		c.labels[kind] = frames
	}
	return &frames[frameID]
}

// compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the amd64 architecture.
func (c *amd64Compiler) compileBuiltinFunctionCheckExitCode() error {
	if err := c.compileCallBuiltinFunction(builtinFunctionIndexCheckExitCode); err != nil {
		return err
	}
	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
	c.compileReservedStackBasePointerInitialization()
	c.compileReservedMemoryPointerInitialization()
	return nil
}

// compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
// and return to the caller.
func (c *amd64Compiler) compileGoDefinedHostFunction() error {
	// First we must update the location stack to reflect the number of host function inputs.
	c.locationStack.init(c.typ)

	if c.withListener {
		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
			return err
		}
	}

	// The host function needs access to the caller's module instance, and the caller's information is stored
	// in the stack (as described in the doc of callEngine.stack). Here, we get the caller's *function from the stack,
	// and save its module instance in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
	// without sacrificing the performance.
	c.compileReservedStackBasePointerInitialization()
	// Alias for readability.
	tmp := amd64.RegAX
	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
	// Load the value into the tmp register: tmp = &function{..}
	callerFunction.setRegister(tmp)
	c.compileLoadValueOnStackToRegister(callerFunction)
	// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, functionModuleInstanceOffset, tmp)
	// Load it onto callEngine.exitContext.callerModuleInstance.
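	// A rough sketch of the machine code emitted above and below (illustrative only; AX is the tmp
	// register chosen above, R13 is amd64ReservedRegisterForCallEngine, and the exact stack slot and
	// field offsets are resolved by the assembler):
	//   movq <callerFunction stack slot>, AX       ; AX = caller's *function
	//   movq functionModuleInstanceOffset(AX), AX  ; AX = caller's *wasm.ModuleInstance
	//   movq AX, callEngineExitContextCallerModuleInstanceOffset(R13)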
c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmp, amd64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset) // Reset the state of callerFunction value location so that we won't mess up subsequent code generation below. c.locationStack.releaseRegister(callerFunction) if err := c.compileCallGoHostFunction(); err != nil { return err } // Initializes the reserved stack base pointer which is used to retrieve the call frame stack. c.compileReservedStackBasePointerInitialization() // Go function can change the module state in arbitrary way, so we have to force // the callEngine.moduleContext initialization on the function return. To do so, // we zero-out callEngine.moduleInstance. c.assembler.CompileConstToMemory(amd64.MOVQ, 0, amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset) return c.compileReturnFunction() } // compile implements compiler.compile for the amd64 architecture. func (c *amd64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) { // c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s) // used for all labels (via setLocationStack), excluding the current one. // Hence, we check here if the final block's max one exceeds the current c.stackPointerCeil. stackPointerCeil = c.stackPointerCeil if stackPointerCeil < c.locationStack.stackPointerCeil { stackPointerCeil = c.locationStack.stackPointerCeil } // Now that the max stack pointer is determined, we are invoking the callback. // Note this MUST be called before Assemble() below. c.assignStackPointerCeil(stackPointerCeil) err = c.assembler.Assemble(buf) return } // compileUnreachable implements compiler.compileUnreachable for the amd64 architecture. func (c *amd64Compiler) compileUnreachable() error { c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable) return nil } // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the amd64 architecture. func (c *amd64Compiler) assignStackPointerCeil(ceil uint64) { if c.assignStackPointerCeilNeeded != nil { c.assignStackPointerCeilNeeded.AssignDestinationConstant(int64(ceil) << 3) } } // compileSet implements compiler.compileSet for the amd64 architecture. func (c *amd64Compiler) compileSet(o *wazeroir.UnionOperation) error { depth := int(o.U1) isTargetVector := o.B3 setTargetIndex := int(c.locationStack.sp) - 1 - depth if isTargetVector { _ = c.locationStack.pop() // ignore the higher 64-bits. } v := c.locationStack.pop() if err := c.compileEnsureOnRegister(v); err != nil { return err } targetLocation := &c.locationStack.stack[setTargetIndex] if targetLocation.onRegister() { // We no longer need the register previously used by the target location. c.locationStack.markRegisterUnused(targetLocation.register) } reg := v.register targetLocation.setRegister(reg) targetLocation.valueType = v.valueType if isTargetVector { hi := &c.locationStack.stack[setTargetIndex+1] hi.setRegister(reg) } return nil } // compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture. func (c *amd64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } intReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First, move the pointer to the global slice into the allocated register. 
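	// That is, intReg now holds callEngine.moduleContext.globalElement0Address, i.e. the address of
	// the first element of the module's globals slice.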
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg) index := o.U1 // Now, move the location of the global instance into the register. c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg) // When an integer, reuse the pointer register for the value. Otherwise, allocate a float register for it. valueReg := intReg var vt runtimeValueType var inst asm.Instruction switch c.ir.Globals[index].ValType { case wasm.ValueTypeI32: inst = amd64.MOVL vt = runtimeValueTypeI32 case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref: inst = amd64.MOVQ vt = runtimeValueTypeI64 case wasm.ValueTypeF32: inst = amd64.MOVL vt = runtimeValueTypeF32 valueReg, err = c.allocateRegister(registerTypeVector) if err != nil { return err } case wasm.ValueTypeF64: inst = amd64.MOVQ vt = runtimeValueTypeF64 valueReg, err = c.allocateRegister(registerTypeVector) if err != nil { return err } case wasm.ValueTypeV128: inst = amd64.MOVDQU vt = runtimeValueTypeV128Lo valueReg, err = c.allocateRegister(registerTypeVector) if err != nil { return err } default: panic("BUG: unknown runtime value type") } // Using the register holding the pointer to the target instance, move its value into a register. c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg) // Record that the retrieved global value on the top of the stack is now in a register. if vt == runtimeValueTypeV128Lo { c.pushVectorRuntimeValueLocationOnRegister(valueReg) } else { c.pushRuntimeValueLocationOnRegister(valueReg, vt) } return nil } // compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture. func (c *amd64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error { index := o.U1 wasmValueType := c.ir.Globals[index].ValType isV128 := wasmValueType == wasm.ValueTypeV128 // First, move the value to set into a temporary register. val := c.locationStack.pop() if isV128 { // The previous val is higher 64-bits, and have to use lower 64-bit's runtimeValueLocation for allocation, etc. val = c.locationStack.pop() } if err := c.compileEnsureOnRegister(val); err != nil { return err } // Allocate a register to hold the memory location of the target global instance. intReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First, move the pointer to the global slice into the allocated register. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg) // Now, move the location of the global instance into the register. c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg) // Now ready to write the value to the global instance location. var inst asm.Instruction if isV128 { inst = amd64.MOVDQU } else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 { inst = amd64.MOVL } else { inst = amd64.MOVQ } c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset) // Since the value is now written to memory, release the value register. c.locationStack.releaseRegister(val) return nil } // compileBr implements compiler.compileBr for the amd64 architecture. 
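// o.U1 holds the destination label; branchInto below either emits a function return (for the return
// target) or an unconditional JMP to that label.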
func (c *amd64Compiler) compileBr(o *wazeroir.UnionOperation) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}
	return c.branchInto(wazeroir.Label(o.U1))
}

// branchInto adds the instructions necessary to jump into the given branch target.
func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
	if target.IsReturnTarget() {
		return c.compileReturnFunction()
	} else {
		if c.ir.LabelCallers[target] > 1 {
			// We can only reuse the register state when there's a single call-site.
			// If there are multiple call-sites, release the existing values on registers to the stack
			// so that the value location state is consistent at the beginning of the label.
			if err := c.compileReleaseAllRegistersToStack(); err != nil {
				return err
			}
		}
		// Set the initial stack of the target label, so we can start compiling the label
		// with the appropriate value locations. Note we clone the stack here as we may
		// manipulate the stack before the compiler reaches the label.
		targetLabel := c.label(target)
		if !targetLabel.stackInitialized {
			targetLabel.initialStack.cloneFrom(*c.locationStack)
			targetLabel.stackInitialized = true
		}
		jmp := c.assembler.CompileJump(amd64.JMP)
		c.assignJumpTarget(target, jmp)
	}
	return nil
}

// compileBrIf implements compiler.compileBrIf for the amd64 architecture.
func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
	cond := c.locationStack.pop()
	var jmpWithCond asm.Node
	if cond.onConditionalRegister() {
		var inst asm.Instruction
		switch cond.conditionalRegister {
		case amd64.ConditionalRegisterStateE:
			inst = amd64.JEQ
		case amd64.ConditionalRegisterStateNE:
			inst = amd64.JNE
		case amd64.ConditionalRegisterStateS:
			inst = amd64.JMI
		case amd64.ConditionalRegisterStateNS:
			inst = amd64.JPL
		case amd64.ConditionalRegisterStateG:
			inst = amd64.JGT
		case amd64.ConditionalRegisterStateGE:
			inst = amd64.JGE
		case amd64.ConditionalRegisterStateL:
			inst = amd64.JLT
		case amd64.ConditionalRegisterStateLE:
			inst = amd64.JLE
		case amd64.ConditionalRegisterStateA:
			inst = amd64.JHI
		case amd64.ConditionalRegisterStateAE:
			inst = amd64.JCC
		case amd64.ConditionalRegisterStateB:
			inst = amd64.JCS
		case amd64.ConditionalRegisterStateBE:
			inst = amd64.JLS
		}
		jmpWithCond = c.assembler.CompileJump(inst)
	} else {
		// Usually the comparison operand for br_if is on the conditional register,
		// but in some cases, it is on the stack or a general purpose register.
		// For example, the following code
		// 	i64.const 1
		// 	local.get 1
		// 	i64.add
		// 	br_if ....
		// will try to use the result of i64.add, which resides on the (virtual) stack,
		// as the operand for the br_if instruction.
		if err := c.compileEnsureOnRegister(cond); err != nil {
			return err
		}
		// Check if the value is non-zero.
		c.assembler.CompileRegisterToRegister(amd64.TESTQ, cond.register, cond.register)
		// Emit a jump instruction which jumps when the value is not zero.
		jmpWithCond = c.assembler.CompileJump(amd64.JNE)
		c.locationStack.markRegisterUnused(cond.register)
	}

	// Make sure that the next coming label is the else jump target.
	thenTarget := wazeroir.Label(o.U1)
	elseTarget := wazeroir.Label(o.U2)
	thenToDrop := o.U3

	// Here's the diagram of how we organize the instructions necessary for the br_if operation.
	//
	// jmp_with_cond -> jmp (.Else) -> Then operations...
	//    |---------(satisfied)------------^^^
	//
	// Note that the .Else branch doesn't have ToDrop as .Else in reality
	// corresponds to either an If's else block or a br_if's else block in Wasm.

	// Emit the else branch.
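	// Note the ordering: the else branch is emitted first so that, when the condition above is not
	// satisfied, execution falls through to this jump (or return), while jmpWithCond skips over it
	// to the then-case code emitted afterwards.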
	if elseTarget.IsReturnTarget() {
		if err := c.compileReturnFunction(); err != nil {
			return err
		}
	} else {
		labelInfo := c.label(elseTarget)
		if !labelInfo.stackInitialized {
			labelInfo.initialStack.cloneFrom(*c.locationStack)
			labelInfo.stackInitialized = true
		}
		elseJmp := c.assembler.CompileJump(amd64.JMP)
		c.assignJumpTarget(elseTarget, elseJmp)
	}

	// Handle the then branch.
	c.assembler.SetJumpTargetOnNext(jmpWithCond)
	if err := compileDropRange(c, thenToDrop); err != nil {
		return err
	}
	if thenTarget.IsReturnTarget() {
		return c.compileReturnFunction()
	} else {
		thenLabel := thenTarget
		if c.ir.LabelCallers[thenLabel] > 1 {
			// We can only reuse the register state when there's a single call-site.
			// If there are multiple call-sites, release the existing values on registers to the stack
			// so that the value location state is consistent at the beginning of the label.
			if err := c.compileReleaseAllRegistersToStack(); err != nil {
				return err
			}
		}
		// Set the initial stack of the target label, so we can start compiling the label
		// with the appropriate value locations. Note we clone the stack here as we may
		// manipulate the stack before the compiler reaches the label.
		labelInfo := c.label(thenLabel)
		if !labelInfo.stackInitialized {
			labelInfo.initialStack.cloneFrom(*c.locationStack)
			labelInfo.stackInitialized = true
		}
		thenJmp := c.assembler.CompileJump(amd64.JMP)
		c.assignJumpTarget(thenLabel, thenJmp)
		return nil
	}
}

// compileBrTable implements compiler.compileBrTable for the amd64 architecture.
func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
	index := c.locationStack.pop()

	// If the operation only consists of the default target, we branch into it and return early.
	if len(o.Us) == 2 {
		c.locationStack.releaseRegister(index)
		if err := compileDropRange(c, o.Us[1]); err != nil {
			return err
		}
		return c.branchInto(wazeroir.Label(o.Us[0]))
	}

	// Otherwise, we jump into the selected branch.
	if err := c.compileEnsureOnRegister(index); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// First, we move the length of the target list into the tmp register.
	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Us)/2-1), tmp)

	// Then, we compare the value with the length of targets.
	c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)

	// If the value is larger than the length,
	// we round the index to the length as the spec states that
	// if the index is larger than or equal to the length of the list,
	// branch into the default branch.
	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)

	// We prepare the static data which holds the offset of
	// each target's first instruction (incl. default)
	// relative to the beginning of label tables.
	//
	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
	// we emit the code like this at [Emit the code for each targets and default branch] below.
	//
	// L0:
	//  0x123001: XXXX, ...
	//  .....
	// L1:
	//  0x123005: YYY, ...
	//  .....
	// L_DEFAULT:
	//  0x123009: ZZZ, ...
	//
	// then offsetData becomes like [0x0, 0x5, 0x8].
	// By using this offset list, we could jump into the label for the index by
	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by the "LEA"
	// instruction.
	//
	// Note: We store each offset as a 32-bit unsigned integer in 4 consecutive bytes. So more precisely,
	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
	//
	// Note: this is similar to how GCC implements Switch statements in C.
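	// For example, with two explicit targets plus the default target, o.Us holds three (label, toDrop)
	// pairs (len(o.Us) == 6), so offsetData below is 3*4 = 12 bytes: one 32-bit offset per branch
	// including the default.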
	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))

	// Load the offsetData's address into tmp.
	if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
		return err
	}

	// Now we have the address of the first byte of offsetData in the tmp register.
	// So the target offset's first byte is at tmp+index*4 as we store
	// each offset as 4 bytes (a 32-bit integer).
	// Here, we store the offset into the index.register.
	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)

	// Now we read the address of the beginning of the jump table.
	// In the above example, this corresponds to reading the address of 0x123001.
	c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)

	// Now we have the address of L0 in the tmp register, and the offset to the target label in the index.register.
	// So we could achieve the br_table jump by adding them and jumping into the resulting address.
	c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)

	c.assembler.CompileJumpToRegister(amd64.JMP, tmp)

	// We no longer need the index's register, so mark it unused.
	c.locationStack.markRegisterUnused(index.register)

	// [Emit the code for each targets and default branch]
	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)

	// Since we might end up having different stack states in each branch,
	// we need to save the initial stack state here, and use the same initial state
	// for each iteration.
	initialLocationStack := c.getSavedTemporaryLocationStack()

	for i := range labelInitialInstructions {
		// Emit the initial instruction of each target.
		// We use NOP as we don't yet know the next instruction in each label.
		// The assembler would optimize out this NOP during code generation, so this is harmless.
		labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)

		targetLabel := wazeroir.Label(o.Us[i*2])
		targetToDrop := o.Us[i*2+1]
		if err = compileDropRange(c, targetToDrop); err != nil {
			return err
		}
		if err = c.branchInto(targetLabel); err != nil {
			return err
		}
		// After the iteration, reset the stack's state with initialLocationStack.
		c.locationStack.cloneFrom(initialLocationStack)
	}

	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
	return nil
}

func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
	initialLocationStack := *c.locationStack // Take copy!
	// Use c.brTableTmp for the underlying stack so that we could reduce the allocations.
	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
	}
	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
	initialLocationStack.stack = c.brTableTmp
	return initialLocationStack
}

func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
	jmpTargetLabel := c.label(label)
	targetInst := jmpTargetLabel.initialInstruction
	if targetInst == nil {
		// If the label isn't compiled yet, allocate the NOP node, and set it as the initial instruction.
		targetInst = c.assembler.AllocateNOP()
		jmpTargetLabel.initialInstruction = targetInst
	}
	jmpInstruction.AssignJumpTarget(targetInst)
}

// compileLabel implements compiler.compileLabel for the amd64 architecture.
func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool) {
	label := wazeroir.Label(o.U1)
	labelInfo := c.label(label)

	// If initialStack is not set, that means this label has never been reached.
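	// In that case no br/br_if/br_table ever branches into this label (branchInto and compileBrIf only
	// initialize the stack for labels they actually target), so there is nothing to compile and the
	// label can be skipped entirely.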
	if !labelInfo.stackInitialized {
		skipLabel = true
		return
	}

	// We use NOP as a beginning of instructions in a label.
	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
		// This NOP should eventually be optimized out by the assembler.
		labelInfo.initialInstruction = c.assembler.CompileStandAlone(amd64.NOP)
	} else {
		c.assembler.Add(labelBegin)
	}

	// Set the initial stack.
	c.setLocationStack(&labelInfo.initialStack)
	return
}

// compileCall implements compiler.compileCall for the amd64 architecture.
func (c *amd64Compiler) compileCall(o *wazeroir.UnionOperation) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	functionIndex := o.U1

	target := c.ir.Functions[functionIndex]
	targetType := &c.ir.Types[target]

	targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// First, place the byte offset of the target function within callEngine.functions
	// (= functionIndex * functionSize) into the target register.
	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(functionIndex)*functionSize, targetAddressRegister)

	// Next, we add the address of the first item of callEngine.functions slice (= &callEngine.functions[0])
	// to the target register.
	c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)

	if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
		return err
	}
	return nil
}

// compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
func (c *amd64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) error {
	offset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(offset); err != nil {
		return err
	}

	typeIndex := o.U1
	tableIndex := o.U2

	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(tmp)

	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(tmp2)

	// Load the address of the target table: tmp = &module.Tables[0]
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
	// tmp = &module.Tables[0] + tableIndex*8 = &module.Tables[0] + sizeOf(*TableInstance)*tableIndex = module.Tables[tableIndex].
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)

	// Then, we need to trap if the offset exceeds the length of the table.
	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)

	// Next, we check if the target's type matches the operation's one.
	// In order to get the type instance's address, we have to multiply the offset
	// by 8 as the offset is the index into the table's Go "[]uintptr{}",
	// and the size of uintptr equals 8 bytes == (2^3).
	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)

	// Adds the address of wasm.Table[0] stored as callEngine.tableElement0Address to the offset.
	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, offset.register)

	// "offset = (*offset) (== table[offset] == *code type)"
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)

	// At this point offset.register holds the address of *code (as uintptr) at wasm.Table[offset].
	//
	// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
	c.assembler.CompileRegisterToRegister(amd64.TESTQ, offset.register, offset.register)

	// Skipped if the target is initialized.
	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeInvalidTableAccess)

	// Next, we need to check that the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
	//
	// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset, tmp2)
	c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(typeIndex)*4, tmp2)

	// Skipped if the type matches.
	c.assembler.CompileMemoryToRegister(amd64.CMPL, offset.register, functionTypeIDOffset, tmp2)
	c.compileMaybeExitFromNativeCode(amd64.JEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
	targetFunctionType := &c.ir.Types[typeIndex]
	if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
		return err
	}

	// The offset register should be marked as unused as we consumed it in the function call.
	c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
	return nil
}

// compileDrop implements compiler.compileDrop for the amd64 architecture.
func (c *amd64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
	return compileDropRange(c, o.U1)
}

// compileSelectV128Impl implements compileSelect for vector values.
func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
	x2 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.popV128()
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// Compare the conditional value with zero.
	c.assembler.CompileRegisterToRegister(amd64.TESTQ, selectorReg, selectorReg)

	// Set the jump if the top value is not zero.
	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)

	// In this branch, we select the value of x2, so we move the value into x1.register so that
	// we can have the result in x1.register regardless of the selection.
	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)

	// Else, we don't need to adjust the value, just need to jump to the next instruction.
	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)

	// As noted, the result exists in x1.register regardless of the selector.
	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
	// Plus, x2.register is no longer used.
	c.locationStack.markRegisterUnused(x2.register)
	c.locationStack.markRegisterUnused(selectorReg)
	return nil
}

// compileSelect implements compiler.compileSelect for the amd64 architecture.
//
// The emitted native code depends on whether the values are on
// physical registers, the memory stack, or possibly a conditional register.
func (c *amd64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
	cv := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(cv); err != nil {
		return err
	}

	isTargetVector := o.B3
	if isTargetVector {
		return c.compileSelectV128Impl(cv.register)
	}

	x2 := c.locationStack.pop()
	// We do not consume x1 here, but modify the value according to
	// the conditional value "cv" above.
	peekedX1 := c.locationStack.peek()

	// Compare the conditional value with zero.
	c.assembler.CompileRegisterToRegister(amd64.TESTQ, cv.register, cv.register)

	// Now we can use cv.register as a temporary location.
	// We alias it here for readability.
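	// cv has already been consumed by the TESTQ above, so its register is free to serve as scratch
	// space for moving x2 around below.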
tmpRegister := cv.register // Set the jump if the top value is not zero. jmpIfNotZero := c.assembler.CompileJump(amd64.JNE) // If the value is zero, we must place the value of x2 onto the stack position of x1. // First we copy the value of x2 to the temporary register if x2 is not currently on a register. if x2.onStack() { x2.register = tmpRegister c.compileLoadValueOnStackToRegister(x2) } // // At this point x2's value is always on a register. // // Then release the value in the x2's register to the x1's stack position. if peekedX1.onRegister() { c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register) } else { peekedX1.register = x2.register c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused! } // Else, we don't need to adjust value, just need to jump to the next instruction. c.assembler.SetJumpTargetOnNext(jmpIfNotZero) // In any case, we don't need x2 and c anymore! c.locationStack.releaseRegister(x2) c.locationStack.releaseRegister(cv) return nil } // compilePick implements compiler.compilePick for the amd64 architecture. func (c *amd64Compiler) compilePick(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } depth := o.U1 isTargetVector := o.B3 // TODO: if we track the type of values on the stack, // we could optimize the instruction according to the bit size of the value. // For now, we just move the entire register i.e. as a quad word (8 bytes). pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)] reg, err := c.allocateRegister(pickTarget.getRegisterType()) if err != nil { return err } if pickTarget.onRegister() { var inst asm.Instruction if isTargetVector { inst = amd64.MOVDQU } else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers. inst = amd64.MOVL } else { inst = amd64.MOVQ } c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg) } else if pickTarget.onStack() { // Copy the value from the stack. var inst asm.Instruction if isTargetVector { inst = amd64.MOVDQU } else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 { inst = amd64.MOVL } else { inst = amd64.MOVQ } // Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range. c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress, int64(pickTarget.stackPointer)*8, reg) } // Now we already placed the picked value on the register, // so push the location onto the stack. if isTargetVector { c.pushVectorRuntimeValueLocationOnRegister(reg) } else { c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType) } return nil } // compileAdd implements compiler.compileAdd for the amd64 architecture. func (c *amd64Compiler) compileAdd(o *wazeroir.UnionOperation) error { // TODO: if the previous instruction is const, then // this can be optimized. Same goes for other arithmetic instructions. var instruction asm.Instruction unsignedType := wazeroir.UnsignedType(o.B1) switch unsignedType { case wazeroir.UnsignedTypeI32: instruction = amd64.ADDL case wazeroir.UnsignedTypeI64: instruction = amd64.ADDQ case wazeroir.UnsignedTypeF32: instruction = amd64.ADDSS case wazeroir.UnsignedTypeF64: instruction = amd64.ADDSD } x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.peek() // Note this is peek! 
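	// x1 is peeked rather than popped because its entry on the location stack also receives the
	// result: after the ADD below, x1.register holds x1+x2 and the entry simply stays where it is.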
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// x1 += x2.
	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)

	// We no longer need the x2 register after the ADD operation here,
	// so we release it.
	c.locationStack.releaseRegister(x2)
	return nil
}

// compileSub implements compiler.compileSub for the amd64 architecture.
func (c *amd64Compiler) compileSub(o *wazeroir.UnionOperation) error {
	// TODO: if the previous instruction is const, then
	// this can be optimized. Same goes for other arithmetic instructions.

	var instruction asm.Instruction
	unsignedType := wazeroir.UnsignedType(o.B1)
	switch unsignedType {
	case wazeroir.UnsignedTypeI32:
		instruction = amd64.SUBL
	case wazeroir.UnsignedTypeI64:
		instruction = amd64.SUBQ
	case wazeroir.UnsignedTypeF32:
		instruction = amd64.SUBSS
	case wazeroir.UnsignedTypeF64:
		instruction = amd64.SUBSD
	}

	x2 := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(x2); err != nil {
		return err
	}

	x1 := c.locationStack.peek() // Note this is peek!
	if err := c.compileEnsureOnRegister(x1); err != nil {
		return err
	}

	// x1 -= x2.
	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)

	// We no longer need the x2 register after the SUB operation here,
	// so we release it.
	c.locationStack.releaseRegister(x2)
	return nil
}

// compileMul implements compiler.compileMul for the amd64 architecture.
func (c *amd64Compiler) compileMul(o *wazeroir.UnionOperation) (err error) {
	unsignedType := wazeroir.UnsignedType(o.B1)
	switch unsignedType {
	case wazeroir.UnsignedTypeI32:
		err = c.compileMulForInts(true, amd64.MULL)
	case wazeroir.UnsignedTypeI64:
		err = c.compileMulForInts(false, amd64.MULQ)
	case wazeroir.UnsignedTypeF32:
		err = c.compileMulForFloats(amd64.MULSS)
	case wazeroir.UnsignedTypeF64:
		err = c.compileMulForFloats(amd64.MULSD)
	}
	return
}

// compileMulForInts emits instructions to perform integer multiplication for
// the top two values on the stack. If unfamiliar with the convention for integer
// multiplication on x86, see https://www.felixcloutier.com/x86/mul.
//
// In summary, one of the values must be on the AX register,
// and the mul instruction stores the overflow info in the DX register which we don't use.
// By "the overflow info" we mean the 65th and higher bits of the result in the 64-bit case.
//
// So, we have to ensure that
//  1. The value previously located on DX is saved to the memory stack, because
//     the existing value will be overwritten by the mul execution.
//  2. One of the operands (x1 or x2) is on the AX register.
//
// See https://www.felixcloutier.com/x86/mul#description for the detailed semantics.
func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
	const (
		resultRegister   = amd64.RegAX
		reservedRegister = amd64.RegDX
	)

	x2 := c.locationStack.pop()
	x1 := c.locationStack.pop()

	var valueOnAX *runtimeValueLocation
	if x1.register == resultRegister {
		valueOnAX = x1
	} else if x2.register == resultRegister {
		valueOnAX = x2
	} else {
		valueOnAX = x2
		// In this case, we move x2 to the AX register.
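		// Before clobbering AX, onValueReleaseRegisterToStack below spills whatever unrelated value
		// currently lives in AX to the memory stack.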
c.onValueReleaseRegisterToStack(resultRegister) if x2.onConditionalRegister() { c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister) } else if x2.onStack() { x2.setRegister(resultRegister) c.compileLoadValueOnStackToRegister(x2) c.locationStack.markRegisterUsed(resultRegister) } else { var inst asm.Instruction if is32Bit { inst = amd64.MOVL } else { inst = amd64.MOVQ } c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister) // We no longer uses the prev register of x2. c.locationStack.releaseRegister(x2) x2.setRegister(resultRegister) c.locationStack.markRegisterUsed(resultRegister) } } // We have to make sure that at this point the operands must be on registers. if err := c.compileEnsureOnRegister(x2); err != nil { return err } if err := c.compileEnsureOnRegister(x1); err != nil { return err } // We have to save the existing value on DX. // If the DX register is used by either x1 or x2, we don't need to // save the value because it is consumed by mul anyway. if x1.register != reservedRegister && x2.register != reservedRegister { c.onValueReleaseRegisterToStack(reservedRegister) } // Now ready to emit the mul instruction. if x1 == valueOnAX { c.assembler.CompileRegisterToNone(mulInstruction, x2.register) } else { c.assembler.CompileRegisterToNone(mulInstruction, x1.register) } c.locationStack.markRegisterUnused(x2.register) c.locationStack.markRegisterUnused(x1.register) // Now we have the result in the AX register, // so we record it. c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType) return nil } func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(x1); err != nil { return err } // x1 *= x2. c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register) // We no longer need x2 register after MUL operation here, // so we release it. c.locationStack.releaseRegister(x2) return nil } // compileClz implements compiler.compileClz for the amd64 architecture. func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error { target := c.locationStack.pop() if err := c.compileEnsureOnRegister(target); err != nil { return err } unsignedInt := wazeroir.UnsignedInt(o.B1) if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) { if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register) } } else { // On processors that do not support LZCNT, we combine BSR (calculating // most significant set bit) with XOR. This logic is described in // "Replace Raw Assembly Code with Builtin Intrinsics" section in: // https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code. // First, we have to check if the target is non-zero as BSR is undefined // on zero. See https://www.felixcloutier.com/x86/bsr. c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register) jmpIfNonZero := c.assembler.CompileJump(amd64.JNE) // If the value is zero, we just push the const value. 
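	// Wasm defines clz(0) as the operand's bit width, so we materialize 32 or 64 directly.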
if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register) } else { c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register) } // Emit the jmp instruction to jump to the position right after // the non-zero case. jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP) // Start emitting non-zero case. c.assembler.SetJumpTargetOnNext(jmpIfNonZero) // First, we calculate the most significant set bit. if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register) } // Now we XOR the value with the bit length minus one. if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register) } else { c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register) } // Finally the end jump instruction of zero case must target towards // the next instruction. c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero) } // We reused the same register of target for the result. c.locationStack.markRegisterUnused(target.register) c.pushRuntimeValueLocationOnRegister(target.register, target.valueType) return nil } // compileCtz implements compiler.compileCtz for the amd64 architecture. func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error { target := c.locationStack.pop() if err := c.compileEnsureOnRegister(target); err != nil { return err } unsignedInt := wazeroir.UnsignedInt(o.B1) if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) { if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register) } } else { // On processors that do not support TZCNT, the BSF instruction is // executed instead. The key difference between TZCNT and BSF // instruction is that if source operand is zero, the content of // destination operand is undefined. // https://www.felixcloutier.com/x86/tzcnt.html // First we compare the target with zero. c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register) jmpIfNonZero := c.assembler.CompileJump(amd64.JNE) // If the value is zero, we just push the const value. if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register) } else { c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register) } // Emit the jmp instruction to jump to the position right after // the non-zero case. jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP) // Otherwise, emit the TZCNT. c.assembler.SetJumpTargetOnNext(jmpIfNonZero) if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register) } // Finally the end jump instruction of zero case must target towards // the next instruction. c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero) } // We reused the same register of target for the result. c.locationStack.markRegisterUnused(target.register) c.pushRuntimeValueLocationOnRegister(target.register, target.valueType) return nil } // compilePopcnt implements compiler.compilePopcnt for the amd64 architecture. 
func (c *amd64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error { target := c.locationStack.pop() if err := c.compileEnsureOnRegister(target); err != nil { return err } unsignedInt := wazeroir.UnsignedInt(o.B1) if unsignedInt == wazeroir.UnsignedInt32 { c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register) } // We reused the same register of target for the result. c.locationStack.markRegisterUnused(target.register) c.pushRuntimeValueLocationOnRegister(target.register, target.valueType) return nil } // compileDiv implements compiler.compileDiv for the amd64 architecture. func (c *amd64Compiler) compileDiv(o *wazeroir.UnionOperation) (err error) { signedType := wazeroir.SignedType(o.B1) switch signedType { case wazeroir.SignedTypeUint32: err = c.compileDivForInts(true, false) case wazeroir.SignedTypeUint64: err = c.compileDivForInts(false, false) case wazeroir.SignedTypeInt32: err = c.compileDivForInts(true, true) case wazeroir.SignedTypeInt64: err = c.compileDivForInts(false, true) case wazeroir.SignedTypeFloat32: err = c.compileDivForFloats(true) case wazeroir.SignedTypeFloat64: err = c.compileDivForFloats(false) } return } // compileDivForInts emits the instructions to perform division on the top // two values of integer type on the stack and puts the quotient of the result // onto the stack. For example, stack [..., 10, 3] results in [..., 3] where // the remainder is discarded. func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error { if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil { return err } // Now we have the quotient of the division result in the AX register, // so we record it. if is32Bit { c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32) } else { c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64) } return nil } // compileRem implements compiler.compileRem for the amd64 architecture. func (c *amd64Compiler) compileRem(o *wazeroir.UnionOperation) (err error) { var vt runtimeValueType signedInt := wazeroir.SignedInt(o.B1) switch signedInt { case wazeroir.SignedInt32: err = c.performDivisionOnInts(true, true, true) vt = runtimeValueTypeI32 case wazeroir.SignedInt64: err = c.performDivisionOnInts(true, false, true) vt = runtimeValueTypeI64 case wazeroir.SignedUint32: err = c.performDivisionOnInts(true, true, false) vt = runtimeValueTypeI32 case wazeroir.SignedUint64: err = c.performDivisionOnInts(true, false, false) vt = runtimeValueTypeI64 } if err != nil { return err } // Now we have the remainder of the division result in the DX register, // so we record it. c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt) return } // performDivisionOnInts emits the instructions to do divisions on top two integers on the stack // via DIV (unsigned div) and IDIV (signed div) instructions. // See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf // // >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and // >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of // >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the // >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). 
For // >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of // >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b // >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip). // // tl;dr is that the division result is placed in AX and DX registers after instructions emitted by this function // where AX holds the quotient while DX the remainder of the division result. func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error { const ( quotientRegister = amd64.RegAX remainderRegister = amd64.RegDX ) if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } // Ensures that previous values on these registers are saved to memory. c.onValueReleaseRegisterToStack(quotientRegister) c.onValueReleaseRegisterToStack(remainderRegister) // In order to ensure x2 is placed on a temporary register for x2 value other than AX and DX, // we mark them as used here. c.locationStack.markRegisterUsed(quotientRegister) c.locationStack.markRegisterUsed(remainderRegister) // Ensure that x2 is placed on a register which is not either AX or DX. x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } // Now we successfully place x2 on a temp register, so we no longer need to // mark these registers used. c.locationStack.markRegisterUnused(quotientRegister) c.locationStack.markRegisterUnused(remainderRegister) // Check if the x2 equals zero. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.TESTL, x2.register, x2.register) } else { c.assembler.CompileRegisterToRegister(amd64.TESTQ, x2.register, x2.register) } // Skipped if the divisor is nonzero. c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerDivisionByZero) // next, we ensure that x1 is placed on AX. x1 := c.locationStack.pop() if x1.onRegister() && x1.register != quotientRegister { // Move x1 to quotientRegister. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister) } else { c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister) } c.locationStack.markRegisterUnused(x1.register) x1.setRegister(quotientRegister) } else if x1.onStack() { x1.setRegister(quotientRegister) c.compileLoadValueOnStackToRegister(x1) } // Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX. isSignedRem := isRem && signed isSignedDiv := !isRem && signed var signedRemMinusOneDivisorJmp asm.Node if isSignedRem { // If this is for getting remainder of signed division, // we have to treat the special case where the divisor equals -1. // For example, if this is 32-bit case, the result of (-2^31) / -1 equals (quotient=2^31, remainder=0) // where quotient doesn't fit in the 32-bit range whose maximum is 2^31-1. // x86 in this case cause floating point exception, but according to the Wasm spec // if the divisor equals -1, the result must be zero (not undefined!) as opposed to be "undefined" // for divisions on (-2^31) / -1 where we do not need to emit the special branches. // For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception // First we store zero into the remainder result register (DX) and compare the divisor with -1. 
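	// For example, for i32.rem_s with dividend math.MinInt32 and divisor -1, the Wasm result is 0:
	// the XOR below pre-loads that 0 into DX, and the conditional jump skips the IDIV that would
	// otherwise raise the divide error (#DE) exception.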
if is32Bit { c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister) c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1) } else { c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister) c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1) } // If it equals minus one, we skip the normal case. signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JEQ) } else if isSignedDiv { // For signed division, we have to add branches for the "math.MinInt{32,64} / -1" // case, which raises a floating point (division error) exception on x86 because // the resulting value exceeds the maximum of the signed int. // First we compare the divisor with -1. if is32Bit { c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1) } else { c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1) } // If it doesn't equal minus one, we jump to the normal case. nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE) // Next we check if the dividend (x1) is the most negative value for the signed integer, // i.e. whether we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively. if is32Bit { if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register, c.minimum32BitSignedInt); err != nil { return err } } else { if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register, c.minimum64BitSignedInt); err != nil { return err } } // Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1), // as that overflows the division: the result would be 2^31 (resp. 2^63), which is larger than // the maximum signed 32-bit (resp. 64-bit) int. c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerOverflow) // Set the normal case's jump target. c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp) } // Now ready to emit the div instruction. // Since the div instructions take a 2n-byte dividend placed in the DX:AX registers... // * signed case - we need to sign-extend the dividend into the DX register via CDQ (32 bit) or CQO (64 bit). // * unsigned case - we need to zero the DX register via "XOR DX DX" if is32Bit && signed { // Emit sign-extension to have a 64 bit dividend over the DX and AX registers. c.assembler.CompileStandAlone(amd64.CDQ) c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register) } else if is32Bit && !signed { // Zero the DX register to have a 64 bit dividend over the DX and AX registers. c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX) c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register) } else if !is32Bit && signed { // Emit sign-extension to have a 128 bit dividend over the DX and AX registers. c.assembler.CompileStandAlone(amd64.CQO) c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register) } else if !is32Bit && !signed { // Zero the DX register to have a 128 bit dividend over the DX and AX registers. c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX) c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register) } // If this is a signed rem instruction, we must set the jump target of // the exit jump from the divisor=-1 case towards the next instruction. if signedRemMinusOneDivisorJmp != nil { c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp) } // We mark them as unused so that we can push one of them onto the location stack at call sites.
c.locationStack.markRegisterUnused(remainderRegister) c.locationStack.markRegisterUnused(quotientRegister) c.locationStack.markRegisterUnused(x2.register) return nil } // compileDivForFloats emits the instructions to perform division // on the top two values of float type on the stack, placing the result back onto the stack. // For example, stack [..., 1.0, 4.0] results in [..., 0.25]. func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error { if is32Bit { return c.compileSimpleBinaryOp(amd64.DIVSS) } else { return c.compileSimpleBinaryOp(amd64.DIVSD) } } // compileAnd implements compiler.compileAnd for the amd64 architecture. func (c *amd64Compiler) compileAnd(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileSimpleBinaryOp(amd64.ANDL) case wazeroir.UnsignedInt64: err = c.compileSimpleBinaryOp(amd64.ANDQ) } return } // compileOr implements compiler.compileOr for the amd64 architecture. func (c *amd64Compiler) compileOr(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileSimpleBinaryOp(amd64.ORL) case wazeroir.UnsignedInt64: err = c.compileSimpleBinaryOp(amd64.ORQ) } return } // compileXor implements compiler.compileXor for the amd64 architecture. func (c *amd64Compiler) compileXor(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileSimpleBinaryOp(amd64.XORL) case wazeroir.UnsignedInt64: err = c.compileSimpleBinaryOp(amd64.XORQ) } return } // compileSimpleBinaryOp emits instructions to pop two values from the stack // and perform the given instruction on these two values and push the result // onto the stack. func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register) // We consumed x2 register after the operation here, // so we release it. c.locationStack.releaseRegister(x2) // We already stored the result in the register used by x1 // so we record it. c.locationStack.markRegisterUnused(x1.register) c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType) return nil } // compileShl implements compiler.compileShl for the amd64 architecture. func (c *amd64Compiler) compileShl(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileShiftOp(amd64.SHLL, false) case wazeroir.UnsignedInt64: err = c.compileShiftOp(amd64.SHLQ, true) } return } // compileShr implements compiler.compileShr for the amd64 architecture. func (c *amd64Compiler) compileShr(o *wazeroir.UnionOperation) (err error) { signedInt := wazeroir.SignedInt(o.B1) switch signedInt { case wazeroir.SignedInt32: err = c.compileShiftOp(amd64.SARL, true) case wazeroir.SignedInt64: err = c.compileShiftOp(amd64.SARQ, false) case wazeroir.SignedUint32: err = c.compileShiftOp(amd64.SHRL, true) case wazeroir.SignedUint64: err = c.compileShiftOp(amd64.SHRQ, false) } return } // compileRotl implements compiler.compileRotl for the amd64 architecture. 
func (c *amd64Compiler) compileRotl(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileShiftOp(amd64.ROLL, true) case wazeroir.UnsignedInt64: err = c.compileShiftOp(amd64.ROLQ, false) } return } // compileRotr implements compiler.compileRotr for the amd64 architecture. func (c *amd64Compiler) compileRotr(o *wazeroir.UnionOperation) (err error) { unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.compileShiftOp(amd64.RORL, true) case wazeroir.UnsignedInt64: err = c.compileShiftOp(amd64.RORQ, false) } return } // compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL) // where we have to place the second value (shift counts) on the CX register. func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } x2 := c.locationStack.pop() // Ensures that x2 (holding shift counts) is placed on the CX register. const shiftCountRegister = amd64.RegCX if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() { // If another value lives on the CX register, we release it to the stack. c.onValueReleaseRegisterToStack(shiftCountRegister) if x2.onRegister() { x2r := x2.register // If x2 lives on a register, we move the value to CX. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.MOVL, x2r, shiftCountRegister) } else { c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2r, shiftCountRegister) } // We no longer place any value on the original register, so we record it. c.locationStack.markRegisterUnused(x2r) } else { // If it is on stack, we just move the memory allocated value to the CX register. x2.setRegister(shiftCountRegister) c.compileLoadValueOnStackToRegister(x2) } c.locationStack.markRegisterUsed(shiftCountRegister) } x1 := c.locationStack.peek() // Note this is peek! x1r := x1.register if x1.onRegister() { c.assembler.CompileRegisterToRegister(instruction, shiftCountRegister, x1r) } else { // Shift target can be placed on a memory location. // Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range. c.assembler.CompileRegisterToMemory(instruction, shiftCountRegister, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8) } // We consumed x2 register after the operation here, // so we release it. c.locationStack.markRegisterUnused(shiftCountRegister) return nil } // compileAbs implements compiler.compileAbs for the amd64 architecture. // // See the following discussions for how we could take the abs of floats on x86 assembly. // https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471 // https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation func (c *amd64Compiler) compileAbs(o *wazeroir.UnionOperation) (err error) { target := c.locationStack.peek() // Note this is peek! if err = c.compileEnsureOnRegister(target); err != nil { return err } // First shift left by one to clear the sign bit, and then shift right by one. 
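// In terms of the float's bit pattern, the two shifts amount to clearing the most significant (sign) bit.
// An illustrative Go equivalent for the 32-bit case (hypothetical helper name, not part of the generated code):
//
//	func absF32(v float32) float32 {
//		b := math.Float32bits(v)
//		b = (b << 1) >> 1 // drop bit 31, i.e. the same as b &^ (1 << 31)
//		return math.Float32frombits(b)
//	}
//
// The 64-bit case is identical with math.Float64bits/math.Float64frombits and bit 63.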
if wazeroir.Float(o.B1) == wazeroir.Float32 { c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register) c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register) } else { c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register) c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register) } return nil } // compileNeg implements compiler.compileNeg for the amd64 architecture. func (c *amd64Compiler) compileNeg(o *wazeroir.UnionOperation) (err error) { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } tmpReg, err := c.allocateRegister(registerTypeVector) if err != nil { return err } // First we move the sign-bit mask (placed in memory) to the tmp register, // since we cannot take XOR directly with float reg and const. // And then negate the value by XOR it with the sign-bit mask. if wazeroir.Float(o.B1) == wazeroir.Float32 { err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg) if err != nil { return err } c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg) if err != nil { return err } c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register) } return nil } // compileCeil implements compiler.compileCeil for the amd64 architecture. func (c *amd64Compiler) compileCeil(o *wazeroir.UnionOperation) (err error) { // Internally, ceil can be performed via ROUND instruction with 0x02 mode. // See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example. return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x02) } // compileFloor implements compiler.compileFloor for the amd64 architecture. func (c *amd64Compiler) compileFloor(o *wazeroir.UnionOperation) (err error) { // Internally, floor can be performed via ROUND instruction with 0x01 mode. // See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example. return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x01) } // compileTrunc implements compiler.compileTrunc for the amd64 architecture. func (c *amd64Compiler) compileTrunc(o *wazeroir.UnionOperation) error { // Internally, trunc can be performed via ROUND instruction with 0x03 mode. // See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example. return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x03) } // compileNearest implements compiler.compileNearest for the amd64 architecture. func (c *amd64Compiler) compileNearest(o *wazeroir.UnionOperation) error { // Nearest can be performed via ROUND instruction with 0x00 mode. return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x00) } func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } if is32Bit { c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode)) } else { c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode)) } return nil } // compileMin implements compiler.compileMin for the amd64 architecture. 
func (c *amd64Compiler) compileMin(o *wazeroir.UnionOperation) error { is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32 if is32Bit { return c.compileMinOrMax(is32Bit, true, amd64.MINSS) } else { return c.compileMinOrMax(is32Bit, true, amd64.MINSD) } } // compileMax implements compiler.compileMax for the amd64 architecture. func (c *amd64Compiler) compileMax(o *wazeroir.UnionOperation) error { is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32 if is32Bit { return c.compileMinOrMax(is32Bit, false, amd64.MAXSS) } else { return c.compileMinOrMax(is32Bit, false, amd64.MAXSD) } } // compileMinOrMax adds instructions to pop two values from the stack, and push back either the minimum or // the maximum of these two values onto the stack according to the minOrMaxInstruction argument. // minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD. // Note: These native min/max instructions are almost compatible with min/max in the Wasm specification, // but they differ slightly with respect to the NaN handling. // Native min/max instructions do not propagate NaN the way Wasm requires: when either operand is NaN, // they simply return the second (source) operand, so e.g. native_{min,max}(NaN, 5.0) returns 5.0, not NaN. // However, WebAssembly specifies that min/max must always return NaN if one of the values is NaN. // Therefore, in this function, we have to add conditional jumps to check if one of the values is NaN before // the native min/max, which is why we cannot simply emit a native min/max instruction here. // // For the semantics, see wazeroir.Min and wazeroir.Max for detail. func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } // Check whether this is the (either x1 or x2 is NaN) or (x1 equals x2) case. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register) } else { c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register) } // At this point, we have the three cases of conditional flags below // (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.) // // 1) Two values are NaN-free and different: ZF and PF are cleared (CF is set only when x1 < x2). // 2) Two values are NaN-free and equal: Only the ZF flag is set. // 3) One of the two values is NaN: ZF, PF and CF are all set. // Emit a jump to handle case 1) by checking the ZF flag, // as ZF is set only for cases 2) and 3). nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE) // Start handling 2) and 3). // Jump if one of the two values is NaN by checking the parity flag (PF). includeNaNJmp := c.assembler.CompileJump(amd64.JPS) // Start handling 2). // Before we exit this case, we have to ensure that positive zero (or negative zero for the min instruction) is // returned if the two values are positive and negative zeros. var inst asm.Instruction switch { case is32Bit && isMin: inst = amd64.ORPS case !is32Bit && isMin: inst = amd64.ORPD case is32Bit && !isMin: inst = amd64.ANDPS case !is32Bit && !isMin: inst = amd64.ANDPD } c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register) sameExitJmp := c.assembler.CompileJump(amd64.JMP) // Start handling 3). c.assembler.SetJumpTargetOnNext(includeNaNJmp) // We emit the ADD instruction to produce the NaN in x1.
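// Any IEEE 754 arithmetic operation with a NaN operand yields a (quiet) NaN, so adding x2 to x1 forces a NaN
// into x1 regardless of which operand was NaN. Putting the three branches together, the semantics implemented
// here match Go's math.Min/math.Max; an illustrative sketch for 64-bit min (hypothetical helper name, not part
// of the generated code):
//
//	func wasmF64Min(x1, x2 float64) float64 {
//		switch {
//		case math.IsNaN(x1) || math.IsNaN(x2): // branch 3): NaN propagation.
//			return math.NaN()
//		case x1 == x2: // branch 2): only ±0 vs ±0 needs care; ORing the bit patterns yields -0 if either is -0 (max uses AND).
//			return math.Float64frombits(math.Float64bits(x1) | math.Float64bits(x2))
//		default: // branch 1): plain MINSD.
//			return math.Min(x1, x2)
//		}
//	}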
if is32Bit { c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register) } else { c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register) } // Exit from the NaN case branch. nanExitJmp := c.assembler.CompileJump(amd64.JMP) // Start handling 1). c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump) // Now handle the NaN-free and different values case. c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register) // Set the jump target of 1) and 2) cases to the next instruction after 3) case. c.assembler.SetJumpTargetOnNext(nanExitJmp) c.assembler.SetJumpTargetOnNext(sameExitJmp) // Record that we consumed the x2 and placed the minOrMax result in the x1's register. c.locationStack.markRegisterUnused(x2.register) c.locationStack.markRegisterUnused(x1.register) c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType) return nil } // compileCopysign implements compiler.compileCopysign for the amd64 architecture. func (c *amd64Compiler) compileCopysign(o *wazeroir.UnionOperation) error { is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32 x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } tmpReg, err := c.allocateRegister(registerTypeVector) if err != nil { return err } // Move the rest bit mask to the temp register. if is32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32RestBitMask, tmpReg) } else { err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64RestBitMask, tmpReg) } if err != nil { return err } // Clear the sign bit of x1 via AND with the mask. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register) } else { c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register) } // Move the sign bit mask to the temp register. if is32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg) } else { err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg) } if err != nil { return err } // Clear the non-sign bits of x2 via AND with the mask. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register) } else { c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register) } // Finally, copy the sign bit of x2 to x1. if is32Bit { c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register) } else { c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register) } // Record that we consumed the x2 and placed the copysign result in the x1's register. c.locationStack.markRegisterUnused(x2.register) c.locationStack.markRegisterUnused(x1.register) c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType) return nil } // compileSqrt implements compiler.compileSqrt for the amd64 architecture. func (c *amd64Compiler) compileSqrt(o *wazeroir.UnionOperation) error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } if wazeroir.Float(o.B1) == wazeroir.Float32 { c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register) } else { c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register) } return nil } // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture. 
func (c *amd64Compiler) compileI32WrapFromI64() error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register) target.valueType = runtimeValueTypeI32 return nil } // compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture. // // Note: in the following implementation, we use CVTTSS2SI and CVTTSD2SI (truncating conversions) to convert floats to signed integers. // According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds the representable range // of the target signed integer, then the instruction returns the "integer indefinite" value, whose bit pattern equals float32SignBitMask (or float64SignBitMask for the 64 bit case). // [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual" // // https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html // // [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html func (c *amd64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) (err error) { inputType := wazeroir.Float(o.B1) outputType := wazeroir.SignedInt(o.B2) nonTrapping := o.B3 if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt32 { err = c.emitSignedI32TruncFromFloat(true, nonTrapping) } else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt64 { err = c.emitSignedI64TruncFromFloat(true, nonTrapping) } else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt32 { err = c.emitSignedI32TruncFromFloat(false, nonTrapping) } else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt64 { err = c.emitSignedI64TruncFromFloat(false, nonTrapping) } else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint32 { err = c.emitUnsignedI32TruncFromFloat(true, nonTrapping) } else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint64 { err = c.emitUnsignedI64TruncFromFloat(true, nonTrapping) } else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint32 { err = c.emitUnsignedI32TruncFromFloat(false, nonTrapping) } else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint64 { err = c.emitUnsignedI64TruncFromFloat(false, nonTrapping) } return } // emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer. func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error { source := c.locationStack.pop() if err := c.compileEnsureOnRegister(source); err != nil { return err } result, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First, we check whether the source float value is above or equal to math.MaxInt32+1. if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned32bitIntPlusOne, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned32bitIntPlusOne, source.register) } if err != nil { return err } // Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception. var nonTrappingNaNJump asm.Node if nonTrapping { jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set. // In the non-trapping case, NaN is converted to zero. // Zero out the result register by XORing it with itself.
c.assembler.CompileRegisterToRegister(amd64.XORL, result, result) nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotNaN) } else { c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion) } // Jump if the source float value is above or equal to math.MaxInt32+1. jmpAboveOrEqualMaxIn32PlusOne := c.assembler.CompileJump(amd64.JCC) // Next we convert the value to a signed integer. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result) } // Then, if the result is negative, it was an invalid conversion from a negative float (incl. -Inf). c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result) var nonTrappingMinusJump asm.Node if nonTrapping { jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL) // In the non-trapping case, the negative value is converted to zero. // Zero out the result register by XORing it with itself. c.assembler.CompileRegisterToRegister(amd64.XORL, result, result) nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf) } else { c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow) } // Otherwise, the value is valid. okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP) // Now, start handling the case where the original float value is above or equal math.MaxInt32+1. // // First, we subtract math.MaxInt32+1 from the original value so it can fit in a signed 32-bit integer. c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxIn32PlusOne) if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned32bitIntPlusOne, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned32bitIntPlusOne, source.register) } if err != nil { return err } // Then, convert the subtracted value to a signed 32-bit integer. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result) } // Next, we have to check if the value came from NaN or +Inf. // The NaN and +Inf cases result in 0x8000_0000 according to the semantics of the conversion, // so we check whether the resulting int value is negative or not. c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result) // If the result is negative, the conversion is invalid (from NaN or +Inf). var nonTrappingAboveOrEqualMaxInt32PlusOne asm.Node if nonTrapping { jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL) err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitUnsignedInt, result) if err != nil { return err } nonTrappingAboveOrEqualMaxInt32PlusOne = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf) } else { c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow) } // Otherwise, we successfully converted (the source float minus math.MaxInt32+1) to int. // So, we recover the original value by adding math.MaxInt32+1 (whose bit pattern equals the sign mask) back. if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL, c.float32SignBitMask, result); err != nil { return err } // We jump to the next instructions for valid cases.
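// (For reference, when nonTrapping is true the branches above implement the saturating semantics of
// i32.trunc_sat_f{32,64}_u, which can be sketched in Go as follows. Illustrative only, hypothetical helper name:
//
//	func truncSatU32(f float64) uint32 {
//		switch {
//		case math.IsNaN(f):
//			return 0
//		case f < 0:
//			return 0 // negative inputs truncate or saturate to zero.
//		case f >= 4294967296.0: // 2^32
//			return math.MaxUint32
//		default:
//			return uint32(f)
//		}
//	}
//
// The trapping variant instead exits with an invalid-conversion or integer-overflow status in those cases.)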
c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne) if nonTrapping { c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt32PlusOne) c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump) c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump) } // We consumed the source's register and placed the conversion result // in the result register. c.locationStack.markRegisterUnused(source.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) return nil } // emitUnsignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer. func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error { source := c.locationStack.pop() if err := c.compileEnsureOnRegister(source); err != nil { return err } result, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First, we check whether the source float value is above or equal to math.MaxInt64+1. if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned64bitIntPlusOne, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned64bitIntPlusOne, source.register) } if err != nil { return err } // Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception. var nonTrappingNaNJump asm.Node if nonTrapping { jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set. // In the non-trapping case, NaN is converted to zero. // Zero out the result register by XORing it with itself. c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result) nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotNaN) } else { c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion) } // Jump if the source float value is above or equal to math.MaxInt64+1. jmpAboveOrEqualMaxIn32PlusOne := c.assembler.CompileJump(amd64.JCC) // Next we convert the value to a signed integer. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result) } // Then, if the result is negative, it was an invalid conversion from a negative float (incl. -Inf). c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result) var nonTrappingMinusJump asm.Node if nonTrapping { jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL) // In the non-trapping case, the negative value is converted to zero. // Zero out the result register by XORing it with itself. c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result) nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf) } else { c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow) } // Otherwise, the value is valid. okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP) // Now, start handling the case where the original float value is above or equal math.MaxInt64+1. // // First, we subtract math.MaxInt64+1 from the original value so it can fit in a signed 64-bit integer.
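// (A concrete example of the trick used below, illustrative only and not part of the generated code:
// to convert f = 2^63 + 2048, which is exactly representable as a float64 but too large for a signed
// CVTTSD2SI, we compute f - 2^63 = 2048, convert that to int64, and then add the sign bit (2^63) back:
//
//	func truncLargeU64(f float64) uint64 { // assumes 2^63 <= f < 2^64, not NaN.
//		g := f - 9223372036854775808.0 // math.MaxInt64 + 1 == 2^63
//		return uint64(int64(g)) + (1 << 63)
//	}
//
// +Inf survives the subtraction unchanged, which is why the sign check further below is still needed.)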
c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxIn32PlusOne) if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned64bitIntPlusOne, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned64bitIntPlusOne, source.register) } if err != nil { return err } // Then, convert the subtracted value to a signed 64-bit integer. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result) } // Next, we have to check if the value came from NaN or +Inf. // The NaN and +Inf cases result in 0x8000_0000_0000_0000 according to the semantics of the conversion, // so we check whether the resulting int value is negative or not. c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result) // If the result is negative, the conversion is invalid (from NaN or +Inf). var nonTrappingAboveOrEqualMaxInt64PlusOne asm.Node if nonTrapping { jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL) err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitUnsignedInt, result) if err != nil { return err } nonTrappingAboveOrEqualMaxInt64PlusOne = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf) } else { c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow) } // Otherwise, we successfully converted (the source float minus math.MaxInt64+1) to int. // So, we recover the original value by adding math.MaxInt64+1 (whose bit pattern equals the sign mask) back. if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ, c.float64SignBitMask, result); err != nil { return err } // We jump to the next instructions for valid cases. c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne) if nonTrapping { c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt64PlusOne) c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump) c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump) } // We consumed the source's register and placed the conversion result // in the result register. c.locationStack.markRegisterUnused(source.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) return nil } // emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer. func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error { source := c.locationStack.pop() if err := c.compileEnsureOnRegister(source); err != nil { return err } result, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First we unconditionally convert the source to an integer via CVTTSS2SI (CVTTSD2SI for a 64bit float). if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result) } // We compare the conversion result with the sign bit mask to detect the cases where either // 1) the source float value is +-Inf or NaN, or it exceeds the representable range of a 32-bit signed integer, or // 2) the source equals the minimum signed 32-bit value (=-2147483648.000000), whose bit pattern is float32ForMinimumSigned32bitInteger for a 32-bit float // or float64ForMinimumSigned32bitInteger for a 64-bit float. err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.float32SignBitMask, result) if err != nil { return err } // Otherwise, jump to exit as the result is valid.
okJmp := c.assembler.CompileJump(amd64.JNE) // Start handling the case of 1) and 2). // First, check if the value is NaN. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register) } else { c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register) } // Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception. var nontrappingNanJump asm.Node if nonTrapping { jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set. // In the non-trapping case, NaN is converted to zero. // Zero out the result register by XORing it with itself. c.assembler.CompileRegisterToRegister(amd64.XORL, result, result) nontrappingNanJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotNaN) } else { // If the value is NaN, we exit the function with nativeCallStatusCodeInvalidFloatToIntConversion. c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion) } // Check if the value is larger than or equal to the minimum 32-bit integer value, // i.e. whether it stays within the lower bound of the 32-bit signed integer range. if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned32bitInteger, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned32bitInteger, source.register) } if err != nil { return err } if !nonTrapping { // Trap if the value falls below the lower bound. if isFloat32Bit { c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow) } else { c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusIntegerOverflow) } // At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than the 32-bit maximum. // So, check if the value equals the minimum signed 32-bit int. if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register) } if err != nil { return err } // Trap if the value is not negative (i.e. not the minimum signed 32-bit int). c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow) // We jump to the next instructions for valid cases. c.assembler.SetJumpTargetOnNext(okJmp) } else { // Jump if the value does not fall below the lower bound. var jmpIfNotExceedsLowerBound asm.Node if isFloat32Bit { jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC) } else { jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI) } // If the value falls below the lower bound, we "saturate" it to the minimum. if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.minimum32BitSignedInt, result); err != nil { return err } nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP) // Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than the 32-bit maximum. c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound) if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register) } if err != nil { return err } jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is negative (= the minimum signed 32-bit int). // If the value exceeds the signed 32-bit maximum, we saturate it to the maximum.
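// (Putting the non-trapping branches together, this path implements the saturating semantics of
// i32.trunc_sat_f{32,64}_s, sketched in Go as follows. Illustrative only, hypothetical helper name:
//
//	func truncSatS32(f float64) int32 {
//		switch {
//		case math.IsNaN(f):
//			return 0
//		case f <= math.MinInt32:
//			return math.MinInt32 // saturate to the minimum (also covers exactly -2^31).
//		case f >= math.MaxInt32+1:
//			return math.MaxInt32 // saturate to the maximum, which is what the MOVL just below does.
//		default:
//			return int32(f)
//		}
//	})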
if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitSignedInt, result); err != nil { return err } c.assembler.SetJumpTargetOnNext(okJmp) c.assembler.SetJumpTargetOnNext(nontrappingNanJump) c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump) c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt) } // We consumed the source's register and placed the conversion result // in the result register. c.locationStack.markRegisterUnused(source.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) return nil } // emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer. func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error { source := c.locationStack.pop() if err := c.compileEnsureOnRegister(source); err != nil { return err } result, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // First we unconditionally convert the source to an integer via CVTTSS2SI (CVTTSD2SI for a 64bit float). if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result) } else { c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result) } // We compare the conversion result with the sign bit mask to detect the cases where either // 1) the source float value is +-Inf or NaN, or it exceeds the representable range of a 64-bit signed integer, or // 2) the source equals the minimum signed 64-bit value (=-9223372036854775808.0), whose bit pattern is float32ForMinimumSigned64bitInteger for a 32-bit float // or float64ForMinimumSigned64bitInteger for a 64-bit float. err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.float64SignBitMask, result) if err != nil { return err } // Otherwise, we simply jump to exit as the result is valid. okJmp := c.assembler.CompileJump(amd64.JNE) // Start handling the case of 1) and 2). // First, check if the value is NaN. if isFloat32Bit { c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register) } else { c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register) } // Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception. var nontrappingNanJump asm.Node if nonTrapping { jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set. // In the non-trapping case, NaN is converted to zero. // Zero out the result register by XORing it with itself. c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result) nontrappingNanJump = c.assembler.CompileJump(amd64.JMP) c.assembler.SetJumpTargetOnNext(jmpIfNotNaN) } else { c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion) } // Check if the value is larger than or equal to the minimum 64-bit integer value, // i.e. whether it stays within the lower bound of the 64-bit signed integer range. if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned64bitInteger, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned64bitInteger, source.register) } if err != nil { return err } if !nonTrapping { // Trap if the value falls below the lower bound (e.g. -Inf). c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow) // At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than the 64-bit maximum. // So, check if the value equals the minimum signed 64-bit int.
if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register) } if err != nil { return err } // Trap if the value is not minus (= the minimum signed 64-bit int). c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow) // We jump to the next instructions for valid cases. c.assembler.SetJumpTargetOnNext(okJmp) } else { // Jump if the value is not -Inf. jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC) // If the value exceeds the lower bound, we "saturate" it to the minimum. err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.minimum64BitSignedInt, result) if err != nil { return err } nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP) // Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum. // So, check if the value equals the minimum signed 64-bit int. c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound) if isFloat32Bit { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register) } else { err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register) } if err != nil { return err } jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 64-bit int). // If the value exceeds signed 64-bit maximum, we saturate it to the maximum. if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitSignedInt, result); err != nil { return err } c.assembler.SetJumpTargetOnNext(okJmp) c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt) c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump) c.assembler.SetJumpTargetOnNext(nontrappingNanJump) } // We consumed the source's register and placed the conversion result // in the result register. c.locationStack.markRegisterUnused(source.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) return nil } // compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture. func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) (err error) { inputType := wazeroir.SignedInt(o.B1) outputType := wazeroir.Float(o.B2) if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 { err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int } else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 { err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int } else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 { err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int } else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 { err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int } else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 { // See the following link for why we use 64bit conversion for unsigned 32bit integer sources: // https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float. 
// // Here's the summary: // >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float, // >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide // >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values, // >> which allows CVTSI2SS to be used after all. err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int. } else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 { // For the same reason above, we use 64bit conversion for unsigned 32bit. err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int. } else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 { err = c.emitUnsignedInt64ToFloatConversion(true) } else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 { err = c.emitUnsignedInt64ToFloatConversion(false) } return } // emitUnsignedInt64ToFloatConversion is handling the case of unsigned 64-bit integer // in compileFConvertFromI. func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error { // The logic here is exactly the same as GCC emits for the following code: // // float convert(int num) { // float foo; // uint64_t ptr1 = 100; // foo = (float)(ptr1); // return foo; // } // // which is compiled by GCC as // // convert: // push rbp // mov rbp, rsp // mov DWORD PTR [rbp-20], edi // mov DWORD PTR [rbp-4], 100 // mov eax, DWORD PTR [rbp-4] // test rax, rax // js .handle_sign_bit_case // cvtsi2ss xmm0, rax // jmp .exit // .handle_sign_bit_case: // mov rdx, rax // shr rdx // and eax, 1 // or rdx, rax // cvtsi2ss xmm0, rdx // addsd xmm0, xmm0 // .exit: ... // // tl;dr is that we have a branch depending on whether or not sign bit is set. origin := c.locationStack.pop() if err := c.compileEnsureOnRegister(origin); err != nil { return err } dest, err := c.allocateRegister(registerTypeVector) if err != nil { return err } c.locationStack.markRegisterUsed(dest) tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // Check if the most significant bit (sign bit) is set. c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register) // Jump if the sign bit is set. jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI) // Otherwise, we could fit the unsigned int into float32. // So, we convert it to float32 and emit jump instruction to exit from this branch. if isFloat32bit { c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest) } else { c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest) } exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP) // Now handling the case where sign-bit is set. 
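// An equivalent of the whole conversion in plain Go (illustrative only, hypothetical helper name) —
// when the sign bit is set, the value is halved with the shifted-out low bit ORed back in (round-to-odd),
// converted as a signed integer, and then doubled:
//
//	func u64ToF64(v uint64) float64 {
//		if v < 1<<63 {
//			return float64(int64(v)) // sign bit clear: a plain signed conversion gives the correctly rounded result.
//		}
//		half := (v >> 1) | (v & 1) // keep the lost bit so the final rounding is still correct.
//		return float64(int64(half)) * 2
//	}
//
// The float32 variant has the same structure; the final multiplication corresponds to the ADDSS/ADDSD of the
// destination with itself below.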
// We emit the following sequences: // mov tmpReg, origin // shr tmpReg, 1 // and origin, 1 // or tmpReg, origin // cvtsi2ss xmm0, tmpReg // addsd xmm0, xmm0 c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet) c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg) c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg) c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register) c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg) if isFloat32bit { c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest) } else { c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest) } if isFloat32bit { c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest) } else { c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest) } // Now, we finished the sign-bit set branch. // We have to make the exit jump target of sign-bit unset branch // towards the next instruction. c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet) // We consumed the origin's register and placed the conversion result // in the dest register. c.locationStack.markRegisterUnused(origin.register) if isFloat32bit { c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32) } else { c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64) } return nil } // compileSimpleConversion pops a value type from the stack, and applies the // given instruction on it, and push the result onto a register of the given type. func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction, destinationRegisterType registerType, destinationValueType runtimeValueType, ) error { origin := c.locationStack.pop() if err := c.compileEnsureOnRegister(origin); err != nil { return err } dest, err := c.allocateRegister(destinationRegisterType) if err != nil { return err } c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest) c.locationStack.markRegisterUnused(origin.register) c.pushRuntimeValueLocationOnRegister(dest, destinationValueType) return nil } // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture. func (c *amd64Compiler) compileF32DemoteFromF64() error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register) target.valueType = runtimeValueTypeF32 return nil } // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture. func (c *amd64Compiler) compileF64PromoteFromF32() error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register) target.valueType = runtimeValueTypeF64 return nil } // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture. func (c *amd64Compiler) compileI32ReinterpretFromF32() error { if peek := c.locationStack.peek(); peek.onStack() { // If the value is on the stack, this is no-op as there is nothing to do for converting type. peek.valueType = runtimeValueTypeI32 return nil } return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32) } // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture. 
func (c *amd64Compiler) compileI64ReinterpretFromF64() error { if peek := c.locationStack.peek(); peek.onStack() { // If the value is on the stack, this is no-op as there is nothing to do for converting type. peek.valueType = runtimeValueTypeI64 return nil } return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64) } // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture. func (c *amd64Compiler) compileF32ReinterpretFromI32() error { if peek := c.locationStack.peek(); peek.onStack() { // If the value is on the stack, this is no-op as there is nothing to do for converting type. peek.valueType = runtimeValueTypeF32 return nil } return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32) } // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture. func (c *amd64Compiler) compileF64ReinterpretFromI64() error { if peek := c.locationStack.peek(); peek.onStack() { // If the value is on the stack, this is no-op as there is nothing to do for converting type. peek.valueType = runtimeValueTypeF64 return nil } return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64) } // compileExtend implements compiler.compileExtend for the amd64 architecture. func (c *amd64Compiler) compileExtend(o *wazeroir.UnionOperation) error { var inst asm.Instruction signed := o.B1 != 0 if signed { inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd } else { inst = amd64.MOVL } return c.compileExtendImpl(inst, runtimeValueTypeI64) } // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture. func (c *amd64Compiler) compileSignExtend32From8() error { return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32) } // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture. func (c *amd64Compiler) compileSignExtend32From16() error { return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32) } // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture. func (c *amd64Compiler) compileSignExtend64From8() error { return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64) } // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture. func (c *amd64Compiler) compileSignExtend64From16() error { return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64) } // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture. func (c *amd64Compiler) compileSignExtend64From32() error { return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64) } func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error { target := c.locationStack.peek() // Note this is peek! if err := c.compileEnsureOnRegister(target); err != nil { return err } c.assembler.CompileRegisterToRegister(inst, target.register, target.register) target.valueType = destinationType return nil } // compileEq implements compiler.compileEq for the amd64 architecture. func (c *amd64Compiler) compileEq(o *wazeroir.UnionOperation) error { return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), true) } // compileNe implements compiler.compileNe for the amd64 architecture. 
func (c *amd64Compiler) compileNe(o *wazeroir.UnionOperation) error { return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), false) } func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } x1r, x2r := x1.register, x2.register // x1 and x2 are temporary registers only used for the cmp operation. Release them. c.locationStack.releaseRegister(x1) c.locationStack.releaseRegister(x2) switch t { case wazeroir.UnsignedTypeI32: err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPL, shouldEqual) case wazeroir.UnsignedTypeI64: err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPQ, shouldEqual) case wazeroir.UnsignedTypeF32: err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISS, shouldEqual) case wazeroir.UnsignedTypeF64: err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISD, shouldEqual) } if err != nil { return } return } func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool, ) error { c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg) // Record that the result is on the conditional register. var condReg asm.ConditionalRegisterState if shouldEqual { condReg = amd64.ConditionalRegisterStateE } else { condReg = amd64.ConditionalRegisterStateNE } loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg) loc.valueType = runtimeValueTypeI32 return nil } // For float EQ and NE, we have to take NaN values into account. // Notably, the Wasm specification states that if one of the operands is NaN, // the result must be zero for EQ and one for NE. func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error { // Before we allocate the result, we have to reserve two int registers. nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.locationStack.markRegisterUsed(nanFragReg) cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // Then, execute the comparison. c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg) // First, we get the parity flag, which indicates whether one of the values was NaN. if shouldEqual { // Set 1 if neither value is NaN. c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg) } else { // Set 1 if one of the values is NaN. c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg) } // Next, we get the usual comparison flag. if shouldEqual { // Set 1 if equal. c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg) } else { // Set 1 if not equal. c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg) } // Do "and" or "or" operations on these two flags to get the actual result. if shouldEqual { c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg) } else { c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg) } // Clear the unnecessary bits by zero-extending the first byte. // This is necessary because SET* only writes the lowest byte, leaving the upper bits (8 to 31) of the register unmodified. c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg) // Now we have the result in the cmpResultReg register, so we record it. c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32) // Also, we no longer need nanFragReg.
c.locationStack.markRegisterUnused(nanFragReg) return nil } // compileEqz implements compiler.compileEqz for the amd64 architecture. func (c *amd64Compiler) compileEqz(o *wazeroir.UnionOperation) (err error) { v := c.locationStack.pop() if err = c.compileEnsureOnRegister(v); err != nil { return err } unsignedInt := wazeroir.UnsignedInt(o.B1) switch unsignedInt { case wazeroir.UnsignedInt32: err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.fourZeros, v.register) case wazeroir.UnsignedInt64: err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.eightZeros, v.register) } if err != nil { return err } // v is consumed by the cmp operation so release it. c.locationStack.releaseRegister(v) // Finally, record that the result is on the conditional register. loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE) loc.valueType = runtimeValueTypeI32 return nil } // compileLt implements compiler.compileLt for the amd64 architecture. func (c *amd64Compiler) compileLt(o *wazeroir.UnionOperation) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } // Emit the compare instruction. var resultConditionState asm.ConditionalRegisterState var inst asm.Instruction signedType := wazeroir.SignedType(o.B1) switch signedType { case wazeroir.SignedTypeInt32: resultConditionState = amd64.ConditionalRegisterStateL inst = amd64.CMPL case wazeroir.SignedTypeUint32: resultConditionState = amd64.ConditionalRegisterStateB inst = amd64.CMPL case wazeroir.SignedTypeInt64: inst = amd64.CMPQ resultConditionState = amd64.ConditionalRegisterStateL case wazeroir.SignedTypeUint64: resultConditionState = amd64.ConditionalRegisterStateB inst = amd64.CMPQ case wazeroir.SignedTypeFloat32: resultConditionState = amd64.ConditionalRegisterStateA inst = amd64.COMISS case wazeroir.SignedTypeFloat64: resultConditionState = amd64.ConditionalRegisterStateA inst = amd64.COMISD } c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register) // x1 and x2 are temporary registers only used for the cmp operation. Release them. c.locationStack.releaseRegister(x1) c.locationStack.releaseRegister(x2) // Finally, record that the result is on the conditional register. loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState) loc.valueType = runtimeValueTypeI32 return nil } // compileGt implements compiler.compileGt for the amd64 architecture. func (c *amd64Compiler) compileGt(o *wazeroir.UnionOperation) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } // Emit the compare instruction. 
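	// The condition code recorded for the comparison depends on both signedness and type: signed integers
	// use the G/GE/L/LE family, unsigned integers use the "above/below" family (A/AE/B/BE), and floats also
	// use A/AE because (U)COMISS/(U)COMISD report an unordered (NaN) comparison as CF=1 and ZF=1, so
	// SETHI/SETCC evaluate to false whenever a NaN is involved, matching the Wasm comparison rules.
	// Illustrative example (not compiler code) of why the signed/unsigned split matters:
	//
	//	var a, b uint32 = 0xFFFF_FFFF, 1
	//	_ = a > b               // true:  unsigned comparison -> "above" (SETHI/JA).
	//	_ = int32(a) > int32(b) // false: signed comparison (a is -1) -> "greater" (SETGT/JG).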
var resultConditionState asm.ConditionalRegisterState signedType := wazeroir.SignedType(o.B1) switch signedType { case wazeroir.SignedTypeInt32: resultConditionState = amd64.ConditionalRegisterStateG c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register) case wazeroir.SignedTypeUint32: c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateA case wazeroir.SignedTypeInt64: c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateG case wazeroir.SignedTypeUint64: c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateA case wazeroir.SignedTypeFloat32: c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register) resultConditionState = amd64.ConditionalRegisterStateA case wazeroir.SignedTypeFloat64: c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register) resultConditionState = amd64.ConditionalRegisterStateA } // x1 and x2 are temporary registers only used for the cmp operation. Release them. c.locationStack.releaseRegister(x1) c.locationStack.releaseRegister(x2) // Finally, record that the result is on the conditional register. loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState) loc.valueType = runtimeValueTypeI32 return nil } // compileLe implements compiler.compileLe for the amd64 architecture. func (c *amd64Compiler) compileLe(o *wazeroir.UnionOperation) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } // Emit the compare instruction. var inst asm.Instruction var resultConditionState asm.ConditionalRegisterState signedType := wazeroir.SignedType(o.B1) switch signedType { case wazeroir.SignedTypeInt32: resultConditionState = amd64.ConditionalRegisterStateLE inst = amd64.CMPL case wazeroir.SignedTypeUint32: resultConditionState = amd64.ConditionalRegisterStateBE inst = amd64.CMPL case wazeroir.SignedTypeInt64: resultConditionState = amd64.ConditionalRegisterStateLE inst = amd64.CMPQ case wazeroir.SignedTypeUint64: resultConditionState = amd64.ConditionalRegisterStateBE inst = amd64.CMPQ case wazeroir.SignedTypeFloat32: resultConditionState = amd64.ConditionalRegisterStateAE inst = amd64.UCOMISS case wazeroir.SignedTypeFloat64: resultConditionState = amd64.ConditionalRegisterStateAE inst = amd64.UCOMISD } c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register) // x1 and x2 are temporary registers only used for the cmp operation. Release them. c.locationStack.releaseRegister(x1) c.locationStack.releaseRegister(x2) // Finally, record that the result is on the conditional register. loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState) loc.valueType = runtimeValueTypeI32 return nil } // compileGe implements compiler.compileGe for the amd64 architecture. func (c *amd64Compiler) compileGe(o *wazeroir.UnionOperation) error { x2 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x2); err != nil { return err } x1 := c.locationStack.pop() if err := c.compileEnsureOnRegister(x1); err != nil { return err } // Emit the compare instruction. 
var resultConditionState asm.ConditionalRegisterState signedType := wazeroir.SignedType(o.B1) switch signedType { case wazeroir.SignedTypeInt32: c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateGE case wazeroir.SignedTypeUint32: c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateAE case wazeroir.SignedTypeInt64: c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateGE case wazeroir.SignedTypeUint64: c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register) resultConditionState = amd64.ConditionalRegisterStateAE case wazeroir.SignedTypeFloat32: c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register) resultConditionState = amd64.ConditionalRegisterStateAE case wazeroir.SignedTypeFloat64: c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register) resultConditionState = amd64.ConditionalRegisterStateAE } // x1 and x2 are temporary registers only used for the cmp operation. Release them. c.locationStack.releaseRegister(x1) c.locationStack.releaseRegister(x2) // Finally, record that the result is on the conditional register. loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState) loc.valueType = runtimeValueTypeI32 return nil } // compileLoad implements compiler.compileLoad for the amd64 architecture. func (c *amd64Compiler) compileLoad(o *wazeroir.UnionOperation) error { var ( isIntType bool movInst asm.Instruction targetSizeInBytes int64 vt runtimeValueType ) unsignedType := wazeroir.UnsignedType(o.B1) offset := uint32(o.U2) switch unsignedType { case wazeroir.UnsignedTypeI32: isIntType = true movInst = amd64.MOVL targetSizeInBytes = 32 / 8 vt = runtimeValueTypeI32 case wazeroir.UnsignedTypeI64: isIntType = true movInst = amd64.MOVQ targetSizeInBytes = 64 / 8 vt = runtimeValueTypeI64 case wazeroir.UnsignedTypeF32: isIntType = false movInst = amd64.MOVL targetSizeInBytes = 32 / 8 vt = runtimeValueTypeF32 case wazeroir.UnsignedTypeF64: isIntType = false movInst = amd64.MOVQ targetSizeInBytes = 64 / 8 vt = runtimeValueTypeF64 } reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) if err != nil { return err } if isIntType { // For integer types, read the corresponding bytes from the offset to the memory // and store the value to the int register. c.assembler.CompileMemoryWithIndexToRegister(movInst, // we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil]. amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, reg) c.pushRuntimeValueLocationOnRegister(reg, vt) } else { // For float types, we read the value to the float register. floatReg, err := c.allocateRegister(registerTypeVector) if err != nil { return err } c.assembler.CompileMemoryWithIndexToRegister(movInst, // we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil]. amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, floatReg) c.pushRuntimeValueLocationOnRegister(floatReg, vt) // We no longer need the int register so mark it unused. c.locationStack.markRegisterUnused(reg) } return nil } // compileLoad8 implements compiler.compileLoad8 for the amd64 architecture. 
func (c *amd64Compiler) compileLoad8(o *wazeroir.UnionOperation) error { const targetSizeInBytes = 1 offset := uint32(o.U2) reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) if err != nil { return err } // Then move a byte at the offset to the register. // Note that Load8 is only for integer types. var inst asm.Instruction var vt runtimeValueType signedInt := wazeroir.SignedInt(o.B1) switch signedInt { case wazeroir.SignedInt32: inst = amd64.MOVBLSX vt = runtimeValueTypeI32 case wazeroir.SignedUint32: inst = amd64.MOVBLZX vt = runtimeValueTypeI32 case wazeroir.SignedInt64: inst = amd64.MOVBQSX vt = runtimeValueTypeI64 case wazeroir.SignedUint64: inst = amd64.MOVBQZX vt = runtimeValueTypeI64 } c.assembler.CompileMemoryWithIndexToRegister(inst, // we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil]. amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, reg) c.pushRuntimeValueLocationOnRegister(reg, vt) return nil } // compileLoad16 implements compiler.compileLoad16 for the amd64 architecture. func (c *amd64Compiler) compileLoad16(o *wazeroir.UnionOperation) error { const targetSizeInBytes = 16 / 8 offset := uint32(o.U2) reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) if err != nil { return err } // Then move 2 bytes at the offset to the register. // Note that Load16 is only for integer types. var inst asm.Instruction var vt runtimeValueType signedInt := wazeroir.SignedInt(o.B1) switch signedInt { case wazeroir.SignedInt32: inst = amd64.MOVWLSX vt = runtimeValueTypeI32 case wazeroir.SignedInt64: inst = amd64.MOVWQSX vt = runtimeValueTypeI64 case wazeroir.SignedUint32: inst = amd64.MOVWLZX vt = runtimeValueTypeI32 case wazeroir.SignedUint64: inst = amd64.MOVWQZX vt = runtimeValueTypeI64 } c.assembler.CompileMemoryWithIndexToRegister(inst, // we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil]. amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, reg) c.pushRuntimeValueLocationOnRegister(reg, vt) return nil } // compileLoad32 implements compiler.compileLoad32 for the amd64 architecture. func (c *amd64Compiler) compileLoad32(o *wazeroir.UnionOperation) error { const targetSizeInBytes = 32 / 8 offset := uint32(o.U2) reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes) if err != nil { return err } // Then move 4 bytes at the offset to the register. var inst asm.Instruction signed := o.B1 == 1 if signed { inst = amd64.MOVLQSX } else { inst = amd64.MOVLQZX } c.assembler.CompileMemoryWithIndexToRegister(inst, // We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil]. amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, reg) c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64) return nil } // compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes" // into a register, and returns the stored register. We call the result "ceil" because we access the memory // as memory.Buffer[ceil-targetSizeInBytes: ceil]. // // Note: this also emits the instructions to check the out-of-bounds memory access. // In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status. 
func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
	base := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(base); err != nil {
		return asm.NilRegister, err
	}

	result := base.register
	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
		c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
	} else if offsetConst <= math.MaxUint32 {
		// Note: in practice, this branch rarely happens: it means the wasm binary knows that the
		// memory is larger than 2 GiB, or at least tries to access a region above 2 GiB.
		//
		// In this case, we cannot directly add the offset to a register with the ADDQ(const) instruction.
		// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we would end up
		// treating offsetConst as a negative number, which is wrong.
		tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
		if err != nil {
			return asm.NilRegister, err
		}
		c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
		c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
	} else {
		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
		return result, nil
	}

	// Now we compare the value with the memory length which is held by callEngine.
	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, result)

	// Trap if the value exceeds the memory length.
	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)

	c.locationStack.markRegisterUnused(result)
	return result, nil
}

// compileStore implements compiler.compileStore for the amd64 architecture.
func (c *amd64Compiler) compileStore(o *wazeroir.UnionOperation) error {
	var movInst asm.Instruction
	var targetSizeInByte int64
	unsignedType := wazeroir.UnsignedType(o.B1)
	offset := uint32(o.U2)
	switch unsignedType {
	case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
		movInst = amd64.MOVL
		targetSizeInByte = 32 / 8
	case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
		movInst = amd64.MOVQ
		targetSizeInByte = 64 / 8
	}
	return c.compileStoreImpl(offset, movInst, targetSizeInByte)
}

// compileStore8 implements compiler.compileStore8 for the amd64 architecture.
func (c *amd64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
	return c.compileStoreImpl(uint32(o.U2), amd64.MOVB, 1)
}

// compileStore16 implements compiler.compileStore16 for the amd64 architecture.
func (c *amd64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
	return c.compileStoreImpl(uint32(o.U2), amd64.MOVW, 16/8)
}

// compileStore32 implements compiler.compileStore32 for the amd64 architecture.
func (c *amd64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
	return c.compileStoreImpl(uint32(o.U2), amd64.MOVL, 32/8)
}

func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
	val := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(val); err != nil {
		return err
	}

	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
	if err != nil {
		return err
	}

	c.assembler.CompileRegisterToMemoryWithIndex(
		inst, val.register,
		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
	)

	// We no longer need both the value and base registers.
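	// At runtime, the code emitted by compileStoreImpl together with compileMemoryAccessCeilSetup behaves
	// like the following Go. This is an illustrative sketch only (assumption: the helper name and the
	// explicit little-endian byte writes are not part of the compiler):
	//
	//	func storeU32(mem []byte, base, offsetArg uint32, v uint32) {
	//		ceil := uint64(base) + uint64(offsetArg) + 4 // base + constant offset + targetSizeInBytes.
	//		if ceil > uint64(len(mem)) {
	//			panic("nativeCallStatusCodeMemoryOutOfBounds")
	//		}
	//		// The store writes into mem[ceil-4 : ceil].
	//		mem[ceil-4], mem[ceil-3], mem[ceil-2], mem[ceil-1] = byte(v), byte(v>>8), byte(v>>16), byte(v>>24)
	//	}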
	c.locationStack.releaseRegister(val)
	c.locationStack.markRegisterUnused(reg)
	return nil
}

// compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
func (c *amd64Compiler) compileMemoryGrow() error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
		return err
	}

	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
	c.compileReservedStackBasePointerInitialization()
	c.compileReservedMemoryPointerInitialization()
	return nil
}

// compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
func (c *amd64Compiler) compileMemorySize() error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)

	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, loc.register)

	// WebAssembly's memory.size returns the memory size in pages, where one page is 65536 bytes.
	// That is equivalent to dividing the length of the memory slice by 65536, which
	// can be computed as SHR by 16 bits since 65536 = 2^16.
	c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
	return nil
}

// compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
func (c *amd64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
	dataIndex := uint32(o.U1)
	return c.compileInitImpl(false, dataIndex, 0)
}

// compileInitImpl implements compileTableInit and compileMemoryInit.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
	if isTable {
		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
	}

	copySize := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(copySize); err != nil {
		return err
	}

	sourceOffset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
		return err
	}

	destinationOffset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
		return err
	}

	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(instanceAddr)
	if isTable {
		c.compileLoadElemInstanceAddress(index, instanceAddr)
	} else {
		c.compileLoadDataInstanceAddress(index, instanceAddr)
	}

	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(tmp)

	// sourceOffset += size.
	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
	// destinationOffset += size.
	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)

	// Check the source instance bounds and, if the range exceeds its length, exit with an out of bounds error.
	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
		instanceAddr, 8, // The lengths of DataInstance and ElementInstance are stored at offset 8.
		sourceOffset.register)
	c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)

	// Check the destination bounds and, if the range exceeds the length, exit with an out of bounds error.
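	// Both bounds checks happen before any byte is copied, so a failing memory.init / table.init never
	// performs a partial write. Illustrative sketch of the memory case (assumption: the helper is not
	// compiler code):
	//
	//	func wasmMemoryInit(mem, data []byte, dst, src, n uint32) {
	//		if uint64(src)+uint64(n) > uint64(len(data)) ||
	//			uint64(dst)+uint64(n) > uint64(len(mem)) {
	//			panic("nativeCallStatusCodeMemoryOutOfBounds")
	//		}
	//		copy(mem[uint64(dst):uint64(dst)+uint64(n)], data[uint64(src):uint64(src)+uint64(n)])
	//	}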
if isTable { // Load the target table's address. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp) // Compare length. c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register) } else { c.assembler.CompileMemoryToRegister(amd64.CMPQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, destinationOffset.register) } c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus) // Otherwise, ready to copy the value from source to destination. // // If the copy size equal zero, we skip the entire instructions below. c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register) skipJump := c.assembler.CompileJump(amd64.JEQ) var scale int16 var memToReg, regToMem asm.Instruction if isTable { // Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2. c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register) c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register) // destinationOffset += table buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register) // sourceOffset += data buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register) // For tables, we move 8 bytes at once. memToReg = amd64.MOVQ regToMem = memToReg scale = 8 } else { // destinationOffset += memory buffer's absolute address. c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register) // sourceOffset += data buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register) // Move one byte at once. memToReg = amd64.MOVBQZX regToMem = amd64.MOVB scale = 1 } // Negate the counter. c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register) beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP) c.assembler.CompileMemoryWithIndexToRegister(memToReg, sourceOffset.register, 0, copySize.register, scale, tmp) // [destinationOffset + (size.register)] = tmp. c.assembler.CompileRegisterToMemoryWithIndex(regToMem, tmp, destinationOffset.register, 0, copySize.register, scale, ) // size += 1 c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register) c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop) c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register, instanceAddr, tmp) c.assembler.SetJumpTargetOnNext(skipJump) return nil } // compileDataDrop implements compiler.compileDataDrop for the amd64 architecture. func (c *amd64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } tmp, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } dataIndex := uint32(o.U1) c.compileLoadDataInstanceAddress(dataIndex, tmp) // Clears the content of DataInstance[o.DataIndex] (== []byte type). 
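	// A DataInstance is a Go []byte, and on amd64 a slice header occupies three 8-byte words (data pointer,
	// len, cap), hence the three MOVQ stores at offsets 0, 8 and 16 below. Conceptually (illustrative
	// sketch, not compiler code):
	//
	//	type sliceHeader struct {
	//		data     uintptr
	//		len, cap int
	//	}
	//
	//	func dropData(dataInstances []sliceHeader, dataIndex uint32) {
	//		dataInstances[dataIndex] = sliceHeader{} // zero all 24 bytes.
	//	}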
c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0) c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8) c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16) return nil } func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) { // dst = dataIndex * dataInstanceStructSize. c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst) // dst = &moduleInstance.DataInstances[0] + dst // = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize // = &moduleInstance.DataInstances[dataIndex] c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset, dst, ) } // compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions. func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) { // skip if nothing to copy c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register) emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ) // Prepare registers for swaps. There will never be more than 3 XCHGs in total. restoreCrossing := c.compilePreventCrossedTargetRegisters( []*runtimeValueLocation{destinationOffset, sourceOffset, copySize}, []asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX}) // Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times. c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI) c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI) c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX) // Point on first byte of first quadword to copy. if backwards { c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI) c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI) // Set REP prefix direction backwards. c.assembler.CompileStandAlone(amd64.STD) } c.assembler.CompileStandAlone(amd64.REPMOVSQ) if backwards { // Reset direction. c.assembler.CompileStandAlone(amd64.CLD) } // Restore registers. c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI) c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI) c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX) restoreCrossing() c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump) c.assembler.CompileStandAlone(amd64.NOP) } // compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check. func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) { // Point on first byte to be copied depending on direction. if backwards { c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register) c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register) } else { c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register) c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register) } // destinationOffset += memory buffer's absolute address. c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register) // sourceOffset += memory buffer's absolute address. c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register) // Copy copySize % 8 bytes in loop to allow copying in 8 byte groups afterward. 
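	// In other words, the byte-wise loop below only handles the copySize%8 remainder; once copySize is a
	// multiple of 8, it is shifted right by 3 and the rest is copied by compileCopyLoopImpl with REP MOVSQ.
	// Illustrative sketch of the forward direction (assumption: not compiler code):
	//
	//	func copyForward(dst, src []byte) {
	//		n, i := len(src), 0
	//		for (n-i)%8 != 0 { // single-byte MOVB loop until the remaining count is a multiple of 8.
	//			dst[i] = src[i]
	//			i++
	//		}
	//		copy(dst[i:n], src[i:n]) // stands in for REP MOVSQ over (n-i)/8 quadwords.
	//	}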
beginLoop := c.assembler.CompileStandAlone(amd64.NOP) // Check copySize % 8 == 0. c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register) breakLoop := c.assembler.CompileJump(amd64.JEQ) c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp) c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0) if backwards { c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register) c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register) } else { c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register) c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register) } c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register) c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop) c.assembler.SetJumpTargetOnNext(breakLoop) // compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8. c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register) c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7) } // compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture. // // This uses efficient `REP MOVSQ` instructions to copy in quadword (8 bytes) batches. The remaining bytes // are copied with a simple `MOV` loop. It uses backward copying for overlapped segments. func (c *amd64Compiler) compileMemoryCopy() error { copySize := c.locationStack.pop() if err := c.compileEnsureOnRegister(copySize); err != nil { return err } sourceOffset := c.locationStack.pop() if err := c.compileEnsureOnRegister(sourceOffset); err != nil { return err } destinationOffset := c.locationStack.pop() if err := c.compileEnsureOnRegister(destinationOffset); err != nil { return err } tmp, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.locationStack.markRegisterUsed(tmp) // sourceOffset += size. c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register) // destinationOffset += size. c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register) // tmp = max(sourceOffset, destinationOffset). c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, destinationOffset.register) c.assembler.CompileRegisterToRegister(amd64.MOVQ, sourceOffset.register, tmp) c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, destinationOffset.register, tmp) // Check maximum bounds and if exceeds the length, exit with out of bounds error. c.assembler.CompileMemoryToRegister(amd64.CMPQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, tmp) c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds) // Skip zero size. c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register) skipJump := c.assembler.CompileJump(amd64.JEQ) // If dest < source, we can copy forwards c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register) destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS) // If source + size < dest, we can copy forwards c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp) c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp) c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp) sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS) // Copy backwards. 
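	// The direction is chosen so that overlapping ranges are still copied correctly: when the destination
	// starts at or below the source, or when the source range ends at or before the destination, a forward
	// copy cannot overwrite source bytes before they are read; otherwise we fall through to the backward
	// copy here. Roughly (illustrative, in terms of the original offsets):
	//
	//	if dst <= src || src+size <= dst {
	//		copyForward(dst, src, size)
	//	} else {
	//		copyBackwards(dst, src, size)
	//	}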
	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
	endJump := c.assembler.CompileJump(amd64.JMP)

	// Copy forwards.
	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
	c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)

	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
		destinationOffset.register, tmp)
	c.assembler.SetJumpTargetOnNext(skipJump)
	c.assembler.SetJumpTargetOnNext(endJump)
	return nil
}

// compileFillLoopImpl implements a REP STOSQ fill loop.
func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
	// Skip if nothing to fill.
	c.assembler.CompileRegisterToRegister(amd64.TESTQ, fillSize.register, fillSize.register)
	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)

	if replicateByte {
		// Truncate value.register to a single byte.
		c.assembler.CompileConstToRegister(amd64.ANDQ, 0xff, value.register)
		// Replicate single byte onto full 8-byte register.
		c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
		c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
	}

	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
	restoreCrossing := c.compilePreventCrossedTargetRegisters(
		[]*runtimeValueLocation{destinationOffset, value, fillSize},
		[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})

	// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)

	c.assembler.CompileStandAlone(amd64.REPSTOSQ)

	// Restore registers.
	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
	restoreCrossing()

	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
}

// compileFillImpl implements compileMemoryFill and compileTableFill for the amd64 architecture.
//
// This function uses efficient `REP STOSQ` instructions to fill in quadword (8 bytes) batches
// if the size is above 15 bytes. For smaller sizes, a simple MOVB fill loop is the better
// option.
//
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
	copySize := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(copySize); err != nil {
		return err
	}

	value := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(value); err != nil {
		return err
	}

	destinationOffset := c.locationStack.pop()
	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
		return err
	}

	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}
	c.locationStack.markRegisterUsed(tmp)

	// destinationOffset += size.
	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)

	// Check the destination bounds and, if the range exceeds the length, exit with an out of bounds error.
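	// As with memory.copy, the whole destination range is validated up front, so a failing memory.fill /
	// table.fill writes nothing. For the memory case, compileFillLoopImpl then replicates the low byte of
	// the value across a quadword so that REP STOSQ can store 8 bytes at a time. Illustrative sketch
	// (assumption: the helpers below are not compiler code):
	//
	//	func splatByte(v byte) uint64 { return uint64(v) * 0x0101010101010101 } // 0xAB -> 0xABABABABABABABAB.
	//
	//	func wasmMemoryFill(mem []byte, dst, n uint32, v byte) {
	//		if uint64(dst)+uint64(n) > uint64(len(mem)) {
	//			panic("nativeCallStatusCodeMemoryOutOfBounds")
	//		}
	//		for i := uint64(0); i < uint64(n); i++ { // a byte loop plus REP STOSQ in the generated code.
	//			mem[uint64(dst)+i] = v
	//		}
	//	}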
if isTable { // tmp = &tables[0] c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) // tmp = [tmp + TableIndex*8] // = [&tables[0] + TableIndex*sizeOf(*tableInstance)] // = [&tables[TableIndex]] = tables[TableIndex]. c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp) c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register) } else { c.assembler.CompileMemoryToRegister(amd64.CMPQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, destinationOffset.register) } if isTable { c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess) } else { c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds) } // Otherwise, ready to copy the value from source to destination. // // If the copy size equal zero, we skip the entire instructions below. c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register) skipJump := c.assembler.CompileJump(amd64.JEQ) // destinationOffset -= size. c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register) if isTable { // Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2. c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register) // destinationOffset += table buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register) } else { // destinationOffset += memory buffer's absolute address. c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register) // Copy first % 16 bytes with simple MOVB instruction. beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP) c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register) breakLoop := c.assembler.CompileJump(amd64.JEQ) c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0) c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register) c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register) c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop) c.assembler.SetJumpTargetOnNext(breakLoop) // compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8. c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register) } c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable) c.locationStack.markRegisterUnused(copySize.register, value.register, destinationOffset.register, tmp) c.assembler.SetJumpTargetOnNext(skipJump) return nil } // compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture. // // TODO: the compiled code in this function should be reused and compile at once as // the code is independent of any module. func (c *amd64Compiler) compileMemoryFill() error { return c.compileFillImpl(false, 0) } // compileTableInit implements compiler.compileTableInit for the amd64 architecture. func (c *amd64Compiler) compileTableInit(o *wazeroir.UnionOperation) error { elemIndex := uint32(o.U1) tableIndex := uint32(o.U2) return c.compileInitImpl(true, elemIndex, tableIndex) } // compileTableCopyLoopImpl is used for directly copying after bounds/direction check. 
func (c *amd64Compiler) compileTableCopyLoopImpl(srcTableIndex, dstTableIndex uint32, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) { // Point on first byte to be copied. if !backwards { c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register) c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register) } // Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2. c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register) c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register) // destinationOffset += table buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp) c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register) // sourceOffset += table buffer's absolute address. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp) c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register) c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8) } // compileTableCopy implements compiler.compileTableCopy for the amd64 architecture. // // It uses efficient `REP MOVSB` instructions for optimized copying. It uses backward copying for // overlapped segments. func (c *amd64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error { copySize := c.locationStack.pop() if err := c.compileEnsureOnRegister(copySize); err != nil { return err } sourceOffset := c.locationStack.pop() if err := c.compileEnsureOnRegister(sourceOffset); err != nil { return err } destinationOffset := c.locationStack.pop() if err := c.compileEnsureOnRegister(destinationOffset); err != nil { return err } tmp, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // sourceOffset += size. c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register) // destinationOffset += size. c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register) srcTableIndex := uint32(o.U1) dstTableIndex := uint32(o.U2) // Check source bounds and if exceeds the length, exit with out of bounds error. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp) c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register) c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess) // Check destination bounds and if exceeds the length, exit with out of bounds error. 
c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp) c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register) c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess) // Skip zero size. c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register) skipJump := c.assembler.CompileJump(amd64.JEQ) // If dest < source, we can copy forwards. c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register) destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS) // If source + size < dest, we can copy forwards. c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp) c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp) c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp) sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS) // Copy backwards. c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, true) endJump := c.assembler.CompileJump(amd64.JMP) // Copy forwards. c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump) c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump) c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, false) c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register, destinationOffset.register, tmp) c.assembler.SetJumpTargetOnNext(skipJump) c.assembler.SetJumpTargetOnNext(endJump) return nil } // compileElemDrop implements compiler.compileElemDrop for the amd64 architecture. func (c *amd64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } tmp, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } elemIndex := uint32(o.U1) c.compileLoadElemInstanceAddress(elemIndex, tmp) // Clears the content of ElementInstances[o.ElemIndex].References (== []uintptr{} type). c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0) c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8) c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16) return nil } func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) { // dst = elemIndex * elementInstanceStructSize c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst) // dst = &moduleInstance.ElementInstances[0] + dst // = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize // = &moduleInstance.ElementInstances[elemIndex] c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset, dst, ) } // compileTableGet implements compiler.compileTableGet for the amd64 architecture. 
func (c *amd64Compiler) compileTableGet(o *wazeroir.UnionOperation) error { ref, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.locationStack.markRegisterUsed(ref) offset := c.locationStack.pop() if err := c.compileEnsureOnRegister(offset); err != nil { return err } // ref = &tables[0] c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, ref) // ref = [ref + TableIndex*8] // = [&tables[0] + TableIndex*sizeOf(*tableInstance)] // = [&tables[TableIndex]] = tables[TableIndex]. tableIndex := int64(o.U1) c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableIndex*8, ref) // Out of bounds check. c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register) c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess) // ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0] c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref) // ref = [ref + 0 + offset.register * 8] // = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] // = [&tables[TableIndex].References[offset]] // = tables[TableIndex].References[offset] c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref, 0, offset.register, 8, ref, ) c.locationStack.markRegisterUnused(offset.register) c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime. return nil } // compileTableSet implements compiler.compileTableSet for the amd64 architecture. func (c *amd64Compiler) compileTableSet(o *wazeroir.UnionOperation) error { ref := c.locationStack.pop() if err := c.compileEnsureOnRegister(ref); err != nil { return err } offset := c.locationStack.pop() if err := c.compileEnsureOnRegister(offset); err != nil { return err } tmp, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } // tmp = &tables[0] c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp) // ref = [ref + TableIndex*8] // = [&tables[0] + TableIndex*sizeOf(*tableInstance)] // = [&tables[TableIndex]] = tables[TableIndex]. tableIndex := int64(o.U1) c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableIndex*8, tmp) // Out of bounds check. c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register) c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess) // tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0] c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp) // [tmp + 0 + offset.register * 8] = ref // [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref // [&tables[TableIndex].References[offset]] = ref // tables[TableIndex].References[offset] = ref c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ, ref.register, tmp, 0, offset.register, 8) c.locationStack.markRegisterUnused(offset.register, ref.register) return nil } // compileTableGrow implements compiler.compileTableGrow for the amd64 architecture. func (c *amd64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } // Pushes the table index. 
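	// table.grow involves allocating (and possibly relocating) the table's backing storage in Go, so it
	// cannot be emitted as plain assembly; the operands plus the table index pushed below travel through
	// the value stack to the builtinFunctionIndexTableGrow helper. Roughly (illustrative, top of the stack
	// on the right; the exact operand order is defined by wazeroir):
	//
	//	..., <table.grow operands>, tableIndex  ->  ..., previousTableLength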
	tableIndex := uint32(o.U1)
	if err := c.compileConstI32Impl(tableIndex); err != nil {
		return err
	}

	// Just like memory.grow, table.grow cannot be implemented in generated assembly because it involves
	// allocation in Go. Therefore, we call out to the builtin function for this purpose.
	if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
		return err
	}

	// TableGrow consumes three values (table index, number of items, initial value).
	for i := 0; i < 3; i++ {
		c.locationStack.pop()
	}

	// Then, the previous length is pushed as the result.
	loc := c.locationStack.pushRuntimeValueLocationOnStack()
	loc.valueType = runtimeValueTypeI32

	// After return, we re-initialize reserved registers just like in the preamble of functions.
	c.compileReservedStackBasePointerInitialization()
	c.compileReservedMemoryPointerInitialization()
	return nil
}

// compileTableSize implements compiler.compileTableSize for the amd64 architecture.
func (c *amd64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	result, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	// result = &tables[0]
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine,
		callEngineModuleContextTablesElement0AddressOffset, result)

	// result = [result + TableIndex*8]
	//        = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
	//        = [&tables[TableIndex]] = tables[TableIndex].
	tableIndex := int64(o.U1)
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableIndex*8, result)

	// result = [result + tableInstanceTableLenOffset]
	//        = [tables[TableIndex] + tableInstanceTableLenOffset]
	//        = len(tables[TableIndex])
	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)

	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
	return nil
}

// compileTableFill implements compiler.compileTableFill for the amd64 architecture.
func (c *amd64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
	tableIndex := uint32(o.U1)
	return c.compileFillImpl(true, tableIndex)
}

// compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
func (c *amd64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
		return err
	}

	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
	if err != nil {
		return err
	}

	functionIndex := int64(o.U1)
	c.assembler.CompileConstToRegister(amd64.MOVQ, functionIndex*functionSize, ref)

	// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset + int64(o.FunctionIndex)*functionSize]
	//     = &moduleEngine.functions[index]
	c.assembler.CompileMemoryToRegister(
		amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
		ref,
	)
	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
	return nil
}

// compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
func (c *amd64Compiler) compileConstI32(o *wazeroir.UnionOperation) error { return c.compileConstI32Impl(uint32(o.U1)) } func (c *amd64Compiler) compileConstI32Impl(v uint32) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } reg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32) c.assembler.CompileConstToRegister(amd64.MOVL, int64(v), reg) return nil } // compileConstI64 implements compiler.compileConstI64 for the amd64 architecture. func (c *amd64Compiler) compileConstI64(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } reg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64) c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1), reg) return nil } // compileConstF32 implements compiler.compileConstF32 for the amd64 architecture. func (c *amd64Compiler) compileConstF32(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } reg, err := c.allocateRegister(registerTypeVector) if err != nil { return err } c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32) // We cannot directly load the value from memory to float regs, // so we move it to int reg temporarily. tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.U1) /*math.Float32bits(o.Value)*/, tmpReg) c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg) return nil } // compileConstF64 implements compiler.compileConstF64 for the amd64 architecture. func (c *amd64Compiler) compileConstF64(o *wazeroir.UnionOperation) error { if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil { return err } reg, err := c.allocateRegister(registerTypeVector) if err != nil { return err } c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64) // We cannot directly load the value from memory to float regs, // so we move it to int reg temporarily. tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1) /* math.Float64bits(o.Value) */, tmpReg) c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg) return nil } // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64. func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) { var inst asm.Instruction switch loc.valueType { case runtimeValueTypeV128Lo: inst = amd64.MOVDQU case runtimeValueTypeV128Hi: panic("BUG: V128Hi must be be loaded to a register along with V128Lo") case runtimeValueTypeI32, runtimeValueTypeF32: inst = amd64.MOVL case runtimeValueTypeI64, runtimeValueTypeF64: inst = amd64.MOVQ default: panic("BUG: unknown runtime value type") } // Copy the value from the stack. c.assembler.CompileMemoryToRegister(inst, // Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range. amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8, loc.register) if loc.valueType == runtimeValueTypeV128Lo { // Higher 64-bits are loaded as well ^^. 
hi := &c.locationStack.stack[loc.stackPointer+1] hi.setRegister(loc.register) } } // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack // if the value is located on a conditional register. // // This is usually called at the beginning of methods on compiler interface where we possibly // compile instructions without saving the conditional register value. // The compileXXX functions without calling this function is saving the conditional // value to the stack or register by invoking compileEnsureOnRegister for the top. func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) { if c.locationStack.sp > 0 { if loc := c.locationStack.peek(); loc.onConditionalRegister() { if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil { return err } } } return } // loadConditionalRegisterToGeneralPurposeRegister saves the conditional register value // to a general purpose register. func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error { reg, err := c.allocateRegister(registerTypeGeneralPurpose) if err != nil { return err } c.compileMoveConditionalToGeneralPurposeRegister(loc, reg) return nil } func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) { // Set the flag bit to the destination. See // - https://c9x.me/x86/html/file_module_x86_id_288.html // - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468 // to translate conditionalRegisterState* to amd64.SET* var inst asm.Instruction switch loc.conditionalRegister { case amd64.ConditionalRegisterStateE: inst = amd64.SETEQ case amd64.ConditionalRegisterStateNE: inst = amd64.SETNE case amd64.ConditionalRegisterStateS: inst = amd64.SETMI case amd64.ConditionalRegisterStateNS: inst = amd64.SETPL case amd64.ConditionalRegisterStateG: inst = amd64.SETGT case amd64.ConditionalRegisterStateGE: inst = amd64.SETGE case amd64.ConditionalRegisterStateL: inst = amd64.SETLT case amd64.ConditionalRegisterStateLE: inst = amd64.SETLE case amd64.ConditionalRegisterStateA: inst = amd64.SETHI case amd64.ConditionalRegisterStateAE: inst = amd64.SETCC case amd64.ConditionalRegisterStateB: inst = amd64.SETCS case amd64.ConditionalRegisterStateBE: inst = amd64.SETLS } c.assembler.CompileNoneToRegister(inst, reg) // Then we reset the unnecessary bit. c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg) // Mark it uses the register. loc.setRegister(reg) c.locationStack.markRegisterUsed(reg) } // allocateRegister implements compiler.allocateRegister for amd64. func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) { var ok bool // Try to get the unused register. reg, ok = c.locationStack.takeFreeRegister(t) if ok { return } // If not found, we have to steal the register. stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t) if !ok { err = fmt.Errorf("cannot steal register") return } // Release the steal target register value onto stack location. reg = stealTarget.register c.compileReleaseRegisterToStack(stealTarget) return } // callFunction adds instructions to call a function whose address equals either addr parameter or the value on indexReg. // // Note: this is the counterpart for returnFunction, and see the comments there as well // to understand how the function calls are achieved. 
func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error { // Release all the registers as our calling convention requires the caller-save. if err := c.compileReleaseAllRegistersToStack(); err != nil { return err } c.locationStack.markRegisterUsed(functionAddressRegister) // Obtain a temporary register to be used in the followings. tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose) if !found { // This in theory never happen as all the registers must be free except codeAddressRegister. return fmt.Errorf("could not find enough free registers") } // The stack should look like: // // reserved slots for results (if len(results) > len(args)) // | | // ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, .... // | | | // | callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^} // | // nextStackBasePointerOffset // // where callFrame is used to return to this currently executed function. nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64) callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype) // Save the current stack base pointer at callFrameStackBasePointerInBytesLoc. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister) callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister) c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc) // Set callEngine.stackContext.stackBasePointer for the next function. c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister) // Write the calculated value to callEngine.stackContext.stackBasePointer. c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset) // Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc. c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset, tmpRegister) callFrameFunctionLoc.setRegister(tmpRegister) c.compileReleaseRegisterToStack(callFrameFunctionLoc) // Set callEngine.moduleContext.fn to the next *function. c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset) // Write the return address into callFrameReturnAddressLoc. c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP) callFrameReturnAddressLoc.setRegister(tmpRegister) c.compileReleaseRegisterToStack(callFrameReturnAddressLoc) if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister { // This case we must move the value on targetFunctionAddressRegister to another register, otherwise // the address (jump target below) will be modified and result in segfault. // See #526. c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister) functionAddressRegister = tmpRegister } // Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister. 
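	// At this point the callFrame pushed above is fully populated. Conceptually it occupies three 8-byte
	// stack slots (illustrative sketch; the struct and field types are assumptions, not a declaration in
	// this package):
	//
	//	type callFrame struct {
	//		returnAddress                 uintptr // native address the callee jumps back to.
	//		returnStackBasePointerInBytes uint64  // caller's stack base pointer, restored on return.
	//		function                      uintptr // caller's *function, restored into callEngine.moduleContext.fn.
	//	}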
	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
		functionAddressRegister, functionModuleInstanceOffset,
		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)

	// And jump into the initial address of the target function.
	c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)

	// All the registers used are temporary, so we mark them as unused.
	c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)

	// On the function return, we have to re-initialize the state.
	if err := c.compileModuleContextInitialization(); err != nil {
		return err
	}

	// Due to the change to callEngine.stackContext.stackBasePointer.
	c.compileReservedStackBasePointerInitialization()

	// Due to the change to callEngine.moduleContext.moduleInstance, as that might have changed
	// the memory instance.
	c.compileReservedMemoryPointerInitialization()

	// We consumed the function parameters, the call frame, and the reserved slots during the call.
	c.locationStack.sp = uint64(nextStackBasePointerOffset)

	// Now the function results are pushed by the call.
	for _, t := range functype.Results {
		loc := c.locationStack.pushRuntimeValueLocationOnStack()
		switch t {
		case wasm.ValueTypeI32:
			loc.valueType = runtimeValueTypeI32
		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
			loc.valueType = runtimeValueTypeI64
		case wasm.ValueTypeF32:
			loc.valueType = runtimeValueTypeF32
		case wasm.ValueTypeF64:
			loc.valueType = runtimeValueTypeF64
		case wasm.ValueTypeV128:
			loc.valueType = runtimeValueTypeV128Lo
			hi := c.locationStack.pushRuntimeValueLocationOnStack()
			hi.valueType = runtimeValueTypeV128Hi
		default:
			panic("BUG: invalid type: " + wasm.ValueTypeName(t))
		}
	}
	return nil
}

// compileReturnFunction adds instructions to return from the current call frame back to the caller's frame.
// If the current frame is the origin (the first frame of this execution), we return to
// callEngine.execWasmFunction with the Returned status.
// Otherwise, we jump to the caller's return address stored in callFrame.returnAddress while making
// all the necessary changes to the callEngine's state.
//
// Note: this is the counterpart of compileCallFunctionImpl; see the comments there as well
// to understand how function calls are achieved.
func (c *amd64Compiler) compileReturnFunction() error {
	// Release all the registers to the stack, as our calling convention is caller-save.
	if err := c.compileReleaseAllRegistersToStack(); err != nil {
		return err
	}

	if c.withListener {
		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
			return err
		}
		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
		c.compileReservedStackBasePointerInitialization()
	}

	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address,
	// so mark it as used so that it won't be handed out as a free register.
	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)

	// Obtain a temporary register to be used in the following.
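	// Since compileReleaseAllRegistersToStack has already run above, every unreserved general purpose
	// register should be free here, so failing to take one indicates a compiler bug (hence the panic below).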
	returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
	if !found {
		panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
	}

	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)

	// A zero return address means this is the origin frame, so we return from the entire execution.
	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
		amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
		returnAddressRegister,
	)
	c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeReturned)

	// Alias for readability.
	tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister

	// First, restore callEngine.stackContext.stackBasePointerInBytes from callerStackBasePointerInBytes.
	callerStackBasePointerInBytes.setRegister(tmpRegister)
	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)

	// Next, restore moduleContext.fn from callerFunction.
	callerFunction.setRegister(tmpRegister)
	c.compileLoadValueOnStackToRegister(callerFunction)
	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)

	// Also, we have to put the caller's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
		tmpRegister, functionModuleInstanceOffset,
		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)

	// Then, jump into the return address!
	c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
	return nil
}

func (c *amd64Compiler) compileCallGoHostFunction() error {
	return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
}

func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
	// Write the builtin function index to callEngine.exitContext.builtinFunctionCallIndex.
	c.assembler.CompileConstToMemory(amd64.MOVL, int64(index),
		amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
	return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
}

func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
	// Release all the registers to the stack, as our calling convention is caller-save.
	if err := c.compileReleaseAllRegistersToStack(); err != nil {
		return err
	}

	c.compileExitFromNativeCode(compilerStatus)
	return nil
}

// compileReleaseAllRegistersToStack adds the instructions to release all the live values
// in the value location stack at this point into their stack memory locations.
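//
// As a rough illustration (assuming an i64 value held in AX at stack pointer 3), the release emits
// something like "MOVQ AX, 24(amd64ReservedRegisterForStackBasePointerAddress)", i.e. the value is
// written to stackBase + stackPointer*8. Values held on a conditional register are first materialized
// into a general purpose register via SETcc and then stored the same way.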
func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
	for i := uint64(0); i < c.locationStack.sp; i++ {
		if loc := &c.locationStack.stack[i]; loc.onRegister() {
			c.compileReleaseRegisterToStack(loc)
		} else if loc.onConditionalRegister() {
			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
				return
			}
			c.compileReleaseRegisterToStack(loc)
		}
	}
	return
}

func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
	for i := uint64(0); i < c.locationStack.sp; i++ {
		prevValue := &c.locationStack.stack[i]
		if prevValue.register == reg {
			c.compileReleaseRegisterToStack(prevValue)
			break
		}
	}
}

// compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
	var inst asm.Instruction
	switch loc.valueType {
	case runtimeValueTypeV128Lo:
		inst = amd64.MOVDQU
	case runtimeValueTypeV128Hi:
		panic("BUG: V128Hi must be released to the stack along with V128Lo")
	case runtimeValueTypeI32, runtimeValueTypeF32:
		inst = amd64.MOVL
	case runtimeValueTypeI64, runtimeValueTypeF64:
		inst = amd64.MOVQ
	default:
		panic("BUG: unknown runtime value type")
	}

	c.assembler.CompileRegisterToMemory(inst, loc.register,
		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)

	// Mark the register as free.
	c.locationStack.releaseRegister(loc)

	if loc.valueType == runtimeValueTypeV128Lo {
		// The higher 64 bits are released as well.
		hi := &c.locationStack.stack[loc.stackPointer+1]
		c.locationStack.releaseRegister(hi)
	}
}

func (c *amd64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
	if target := c.compiledTrapTargets[status]; target != nil {
		// We've already compiled the exit sequence for this status, so invert the skip condition
		// and jump straight into that target.
		var returnCondition asm.Instruction
		switch skipCondition {
		case amd64.JHI:
			returnCondition = amd64.JLS
		case amd64.JLS:
			returnCondition = amd64.JHI
		case amd64.JNE:
			returnCondition = amd64.JEQ
		case amd64.JEQ:
			returnCondition = amd64.JNE
		case amd64.JCC:
			returnCondition = amd64.JCS
		case amd64.JCS:
			returnCondition = amd64.JCC
		case amd64.JPC:
			returnCondition = amd64.JPS
		case amd64.JPS:
			returnCondition = amd64.JPC
		case amd64.JPL:
			returnCondition = amd64.JMI
		case amd64.JMI:
			returnCondition = amd64.JPL
		default:
			panic("BUG: couldn't invert condition")
		}
		c.assembler.CompileJump(returnCondition).AssignJumpTarget(target)
	} else {
		skip := c.assembler.CompileJump(skipCondition)
		c.compileExitFromNativeCode(status)
		c.assembler.SetJumpTargetOnNext(skip)
	}
}

func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
	if target := c.compiledTrapTargets[status]; target != nil {
		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(target)
		return
	}

	switch status {
	case nativeCallStatusCodeReturned:
		// Save the target for reuse.
		c.compiledTrapTargets[status] = c.compileNOP()
	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
		// Read the return address, and write it to callEngine.exitContext.returnAddress.
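		// (The recorded address is, roughly speaking, that of the instruction following the RET emitted
		// at the end of this exit sequence: CompileReadInstructionAddress below resolves it relative to
		// that RET, so the Go side can later jump back into native code right after this exit point.)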
		returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
		if !ok {
			panic("BUG: cannot take free register")
		}
		c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
	default:
		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
			// In this case, the execution traps, and we want the top frame's source position in the stack trace.
			// Use RegR15 to store the current instruction address into callEngine.exitContext.returnAddress.
			returnAddressReg := amd64.RegR15
			c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.MOVQ)
			c.assembler.CompileRegisterToMemory(amd64.MOVQ,
				returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
		} else {
			// We won't use the source position, so just save the target for reuse.
			c.compiledTrapTargets[status] = c.compileNOP()
		}
	}

	// Write the status to callEngine.exitContext.statusCode.
	c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
		amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)

	// Write back the cached SP to callEngine.stackContext.stackPointer.
	c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
		amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)

	c.assembler.CompileStandAlone(amd64.RET)
}

func (c *amd64Compiler) compilePreamble() (err error) {
	// We assume all function parameters are already pushed onto the stack by
	// the caller.
	c.locationStack.init(c.typ)

	if err := c.compileModuleContextInitialization(); err != nil {
		return err
	}

	// Check if it's necessary to grow the value stack, using the maximum stack pointer for this function.
	if err = c.compileMaybeGrowStack(); err != nil {
		return err
	}

	if c.withListener {
		if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
			return err
		}
	}

	c.compileReservedStackBasePointerInitialization()

	// Finally, we initialize the reserved memory register based on the module context.
	c.compileReservedMemoryPointerInitialization()
	return
}

func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
	// First, make amd64ReservedRegisterForStackBasePointerAddress point to the beginning of the
	// stack slice's backing array.
	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
		amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
		amd64ReservedRegisterForStackBasePointerAddress)

	// Next, add the stack base pointer (callEngine.stackContext.stackBasePointerInBytes) to it so that
	// the register points at the stack base of the current frame.
	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
		amd64ReservedRegisterForStackBasePointerAddress,
	)
}

func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
	if c.ir.HasMemory || c.ir.UsesMemory {
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
			amd64ReservedRegisterForMemory,
		)
	}
}

// compileMaybeGrowStack adds instructions to check whether the value stack needs to grow, and if so,
// to make the builtin function call that grows it. These instructions are emitted in the function's
// preamble.
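//
// Sketch of the emitted check (the stack pointer ceiling constant is not known yet at this point;
// it is patched in later through assignStackPointerCeilNeeded):
//
//	tmp = callEngine.stackContext.stackLenInBytes - callEngine.stackContext.stackBasePointerInBytes
//	if tmp < stackPointerCeil (in bytes) { call builtinFunctionIndexGrowStack }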
func (c *amd64Compiler) compileMaybeGrowStack() error {
	tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
	if !ok {
		panic("BUG: cannot take free register")
	}

	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
		amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
	c.assembler.CompileMemoryToRegister(amd64.SUBQ,
		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)

	// If stack base pointer + max stack pointer > stackLen, we need to grow the stack.
	// The comparison constant (the stack pointer ceiling) is assigned later via assignStackPointerCeilNeeded.
	cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
	c.assignStackPointerCeilNeeded = cmpWithStackPointerCeil

	// Jump if there is no need to grow the stack.
	jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)

	// Otherwise, we have to make the builtin function call to grow the stack.
	if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
		return err
	}

	c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
	return nil
}

// compileModuleContextInitialization adds instructions to initialize callEngine.moduleContext's fields
// based on the next executing function's *wasm.ModuleInstance, which is held in
// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
// This is called in two cases: in the function preamble, and on return from (non-Go) function calls.
func (c *amd64Compiler) compileModuleContextInitialization() error {
	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address,
	// so mark it as used so that it won't be handed out as a free register until the module context
	// initialization finishes.
	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)

	// Obtain the temporary registers to be used in the following.
	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
	if !found {
		// This in theory never happens, as all the registers must be free except
		// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
		return fmt.Errorf("could not find enough free registers")
	}
	c.locationStack.markRegisterUsed(tmpRegister)

	tmpRegister2, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
	if !found {
		// This in theory never happens, as all the registers must be free except
		// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
		return fmt.Errorf("could not find enough free registers")
	}
	c.locationStack.markRegisterUsed(tmpRegister2)

	// If the module instance address stays the same, we can skip the entire code below.
	// The rationale is that, in almost all use cases, users instantiate a single Wasm binary and run its
	// functions, rather than importing/exporting across multiple binaries. As a result, the cmp-and-jmp
	// sequence below should be easy for the x64 CPU to branch-predict, since the jump is taken on nearly
	// 100% of function calls.
	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset,
		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
	jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)

	// If callEngine.moduleContext.moduleInstance is not equal to the value in
	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
	// we have to put the new value there.
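	// The store below records the new module instance so that subsequent calls within the same module take
	// the fast path above (the JEQ); the cached per-module fields are then refreshed from it.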
	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)

	// Also, we have to update the following fields:
	// * callEngine.moduleContext.globalElement0Address
	// * callEngine.moduleContext.tablesElement0Address
	// * callEngine.moduleContext.memoryInstance
	// * callEngine.moduleContext.memoryElement0Address
	// * callEngine.moduleContext.memorySliceLen
	// * callEngine.moduleContext.functionsElement0Address
	// * callEngine.moduleContext.typeIDsElement0Address
	// * callEngine.moduleContext.dataInstancesElement0Address
	// * callEngine.moduleContext.elementInstancesElement0Address

	// Update globalElement0Address.
	//
	// Note: if there's a global.get or global.set instruction in the function, the existence of the globals
	// is ensured by function validation at the module instantiation phase, and that's why it is ok to
	// skip the initialization if the module's globals slice is empty.
	if len(c.ir.Globals) > 0 {
		// Since ModuleInstance.Globals is a slice, the address of its first item (i.e. the slice's data
		// pointer) lies exactly at the globals offset.
		// See https://go.dev/blog/slices-intro if unfamiliar.
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset,
			tmpRegister)
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
	}

	// Update tablesElement0Address.
	//
	// Note: if there's a table instruction in the function, the existence of the table
	// is ensured by function validation at the module instantiation phase, and that's
	// why it is ok to skip the initialization if the module has no tables.
	if c.ir.HasTable {
		// First, read the data pointer of ModuleInstance.Tables (i.e. &Tables[0]) into tmpRegister.
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset,
			tmpRegister)
		// At this point, tmpRegister holds the address of the first table, so we store it into
		// callEngine.moduleContext.tablesElement0Address.
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)

		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
	}

	// Update memoryElement0Address and memorySliceLen.
	//
	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
	// That is ensured by function validation at the module instantiation phase, and that's
	// why it is ok to skip the initialization if the module's memory instance is nil.
	if c.ir.HasMemory {
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
			tmpRegister)

		// Set memory instance.
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)

		// Set length.
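		// (A Go slice header is laid out as {data, len, cap}; memoryInstanceBufferLenOffset and
		// memoryInstanceBufferOffset are taken to point at the len and data words of MemoryInstance.Buffer
		// respectively, so the two loads below suffice to cache both the byte length and the
		// element-zero address.)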
		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferLenOffset, tmpRegister2)
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister2, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset)

		// Set element zero address.
		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
			tmpRegister2, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
	}

	// Update moduleContext.functionsElement0Address.
	{
		// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
		//
		// Go's interface is laid out on memory as two quad words as struct {tab, data uintptr}
		// where tab points to the interface table, and data points to the actual
		// implementation of the interface. In this case, we extract the data pointer as the *moduleEngine.
		// See the following references for detail:
		// * https://research.swtch.com/interfaces
		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset,
			tmpRegister)

		// "tmpRegister = [tmpRegister + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)

		// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
			callEngineModuleContextFunctionsElement0AddressOffset)
	}

	// Update dataInstancesElement0Address.
	if c.ir.HasDataInstances {
		// "tmpRegister = &moduleInstance.DataInstances[0]"
		c.assembler.CompileMemoryToRegister(
			amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
			tmpRegister,
		)
		// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
		c.assembler.CompileRegisterToMemory(
			amd64.MOVQ,
			tmpRegister,
			amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
		)
	}

	// Update callEngine.moduleContext.elementInstancesElement0Address.
	if c.ir.HasElementInstances {
		// "tmpRegister = &moduleInstance.ElementInstances[0]"
		c.assembler.CompileMemoryToRegister(
			amd64.MOVQ,
			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
			tmpRegister,
		)
		// "callEngine.moduleContext.elementInstancesElement0Address = tmpRegister".
		c.assembler.CompileRegisterToMemory(
			amd64.MOVQ,
			tmpRegister,
			amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
		)
	}

	c.locationStack.markRegisterUnused(tmpRegister, tmpRegister2)

	// Set the jump target to the next instruction for the case where the module instance address hasn't changed.
	c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
	return nil
}

// compileEnsureOnRegister ensures that the given value is located in a register of an appropriate type.
func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
	if loc.onStack() {
		// Allocate the register.
		reg, err := c.allocateRegister(loc.getRegisterType())
		if err != nil {
			return err
		}

		// Mark that the value now lives in the register.
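		// (markRegisterUsed below prevents allocateRegister from handing the same register to another
		// value before compileLoadValueOnStackToRegister has read this value's stack slot into it.)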
		loc.setRegister(reg)
		c.locationStack.markRegisterUsed(reg)

		c.compileLoadValueOnStackToRegister(loc)
	} else if loc.onConditionalRegister() {
		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
	}
	return
}

// compileMaybeSwapRegisters swaps two registers if they're not equal.
func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
	if reg1 != reg2 {
		c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
	}
}

// compilePreventCrossedTargetRegisters swaps registers so that, for each runtimeValueLocation in locs,
// its target register (the register at the same index in targets) is not occupied by a different
// runtimeValueLocation from locs. It returns a closure that restores the original register placement.
//
// This makes it possible to safely exchange one set of registers with another, even when a register
// appears in both sets: afterwards, moving each locs[i] into targets[i] cannot clobber the register
// of any other location in locs.
//
// For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
// to make locs = [BX, CX, AX].
func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
	type swap struct{ srcIndex, dstIndex int }
	var swaps []swap
	for i := range locs {
		targetLocation := -1 // -1 means not found.
		for j := range locs {
			if locs[j].register == targets[i] {
				targetLocation = j
				break
			}
		}
		if targetLocation != -1 && targetLocation != i {
			c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
			locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
			swaps = append(swaps, swap{i, targetLocation})
		}
	}
	return func() {
		// Restore in reverse order because a register can be moved multiple times.
		for i := len(swaps) - 1; i >= 0; i-- {
			r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
			c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
			locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
		}
	}
}
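// As an illustrative trace of the example in the doc comment above (locs' registers = [AX, BX, CX],
// targets = [BX, SI, AX]):
//
//	i=0: target BX is held by locs[1] -> swap locs[0]/locs[1]: [BX, AX, CX]
//	i=1: target SI is held by no loc  -> nothing to do
//	i=2: target AX is held by locs[1] -> swap locs[2]/locs[1]: [BX, CX, AX]
//
// The returned closure replays the swaps in reverse order, restoring [AX, BX, CX].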