package amd64 import ( "encoding/binary" "errors" "fmt" "math" "github.com/tetratelabs/wazero/internal/asm" ) // nodeImpl implements asm.Node for amd64. type nodeImpl struct { // jumpTarget holds the target node in the linked for the jump-kind instruction. jumpTarget *nodeImpl // prev and next hold the prev/next node from this node in the assembled linked list. prev, next *nodeImpl // forwardJumpOrigins hold all the nodes trying to jump into this node as a // singly linked list. In other words, all the nodes with .jumpTarget == this. forwardJumpOrigins *nodeImpl staticConst *asm.StaticConst dstConst asm.ConstantValue offsetInBinary asm.NodeOffsetInBinary srcConst asm.ConstantValue instruction asm.Instruction // readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of // read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress. readInstructionAddressBeforeTargetInstruction asm.Instruction flag nodeFlag types operandTypes srcReg, dstReg asm.Register srcMemIndex, dstMemIndex asm.Register srcMemScale, dstMemScale byte arg byte // staticConstReferrersAdded true if this node is already added into AssemblerImpl.staticConstReferrers. // Only used when staticConst is not nil. Through re-assembly, we might end up adding multiple times which causes unnecessary // allocations, so we use this flag to do it once. staticConstReferrersAdded bool } type nodeFlag byte const ( // nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge // whether a jump is backward or forward before encoding. nodeFlagInitializedForEncoding nodeFlag = 1 << iota nodeFlagBackwardJump // nodeFlagShortForwardJump is set to false by default and only used by forward branch jumps, which means .jumpTarget != nil and // the target node is encoded after this node. False by default means that we Encode all the jumps with jumpTarget // as short jump (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible. nodeFlagShortForwardJump ) func (n *nodeImpl) isInitializedForEncoding() bool { return n.flag&nodeFlagInitializedForEncoding != 0 } func (n *nodeImpl) isJumpNode() bool { return n.jumpTarget != nil } func (n *nodeImpl) isBackwardJump() bool { return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0) } func (n *nodeImpl) isForwardJump() bool { return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0) } func (n *nodeImpl) isForwardShortJump() bool { return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0 } // AssignJumpTarget implements asm.Node.AssignJumpTarget. func (n *nodeImpl) AssignJumpTarget(target asm.Node) { n.jumpTarget = target.(*nodeImpl) } // AssignDestinationConstant implements asm.Node.AssignDestinationConstant. func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) { n.dstConst = value } // AssignSourceConstant implements asm.Node.AssignSourceConstant. func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) { n.srcConst = value } // OffsetInBinary implements asm.Node.OffsetInBinary. func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary { return n.offsetInBinary } // String implements fmt.Stringer. // // This is for debugging purpose, and the format is almost same as the AT&T assembly syntax, // meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand // might be embraced by '[]' to represent the memory location. func (n *nodeImpl) String() (ret string) { instName := InstructionName(n.instruction) switch n.types { case operandTypesNoneToNone: ret = instName case operandTypesNoneToRegister: ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg)) case operandTypesNoneToMemory: if n.dstMemIndex != asm.NilRegister { ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) } else { ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst) } case operandTypesNoneToBranch: ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget) case operandTypesRegisterToNone: ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg)) case operandTypesRegisterToRegister: ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg)) case operandTypesRegisterToMemory: if n.dstMemIndex != asm.NilRegister { ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) } else { ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst) } case operandTypesRegisterToConst: ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst) case operandTypesMemoryToRegister: if n.srcMemIndex != asm.NilRegister { ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg)) } else { ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg)) } case operandTypesMemoryToConst: if n.srcMemIndex != asm.NilRegister { ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst) } else { ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst) } case operandTypesConstToMemory: if n.dstMemIndex != asm.NilRegister { ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) } else { ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst) } case operandTypesConstToRegister: ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg)) case operandTypesStaticConstToRegister: ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg)) case operandTypesRegisterToStaticConst: ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw) } return } type operandTypes byte const ( operandTypesNoneToNone operandTypes = iota operandTypesNoneToRegister operandTypesNoneToMemory operandTypesNoneToBranch operandTypesRegisterToNone operandTypesRegisterToRegister operandTypesRegisterToMemory operandTypesRegisterToConst operandTypesMemoryToRegister operandTypesMemoryToConst operandTypesConstToRegister operandTypesConstToMemory operandTypesStaticConstToRegister operandTypesRegisterToStaticConst ) // String implements fmt.Stringer func (o operandTypes) String() (ret string) { switch o { case operandTypesNoneToNone: ret = "NoneToNone" case operandTypesNoneToRegister: ret = "NoneToRegister" case operandTypesNoneToMemory: ret = "NoneToMemory" case operandTypesNoneToBranch: ret = "NoneToBranch" case operandTypesRegisterToNone: ret = "RegisterToNone" case operandTypesRegisterToRegister: ret = "RegisterToRegister" case operandTypesRegisterToMemory: ret = "RegisterToMemory" case operandTypesRegisterToConst: ret = "RegisterToConst" case operandTypesMemoryToRegister: ret = "MemoryToRegister" case operandTypesMemoryToConst: ret = "MemoryToConst" case operandTypesConstToRegister: ret = "ConstToRegister" case operandTypesConstToMemory: ret = "ConstToMemory" case operandTypesStaticConstToRegister: ret = "StaticConstToRegister" case operandTypesRegisterToStaticConst: ret = "RegisterToStaticConst" } return } type ( // AssemblerImpl implements Assembler. AssemblerImpl struct { root *nodeImpl current *nodeImpl asm.BaseAssemblerImpl readInstructionAddressNodes []*nodeImpl // staticConstReferrers maintains the list of static const referrers which requires the // offset resolution after finalizing the binary layout. staticConstReferrers []staticConstReferrer nodePool nodePool pool asm.StaticConstPool // MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool // but have it as an exported field here for testability. MaxDisplacementForConstantPool int forceReAssemble bool } // staticConstReferrer represents a referrer of a asm.StaticConst. staticConstReferrer struct { n *nodeImpl // instLen is the encoded length of the instruction for `n`. instLen int } ) func NewAssembler() *AssemblerImpl { return &AssemblerImpl{ nodePool: nodePool{index: nodePageSize}, pool: asm.NewStaticConstPool(), MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool, } } const nodePageSize = 128 type nodePage = [nodePageSize]nodeImpl // nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl. // This reduces the allocations over compilation by reusing AssemblerImpl. type nodePool struct { pages []*nodePage index int } // allocNode allocates a new nodeImpl for use from the pool. // This expands the pool if there is no space left for it. func (n *nodePool) allocNode() *nodeImpl { if n.index == nodePageSize { if len(n.pages) == cap(n.pages) { n.pages = append(n.pages, new(nodePage)) } else { i := len(n.pages) n.pages = n.pages[:i+1] if n.pages[i] == nil { n.pages[i] = new(nodePage) } } n.index = 0 } ret := &n.pages[len(n.pages)-1][n.index] n.index++ return ret } func (n *nodePool) reset() { for _, ns := range n.pages { pages := ns[:] for i := range pages { pages[i] = nodeImpl{} } } n.pages = n.pages[:0] n.index = nodePageSize } // AllocateNOP implements asm.AssemblerBase. func (a *AssemblerImpl) AllocateNOP() asm.Node { n := a.nodePool.allocNode() n.instruction = NOP n.types = operandTypesNoneToNone return n } // Add implements asm.AssemblerBase. func (a *AssemblerImpl) Add(n asm.Node) { a.addNode(n.(*nodeImpl)) } // Reset implements asm.AssemblerBase. func (a *AssemblerImpl) Reset() { pool := a.pool pool.Reset() *a = AssemblerImpl{ nodePool: a.nodePool, pool: pool, readInstructionAddressNodes: a.readInstructionAddressNodes[:0], staticConstReferrers: a.staticConstReferrers[:0], BaseAssemblerImpl: asm.BaseAssemblerImpl{ SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0], JumpTableEntries: a.JumpTableEntries[:0], }, } a.nodePool.reset() } // newNode creates a new Node and appends it into the linked list. func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl { n := a.nodePool.allocNode() n.instruction = instruction n.types = types a.addNode(n) return n } // addNode appends the new node into the linked list. func (a *AssemblerImpl) addNode(node *nodeImpl) { if a.root == nil { a.root = node a.current = node } else { parent := a.current parent.next = node node.prev = parent a.current = node } for _, o := range a.SetBranchTargetOnNextNodes { origin := o.(*nodeImpl) origin.jumpTarget = node } // Reuse the underlying slice to avoid re-allocations. a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0] } // encodeNode encodes the given node into writer. func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) { switch n.types { case operandTypesNoneToNone: err = a.encodeNoneToNone(buf, n) case operandTypesNoneToRegister: err = a.encodeNoneToRegister(buf, n) case operandTypesNoneToMemory: err = a.encodeNoneToMemory(buf, n) case operandTypesNoneToBranch: // Branching operand can be encoded as relative jumps. err = a.encodeRelativeJump(buf, n) case operandTypesRegisterToNone: err = a.encodeRegisterToNone(buf, n) case operandTypesRegisterToRegister: err = a.encodeRegisterToRegister(buf, n) case operandTypesRegisterToMemory: err = a.encodeRegisterToMemory(buf, n) case operandTypesRegisterToConst: err = a.encodeRegisterToConst(buf, n) case operandTypesMemoryToRegister: err = a.encodeMemoryToRegister(buf, n) case operandTypesMemoryToConst: err = a.encodeMemoryToConst(buf, n) case operandTypesConstToRegister: err = a.encodeConstToRegister(buf, n) case operandTypesConstToMemory: err = a.encodeConstToMemory(buf, n) case operandTypesStaticConstToRegister: err = a.encodeStaticConstToRegister(buf, n) case operandTypesRegisterToStaticConst: err = a.encodeRegisterToStaticConst(buf, n) default: err = fmt.Errorf("encoder undefined for [%s] operand type", n.types) } if err != nil { err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node. } return } // Assemble implements asm.AssemblerBase func (a *AssemblerImpl) Assemble(buf asm.Buffer) error { a.initializeNodesForEncoding() // Continue encoding until we are not forced to re-assemble which happens when // a short relative jump ends up the offset larger than 8-bit length. for { err := a.encode(buf) if err != nil { return err } if !a.forceReAssemble { break } else { // We reset the length of buffer but don't delete the underlying slice since // the binary size will roughly the same after reassemble. buf.Reset() // Reset the re-assemble flag in order to avoid the infinite loop! a.forceReAssemble = false } } code := buf.Bytes() for _, n := range a.readInstructionAddressNodes { if err := a.finalizeReadInstructionAddressNode(code, n); err != nil { return err } } // Now that we've finished the layout, fill out static consts offsets. for i := range a.staticConstReferrers { ref := &a.staticConstReferrers[i] n, instLen := ref.n, ref.instLen // Calculate the displacement between the RIP (the offset _after_ n) and the static constant. displacement := int(n.staticConst.OffsetInBinary) - int(n.OffsetInBinary()) - instLen // The offset must be stored at the 4 bytes from the tail of this n. See AssemblerImpl.encodeStaticConstImpl for detail. displacementOffsetInInstruction := n.OffsetInBinary() + uint64(instLen-4) binary.LittleEndian.PutUint32(code[displacementOffsetInInstruction:], uint32(int32(displacement))) } return a.FinalizeJumpTableEntry(code) } // initializeNodesForEncoding initializes nodeImpl.flag and determine all the jumps // are forward or backward jump. func (a *AssemblerImpl) initializeNodesForEncoding() { for n := a.root; n != nil; n = n.next { n.flag |= nodeFlagInitializedForEncoding if target := n.jumpTarget; target != nil { if target.isInitializedForEncoding() { // This means the target exists behind. n.flag |= nodeFlagBackwardJump } else { // Otherwise, this is forward jump. // We start with assuming that the jump can be short (8-bit displacement). // If it doens't fit, we change this flag in resolveRelativeForwardJump. n.flag |= nodeFlagShortForwardJump // If the target node is also the branching instruction, we replace the target with the NOP // node so that we can avoid the collision of the target.forwardJumpOrigins both as destination and origins. if target.types == operandTypesNoneToBranch { // Allocate the NOP node from the pool. nop := a.nodePool.allocNode() nop.instruction = NOP nop.types = operandTypesNoneToNone // Insert it between target.prev and target: [target.prev, target] -> [target.prev, nop, target] prev := target.prev nop.prev = prev prev.next = nop nop.next = target target.prev = nop n.jumpTarget = nop target = nop } // We add this node `n` into the end of the linked list (.forwardJumpOrigins) beginning from the `target.forwardJumpOrigins`. // Insert the current `n` as the head of the list. n.forwardJumpOrigins = target.forwardJumpOrigins target.forwardJumpOrigins = n } } } } func (a *AssemblerImpl) encode(buf asm.Buffer) error { for n := a.root; n != nil; n = n.next { // If an instruction needs NOP padding, we do so before encoding it. // // This is necessary to avoid Intel's jump erratum; see in Section 2.1 // in for when we have to pad NOP: // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf // // This logic used to be implemented in a function called maybeNOPPadding, // but the complexity of the logic made it impossible for the compiler to // inline. Since this function is on a hot code path, we inlined the // initial checks to skip the function call when instructions do not need // NOP padding. switch info := nopPaddingInfo[n.instruction]; { case info.jmp: if err := a.encodeJmpNOPPadding(buf, n); err != nil { return err } case info.onNextJmp: if err := a.encodeOnNextJmpNOPPAdding(buf, n); err != nil { return err } } // After the padding, we can finalize the offset of this instruction in the binary. n.offsetInBinary = uint64(buf.Len()) if err := a.encodeNode(buf, n); err != nil { return err } if n.forwardJumpOrigins != nil { if err := a.resolveForwardRelativeJumps(buf, n); err != nil { return fmt.Errorf("invalid relative forward jumps: %w", err) } } a.maybeFlushConstants(buf, n.next == nil) } return nil } var nopPaddingInfo = [instructionEnd]struct { jmp, onNextJmp bool }{ RET: {jmp: true}, JMP: {jmp: true}, JCC: {jmp: true}, JCS: {jmp: true}, JEQ: {jmp: true}, JGE: {jmp: true}, JGT: {jmp: true}, JHI: {jmp: true}, JLE: {jmp: true}, JLS: {jmp: true}, JLT: {jmp: true}, JMI: {jmp: true}, JNE: {jmp: true}, JPC: {jmp: true}, JPS: {jmp: true}, // The possible fused jump instructions if the next node is a conditional jump instruction. CMPL: {onNextJmp: true}, CMPQ: {onNextJmp: true}, TESTL: {onNextJmp: true}, TESTQ: {onNextJmp: true}, ADDL: {onNextJmp: true}, ADDQ: {onNextJmp: true}, SUBL: {onNextJmp: true}, SUBQ: {onNextJmp: true}, ANDL: {onNextJmp: true}, ANDQ: {onNextJmp: true}, INCQ: {onNextJmp: true}, DECQ: {onNextJmp: true}, } func (a *AssemblerImpl) encodeJmpNOPPadding(buf asm.Buffer, n *nodeImpl) error { // In order to know the instruction length before writing into the binary, // we try encoding it. prevLen := buf.Len() // Assign the temporary offset which may or may not be correct depending on the padding decision. n.offsetInBinary = uint64(prevLen) // Encode the node and get the instruction length. if err := a.encodeNode(buf, n); err != nil { return err } instructionLen := int32(buf.Len() - prevLen) // Revert the written bytes. buf.Truncate(prevLen) return a.encodeNOPPadding(buf, instructionLen) } func (a *AssemblerImpl) encodeOnNextJmpNOPPAdding(buf asm.Buffer, n *nodeImpl) error { instructionLen, err := a.fusedInstructionLength(buf, n) if err != nil { return err } return a.encodeNOPPadding(buf, instructionLen) } // encodeNOPPadding maybe appends NOP instructions before the node `n`. // This is necessary to avoid Intel's jump erratum: // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf func (a *AssemblerImpl) encodeNOPPadding(buf asm.Buffer, instructionLen int32) error { const boundaryInBytes int32 = 32 const mask = boundaryInBytes - 1 var padNum int currentPos := int32(buf.Len()) if used := currentPos & mask; used+instructionLen >= boundaryInBytes { padNum = int(boundaryInBytes - used) } a.padNOP(buf, padNum) return nil } // fusedInstructionLength returns the length of "macro fused instruction" if the // instruction sequence starting from `n` can be fused by processor. Otherwise, // returns zero. func (a *AssemblerImpl) fusedInstructionLength(buf asm.Buffer, n *nodeImpl) (ret int32, err error) { // Find the next non-NOP instruction. next := n.next for ; next != nil && next.instruction == NOP; next = next.next { } if next == nil { return } inst, jmpInst := n.instruction, next.instruction if !nopPaddingInfo[jmpInst].jmp { // If the next instruction is not jump kind, the instruction will not be fused. return } // How to determine whether the instruction can be fused is described in // Section 3.4.2.2 of "Intel Optimization Manual": // https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf isTest := inst == TESTL || inst == TESTQ isCmp := inst == CMPQ || inst == CMPL isTestCmp := isTest || isCmp if isTestCmp && (n.types == operandTypesMemoryToConst || n.types == operandTypesConstToMemory) { // The manual says: "CMP and TEST can not be fused when comparing MEM-IMM". return } // Implement the decision according to the table 3-1 in the manual. isAnd := inst == ANDL || inst == ANDQ if !isTest && !isAnd { if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC { // These jumps are only fused for TEST or AND. return } isAdd := inst == ADDL || inst == ADDQ isSub := inst == SUBL || inst == SUBQ if !isCmp && !isAdd && !isSub { if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS { // Thses jumpst are only fused for TEST, AND, CMP, ADD, or SUB. return } } } // Now the instruction is ensured to be fused by the processor. // In order to know the fused instruction length before writing into the binary, // we try encoding it. savedLen := uint64(buf.Len()) // Encode the nodes into the buffer. if err = a.encodeNode(buf, n); err != nil { return } if err = a.encodeNode(buf, next); err != nil { return } ret = int32(uint64(buf.Len()) - savedLen) // Revert the written bytes. buf.Truncate(int(savedLen)) return } // nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP" // in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf var nopOpcodes = [][11]byte{ {0x90}, {0x66, 0x90}, {0x0f, 0x1f, 0x00}, {0x0f, 0x1f, 0x40, 0x00}, {0x0f, 0x1f, 0x44, 0x00, 0x00}, {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, {0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, {0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, } func (a *AssemblerImpl) padNOP(buf asm.Buffer, num int) { for num > 0 { singleNopNum := num if singleNopNum > len(nopOpcodes) { singleNopNum = len(nopOpcodes) } buf.AppendBytes(nopOpcodes[singleNopNum-1][:singleNopNum]) num -= singleNopNum } } // CompileStandAlone implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node { return a.newNode(instruction, operandTypesNoneToNone) } // CompileConstToRegister implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileConstToRegister( instruction asm.Instruction, value asm.ConstantValue, destinationReg asm.Register, ) (inst asm.Node) { n := a.newNode(instruction, operandTypesConstToRegister) n.srcConst = value n.dstReg = destinationReg return n } // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { n := a.newNode(instruction, operandTypesRegisterToRegister) n.srcReg = from n.dstReg = to } // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileMemoryToRegister( instruction asm.Instruction, sourceBaseReg asm.Register, sourceOffsetConst asm.ConstantValue, destinationReg asm.Register, ) { n := a.newNode(instruction, operandTypesMemoryToRegister) n.srcReg = sourceBaseReg n.srcConst = sourceOffsetConst n.dstReg = destinationReg } // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileRegisterToMemory( instruction asm.Instruction, sourceRegister, destinationBaseRegister asm.Register, destinationOffsetConst asm.ConstantValue, ) { n := a.newNode(instruction, operandTypesRegisterToMemory) n.srcReg = sourceRegister n.dstReg = destinationBaseRegister n.dstConst = destinationOffsetConst } // CompileJump implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node { return a.newNode(jmpInstruction, operandTypesNoneToBranch) } // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileJumpToMemory( jmpInstruction asm.Instruction, baseReg asm.Register, offset asm.ConstantValue, ) { n := a.newNode(jmpInstruction, operandTypesNoneToMemory) n.dstReg = baseReg n.dstConst = offset } // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) { n := a.newNode(jmpInstruction, operandTypesNoneToRegister) n.dstReg = reg } // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase. func (a *AssemblerImpl) CompileReadInstructionAddress( destinationRegister asm.Register, beforeAcquisitionTargetInstruction asm.Instruction, ) { n := a.newNode(LEAQ, operandTypesMemoryToRegister) n.dstReg = destinationRegister n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction } // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileRegisterToRegisterWithArg( instruction asm.Instruction, from, to asm.Register, arg byte, ) { n := a.newNode(instruction, operandTypesRegisterToRegister) n.srcReg = from n.dstReg = to n.arg = arg } // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileMemoryWithIndexToRegister( instruction asm.Instruction, srcBaseReg asm.Register, srcOffsetConst asm.ConstantValue, srcIndex asm.Register, srcScale int16, dstReg asm.Register, ) { n := a.newNode(instruction, operandTypesMemoryToRegister) n.srcReg = srcBaseReg n.srcConst = srcOffsetConst n.srcMemIndex = srcIndex n.srcMemScale = byte(srcScale) n.dstReg = dstReg } // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister( instruction asm.Instruction, srcBaseReg asm.Register, srcOffsetConst asm.ConstantValue, srcIndex asm.Register, srcScale int16, dstReg asm.Register, arg byte, ) { n := a.newNode(instruction, operandTypesMemoryToRegister) n.srcReg = srcBaseReg n.srcConst = srcOffsetConst n.srcMemIndex = srcIndex n.srcMemScale = byte(srcScale) n.dstReg = dstReg n.arg = arg } // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex( instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffsetConst asm.ConstantValue, dstIndex asm.Register, dstScale int16, ) { n := a.newNode(instruction, operandTypesRegisterToMemory) n.srcReg = srcReg n.dstReg = dstBaseReg n.dstConst = dstOffsetConst n.dstMemIndex = dstIndex n.dstMemScale = byte(dstScale) } // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg( instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffsetConst asm.ConstantValue, dstIndex asm.Register, dstScale int16, arg byte, ) { n := a.newNode(instruction, operandTypesRegisterToMemory) n.srcReg = srcReg n.dstReg = dstBaseReg n.dstConst = dstOffsetConst n.dstMemIndex = dstIndex n.dstMemScale = byte(dstScale) n.arg = arg } // CompileRegisterToConst implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileRegisterToConst( instruction asm.Instruction, srcRegister asm.Register, value asm.ConstantValue, ) asm.Node { n := a.newNode(instruction, operandTypesRegisterToConst) n.srcReg = srcRegister n.dstConst = value return n } // CompileRegisterToNone implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) { n := a.newNode(instruction, operandTypesRegisterToNone) n.srcReg = register } // CompileNoneToRegister implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) { n := a.newNode(instruction, operandTypesNoneToRegister) n.dstReg = register } // CompileNoneToMemory implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileNoneToMemory( instruction asm.Instruction, baseReg asm.Register, offset asm.ConstantValue, ) { n := a.newNode(instruction, operandTypesNoneToMemory) n.dstReg = baseReg n.dstConst = offset } // CompileConstToMemory implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileConstToMemory( instruction asm.Instruction, value asm.ConstantValue, dstbaseReg asm.Register, dstOffset asm.ConstantValue, ) asm.Node { n := a.newNode(instruction, operandTypesConstToMemory) n.srcConst = value n.dstReg = dstbaseReg n.dstConst = dstOffset return n } // CompileMemoryToConst implements the same method as documented on amd64.Assembler. func (a *AssemblerImpl) CompileMemoryToConst( instruction asm.Instruction, srcBaseReg asm.Register, srcOffset, value asm.ConstantValue, ) asm.Node { n := a.newNode(instruction, operandTypesMemoryToConst) n.srcReg = srcBaseReg n.srcConst = srcOffset n.dstConst = value return n } func errorEncodingUnsupported(n *nodeImpl) error { return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types) } func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) (err error) { // Throughout the encoding methods, we use this pair of base offset and // code buffer to write instructions. // // The code buffer is allocated at the end of the current buffer to a size // large enough to hold all the bytes that may be written by the method. // // We use Go's append builtin to write to the buffer because it allows the // compiler to generate much better code than if we made calls to write // methods to mutate an encapsulated byte slice. // // At the end of the method, we truncate the buffer size back to the base // plus the length of the code buffer so the end of the buffer points right // after the last byte that was written. base := buf.Len() code := buf.Append(4)[:0] switch n.instruction { case CDQ: // https://www.felixcloutier.com/x86/cwd:cdq:cqo code = append(code, 0x99) case CQO: // https://www.felixcloutier.com/x86/cwd:cdq:cqo code = append(code, rexPrefixW, 0x99) case NOP: // Simply optimize out the NOP instructions. case RET: // https://www.felixcloutier.com/x86/ret code = append(code, 0xc3) case UD2: // https://mudongliang.github.io/x86/html/file_module_x86_id_318.html code = append(code, 0x0f, 0x0b) case REPMOVSQ: code = append(code, 0xf3, rexPrefixW, 0xa5) case REPSTOSQ: code = append(code, 0xf3, rexPrefixW, 0xab) case STD: code = append(code, 0xfd) case CLD: code = append(code, 0xfc) default: err = errorEncodingUnsupported(n) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeNoneToRegister(buf asm.Buffer, n *nodeImpl) (err error) { regBits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM := 0b11_000_000 | // Specifying that opeand is register. regBits if n.instruction == JMP { // JMP's opcode is defined as "FF /4" meaning that we have to have "4" // in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp modRM |= 0b00_100_000 } else if n.instruction == NEGQ { prefix |= rexPrefixW modRM |= 0b00_011_000 } else if n.instruction == INCQ { prefix |= rexPrefixW } else if n.instruction == DECQ { prefix |= rexPrefixW modRM |= 0b00_001_000 } else { if RegSP <= n.dstReg && n.dstReg <= RegDI { // If the destination is one byte length register, we need to have the default prefix. // https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers prefix |= rexPrefixDefault } } base := buf.Len() code := buf.Append(4)[:0] if prefix != rexPrefixNone { // https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding code = append(code, prefix) } switch n.instruction { case JMP: // https://www.felixcloutier.com/x86/jmp code = append(code, 0xff, modRM) case SETCC: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x93, modRM) case SETCS: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x92, modRM) case SETEQ: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x94, modRM) case SETGE: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9d, modRM) case SETGT: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9f, modRM) case SETHI: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x97, modRM) case SETLE: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9e, modRM) case SETLS: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x96, modRM) case SETLT: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9c, modRM) case SETNE: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x95, modRM) case SETPC: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9b, modRM) case SETPS: // https://www.felixcloutier.com/x86/setcc code = append(code, 0x0f, 0x9a, modRM) case NEGQ: // https://www.felixcloutier.com/x86/neg code = append(code, 0xf7, modRM) case INCQ: // https://www.felixcloutier.com/x86/inc code = append(code, 0xff, modRM) case DECQ: // https://www.felixcloutier.com/x86/dec code = append(code, 0xff, modRM) default: err = errorEncodingUnsupported(n) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeNoneToMemory(buf asm.Buffer, n *nodeImpl) (err error) { rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true) if err != nil { return err } var opcode byte switch n.instruction { case INCQ: // https://www.felixcloutier.com/x86/inc rexPrefix |= rexPrefixW opcode = 0xff case DECQ: // https://www.felixcloutier.com/x86/dec rexPrefix |= rexPrefixW modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM. opcode = 0xff case JMP: // https://www.felixcloutier.com/x86/jmp modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM. opcode = 0xff default: return errorEncodingUnsupported(n) } base := buf.Len() code := buf.Append(12)[:0] if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode, modRM) if sbiExist { code = append(code, sbi) } if displacementWidth != 0 { code = appendConst(code, n.dstConst, displacementWidth) } buf.Truncate(base + len(code)) return } type relativeJumpOpcode struct{ short, long []byte } func (o relativeJumpOpcode) instructionLen(short bool) int64 { if short { return int64(len(o.short)) + 1 // 1 byte = 8 bit offset } else { return int64(len(o.long)) + 4 // 4 byte = 32 bit offset } } var relativeJumpOpcodes = [...]relativeJumpOpcode{ // https://www.felixcloutier.com/x86/jcc JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}}, JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}}, JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}}, JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}}, JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}}, JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}}, JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}}, JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}}, JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}}, JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}}, JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}}, JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}}, JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}}, JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}}, // https://www.felixcloutier.com/x86/jmp JMP: {short: []byte{0xeb}, long: []byte{0xe9}}, } func (a *AssemblerImpl) resolveForwardRelativeJumps(buf asm.Buffer, target *nodeImpl) (err error) { offsetInBinary := int64(target.OffsetInBinary()) origin := target.forwardJumpOrigins for ; origin != nil; origin = origin.forwardJumpOrigins { shortJump := origin.isForwardShortJump() op := relativeJumpOpcodes[origin.instruction] instructionLen := op.instructionLen(shortJump) // Calculate the offset from the EIP (at the time of executing this jump instruction) // to the target instruction. This value is always >= 0 as here we only handle forward jumps. offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen) if shortJump { if offset > math.MaxInt8 { // This forces reassemble in the outer loop inside AssemblerImpl.Assemble(). a.forceReAssemble = true // From the next reAssemble phases, this forward jump will be encoded long jump and // allocate 32-bit offset bytes by default. This means that this `origin` node // will always enter the "long jump offset encoding" block below origin.flag ^= nodeFlagShortForwardJump } else { buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset) } } else { // long jump offset encoding. if offset > math.MaxInt32 { return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction)) } binary.LittleEndian.PutUint32(buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset)) } } return nil } func (a *AssemblerImpl) encodeRelativeJump(buf asm.Buffer, n *nodeImpl) (err error) { if n.jumpTarget == nil { err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction)) return } op := relativeJumpOpcodes[n.instruction] var isShortJump bool // offsetOfEIP means the offset of EIP register at the time of executing this jump instruction. // Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP. var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps if n.isBackwardJump() { // If this is the backward jump, we can calculate the exact offset now. offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary()) isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8 offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump) } else { // For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps. isShortJump = n.isForwardShortJump() } if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here. return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction)) } base := buf.Len() code := buf.Append(6)[:0] if isShortJump { code = append(code, op.short...) code = append(code, byte(offsetOfEIP)) } else { code = append(code, op.long...) code = appendUint32(code, uint32(offsetOfEIP)) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeRegisterToNone(buf asm.Buffer, n *nodeImpl) (err error) { regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM) // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM := 0b11_000_000 | // Specifying that opeand is register. regBits var opcode byte switch n.instruction { case DIVL: // https://www.felixcloutier.com/x86/div modRM |= 0b00_110_000 opcode = 0xf7 case DIVQ: // https://www.felixcloutier.com/x86/div prefix |= rexPrefixW modRM |= 0b00_110_000 opcode = 0xf7 case IDIVL: // https://www.felixcloutier.com/x86/idiv modRM |= 0b00_111_000 opcode = 0xf7 case IDIVQ: // https://www.felixcloutier.com/x86/idiv prefix |= rexPrefixW modRM |= 0b00_111_000 opcode = 0xf7 case MULL: // https://www.felixcloutier.com/x86/mul modRM |= 0b00_100_000 opcode = 0xf7 case MULQ: // https://www.felixcloutier.com/x86/mul prefix |= rexPrefixW modRM |= 0b00_100_000 opcode = 0xf7 default: err = errorEncodingUnsupported(n) } base := buf.Len() code := buf.Append(3)[:0] if prefix != rexPrefixNone { code = append(code, prefix) } code = append(code, opcode, modRM) buf.Truncate(base + len(code)) return } var registerToRegisterOpcode = [instructionEnd]*struct { opcode []byte rPrefix rexPrefix mandatoryPrefix byte srcOnModRMReg bool isSrc8bit bool needArg bool }{ // https://www.felixcloutier.com/x86/add ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true}, ADDQ: {opcode: []byte{0x1}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/and ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true}, ANDQ: {opcode: []byte{0x21}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/cmp CMPL: {opcode: []byte{0x39}}, CMPQ: {opcode: []byte{0x39}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/cmovcc CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/addsd ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}}, // https://www.felixcloutier.com/x86/addss ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}}, // https://www.felixcloutier.com/x86/addpd ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}}, // https://www.felixcloutier.com/x86/addps ANDPS: {opcode: []byte{0x0f, 0x54}}, // https://www.felixcloutier.com/x86/bsr BSRL: {opcode: []byte{0xf, 0xbd}}, BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/comisd COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}}, // https://www.felixcloutier.com/x86/comiss COMISS: {opcode: []byte{0x0f, 0x2f}}, // https://www.felixcloutier.com/x86/cvtsd2ss CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}}, // https://www.felixcloutier.com/x86/cvtsi2sd CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}}, // https://www.felixcloutier.com/x86/cvtsi2sd CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/cvtsi2ss CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}}, // https://www.felixcloutier.com/x86/cvtsi2ss CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/cvtss2sd CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}}, // https://www.felixcloutier.com/x86/cvttsd2si CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}}, CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/cvttss2si CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}}, CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/divsd DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}}, // https://www.felixcloutier.com/x86/divss DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}}, // https://www.felixcloutier.com/x86/lzcnt LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}}, LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/maxsd MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}}, // https://www.felixcloutier.com/x86/maxss MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}}, // https://www.felixcloutier.com/x86/minsd MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}}, // https://www.felixcloutier.com/x86/minss MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}}, // https://www.felixcloutier.com/x86/movsx:movsxd MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true}, // https://www.felixcloutier.com/x86/movzx MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true}, // https://www.felixcloutier.com/x86/movzx MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true}, // https://www.felixcloutier.com/x86/movsx:movsxd MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: rexPrefixW, isSrc8bit: true}, // https://www.felixcloutier.com/x86/movsx:movsxd MOVLQSX: {opcode: []byte{0x63}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/movsx:movsxd MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/movsx:movsxd MOVWLSX: {opcode: []byte{0x0f, 0xbf}}, // https://www.felixcloutier.com/x86/imul IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/mulss MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}}, // https://www.felixcloutier.com/x86/mulsd MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}}, // https://www.felixcloutier.com/x86/or ORL: {opcode: []byte{0x09}, srcOnModRMReg: true}, ORQ: {opcode: []byte{0x09}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/orpd ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}}, // https://www.felixcloutier.com/x86/orps ORPS: {opcode: []byte{0x0f, 0x56}}, // https://www.felixcloutier.com/x86/popcnt POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}}, POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/roundss ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true}, // https://www.felixcloutier.com/x86/roundsd ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true}, // https://www.felixcloutier.com/x86/sqrtss SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}}, // https://www.felixcloutier.com/x86/sqrtsd SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}}, // https://www.felixcloutier.com/x86/sub SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true}, SUBQ: {opcode: []byte{0x29}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/subss SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}}, // https://www.felixcloutier.com/x86/subsd SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}}, // https://www.felixcloutier.com/x86/test TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true}, TESTQ: {opcode: []byte{0x85}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/tzcnt TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}}, TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/ucomisd UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}}, // https://www.felixcloutier.com/x86/ucomiss UCOMISS: {opcode: []byte{0x0f, 0x2e}}, // https://www.felixcloutier.com/x86/xchg XCHGQ: {opcode: []byte{0x87}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/xor XORL: {opcode: []byte{0x31}, srcOnModRMReg: true}, XORQ: {opcode: []byte{0x31}, rPrefix: rexPrefixW, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/xorpd XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}}, XORPS: {opcode: []byte{0x0f, 0x57}}, // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, needArg: true}, // https://www.felixcloutier.com/x86/pinsrw PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, needArg: true}, // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true}, // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq PINSRQ: {mandatoryPrefix: 0x66, rPrefix: rexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true}, // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}}, // https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}}, // https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}}, PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}}, PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}}, PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}}, // https://www.felixcloutier.com/x86/psubb:psubw:psubd PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}}, PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}}, PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}}, // https://www.felixcloutier.com/x86/psubq PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}}, // https://www.felixcloutier.com/x86/addps ADDPS: {opcode: []byte{0x0f, 0x58}}, // https://www.felixcloutier.com/x86/addpd ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}}, // https://www.felixcloutier.com/x86/subps SUBPS: {opcode: []byte{0x0f, 0x5c}}, // https://www.felixcloutier.com/x86/subpd SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}}, // https://www.felixcloutier.com/x86/pxor PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}}, // https://www.felixcloutier.com/x86/pand PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}}, // https://www.felixcloutier.com/x86/por POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}}, // https://www.felixcloutier.com/x86/pandn PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}}, // https://www.felixcloutier.com/x86/pshufb PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}}, // https://www.felixcloutier.com/x86/pshufd PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, needArg: true}, // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, needArg: true, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/pextrw PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, needArg: true}, // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq PEXTRQ: {rPrefix: rexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true}, // https://www.felixcloutier.com/x86/insertps INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, needArg: true}, // https://www.felixcloutier.com/x86/movlhps MOVLHPS: {opcode: []byte{0x0f, 0x16}}, // https://www.felixcloutier.com/x86/ptest PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}}, // https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}}, PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}}, PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}}, // https://www.felixcloutier.com/x86/pcmpeqq PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}}, // https://www.felixcloutier.com/x86/paddusb:paddusw PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}}, // https://www.felixcloutier.com/x86/movsd MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}}, // https://www.felixcloutier.com/x86/packsswb:packssdw PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}}, // https://www.felixcloutier.com/x86/pmovmskb PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}}, // https://www.felixcloutier.com/x86/movmskps MOVMSKPS: {opcode: []byte{0x0f, 0x50}}, // https://www.felixcloutier.com/x86/movmskpd MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}}, // https://www.felixcloutier.com/x86/psraw:psrad:psraq PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}}, // https://www.felixcloutier.com/x86/psraw:psrad:psraq PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}}, // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}}, // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}}, // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}}, // https://www.felixcloutier.com/x86/psllw:pslld:psllq PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}}, // https://www.felixcloutier.com/x86/psllw:pslld:psllq PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}}, // https://www.felixcloutier.com/x86/psllw:pslld:psllq PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}}, // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}}, // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}}, // https://www.felixcloutier.com/x86/cmpps CMPPS: {opcode: []byte{0x0f, 0xc2}, needArg: true}, // https://www.felixcloutier.com/x86/cmppd CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true}, // https://www.felixcloutier.com/x86/pcmpgtq PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}}, // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}}, // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}}, // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}}, // https://www.felixcloutier.com/x86/pminsd:pminsq PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}}, // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}}, // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}}, // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}}, // https://www.felixcloutier.com/x86/pminsb:pminsw PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}}, // https://www.felixcloutier.com/x86/pminsb:pminsw PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}}, // https://www.felixcloutier.com/x86/pminud:pminuq PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}}, // https://www.felixcloutier.com/x86/pminub:pminuw PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}}, // https://www.felixcloutier.com/x86/pminub:pminuw PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}}, // https://www.felixcloutier.com/x86/pmaxud:pmaxuq PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}}, // https://www.felixcloutier.com/x86/pmaxub:pmaxuw PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}}, // https://www.felixcloutier.com/x86/pmaxub:pmaxuw PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}}, // https://www.felixcloutier.com/x86/pmullw PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}}, // https://www.felixcloutier.com/x86/pmulld:pmullq PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}}, // https://www.felixcloutier.com/x86/pmuludq PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}}, // https://www.felixcloutier.com/x86/psubsb:psubsw PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}}, // https://www.felixcloutier.com/x86/psubsb:psubsw PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}}, // https://www.felixcloutier.com/x86/psubusb:psubusw PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}}, // https://www.felixcloutier.com/x86/psubusb:psubusw PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}}, // https://www.felixcloutier.com/x86/paddsb:paddsw PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}}, // https://www.felixcloutier.com/x86/paddsb:paddsw PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}}, // https://www.felixcloutier.com/x86/paddusb:paddusw PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}}, // https://www.felixcloutier.com/x86/pavgb:pavgw PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}}, // https://www.felixcloutier.com/x86/pavgb:pavgw PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}}, // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}}, // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}}, // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}}, // https://www.felixcloutier.com/x86/blendvpd BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}}, // https://www.felixcloutier.com/x86/maxpd MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}}, // https://www.felixcloutier.com/x86/maxps MAXPS: {opcode: []byte{0x0f, 0x5f}}, // https://www.felixcloutier.com/x86/minpd MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}}, // https://www.felixcloutier.com/x86/minps MINPS: {opcode: []byte{0x0f, 0x5d}}, // https://www.felixcloutier.com/x86/andnpd ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}}, // https://www.felixcloutier.com/x86/andnps ANDNPS: {opcode: []byte{0x0f, 0x55}}, // https://www.felixcloutier.com/x86/mulps MULPS: {opcode: []byte{0x0f, 0x59}}, // https://www.felixcloutier.com/x86/mulpd MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}}, // https://www.felixcloutier.com/x86/divps DIVPS: {opcode: []byte{0x0f, 0x5e}}, // https://www.felixcloutier.com/x86/divpd DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}}, // https://www.felixcloutier.com/x86/sqrtps SQRTPS: {opcode: []byte{0x0f, 0x51}}, // https://www.felixcloutier.com/x86/sqrtpd SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}}, // https://www.felixcloutier.com/x86/roundps ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, needArg: true}, // https://www.felixcloutier.com/x86/roundpd ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, needArg: true}, // https://www.felixcloutier.com/x86/palignr PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, needArg: true}, // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}}, // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}}, // https://www.felixcloutier.com/x86/pmulhuw PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}}, // https://www.felixcloutier.com/x86/pmuldq PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}}, // https://www.felixcloutier.com/x86/pmulhrsw PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}}, // https://www.felixcloutier.com/x86/pmovsx PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}}, // https://www.felixcloutier.com/x86/pmovsx PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}}, // https://www.felixcloutier.com/x86/pmovsx PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}}, // https://www.felixcloutier.com/x86/pmovzx PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}}, // https://www.felixcloutier.com/x86/pmovzx PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}}, // https://www.felixcloutier.com/x86/pmovzx PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}}, // https://www.felixcloutier.com/x86/pmulhw PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}}, // https://www.felixcloutier.com/x86/cmpps CMPEQPS: {opcode: []byte{0x0f, 0xc2}, needArg: true}, // https://www.felixcloutier.com/x86/cmppd CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true}, // https://www.felixcloutier.com/x86/cvttps2dq CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}}, // https://www.felixcloutier.com/x86/cvtdq2ps CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}}, // https://www.felixcloutier.com/x86/cvtdq2pd CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}}, // https://www.felixcloutier.com/x86/cvtpd2ps CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}}, // https://www.felixcloutier.com/x86/cvtps2pd CVTPS2PD: {opcode: []byte{0x0f, 0x5a}}, // https://www.felixcloutier.com/x86/movupd MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}}, // https://www.felixcloutier.com/x86/shufps SHUFPS: {opcode: []byte{0x0f, 0xc6}, needArg: true}, // https://www.felixcloutier.com/x86/pmaddwd PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}}, // https://www.felixcloutier.com/x86/unpcklps UNPCKLPS: {opcode: []byte{0x0f, 0x14}}, // https://www.felixcloutier.com/x86/packuswb PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}}, // https://www.felixcloutier.com/x86/packsswb:packssdw PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}}, // https://www.felixcloutier.com/x86/packusdw PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}}, // https://www.felixcloutier.com/x86/pmaddubsw PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}}, // https://www.felixcloutier.com/x86/cvttpd2dq CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}}, } var registerToRegisterShiftOpcode = [instructionEnd]*struct { opcode []byte rPrefix rexPrefix modRMExtension byte }{ // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror ROLL: {opcode: []byte{0xd3}}, ROLQ: {opcode: []byte{0xd3}, rPrefix: rexPrefixW}, RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000}, RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: rexPrefixW}, // https://www.felixcloutier.com/x86/sal:sar:shl:shr SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000}, SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: rexPrefixW}, SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000}, SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: rexPrefixW}, SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000}, SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: rexPrefixW}, } func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) { // Alias for readability inst := n.instruction base := buf.Len() code := buf.Append(8)[:0] switch inst { case MOVL, MOVQ: var ( opcode []byte mandatoryPrefix byte srcOnModRMReg bool rPrefix rexPrefix ) srcIsFloat, dstIsFloat := isVectorRegister(n.srcReg), isVectorRegister(n.dstReg) f2f := srcIsFloat && dstIsFloat if f2f { // https://www.felixcloutier.com/x86/movq opcode, mandatoryPrefix = []byte{0x0f, 0x7e}, 0xf3 } else if srcIsFloat && !dstIsFloat { // https://www.felixcloutier.com/x86/movd:movq opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x7e}, 0x66, true } else if !srcIsFloat && dstIsFloat { // https://www.felixcloutier.com/x86/movd:movq opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x6e}, 0x66, false } else { // https://www.felixcloutier.com/x86/mov opcode, srcOnModRMReg = []byte{0x89}, true } rexPrefix, modRM, err := n.getRegisterToRegisterModRM(srcOnModRMReg) if err != nil { return err } rexPrefix |= rPrefix if inst == MOVQ && !f2f { rexPrefix |= rexPrefixW } if mandatoryPrefix != 0 { code = append(code, mandatoryPrefix) } if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode...) code = append(code, modRM) buf.Truncate(base + len(code)) return nil } if op := registerToRegisterOpcode[inst]; op != nil { rexPrefix, modRM, err := n.getRegisterToRegisterModRM(op.srcOnModRMReg) if err != nil { return err } rexPrefix |= op.rPrefix if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI { // If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix. // https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers rexPrefix |= rexPrefixDefault } if op.mandatoryPrefix != 0 { code = append(code, op.mandatoryPrefix) } if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, op.opcode...) code = append(code, modRM) if op.needArg { code = append(code, n.arg) } } else if op := registerToRegisterShiftOpcode[inst]; op != nil { reg3bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) rexPrefix |= op.rPrefix if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM := 0b11_000_000 | (op.modRMExtension) | reg3bits code = append(code, op.opcode...) code = append(code, modRM) } else { return errorEncodingUnsupported(n) } buf.Truncate(base + len(code)) return nil } func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) { rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true) if err != nil { return err } var opcode []byte var mandatoryPrefix byte var isShiftInstruction bool var needArg bool switch n.instruction { case CMPL: // https://www.felixcloutier.com/x86/cmp opcode = []byte{0x3b} case CMPQ: // https://www.felixcloutier.com/x86/cmp rexPrefix |= rexPrefixW opcode = []byte{0x3b} case MOVB: // https://www.felixcloutier.com/x86/mov opcode = []byte{0x88} // 1 byte register operands need default prefix for the following registers. if n.srcReg >= RegSP && n.srcReg <= RegDI { rexPrefix |= rexPrefixDefault } case MOVL: if isVectorRegister(n.srcReg) { // https://www.felixcloutier.com/x86/movd:movq opcode = []byte{0x0f, 0x7e} mandatoryPrefix = 0x66 } else { // https://www.felixcloutier.com/x86/mov opcode = []byte{0x89} } case MOVQ: if isVectorRegister(n.srcReg) { // https://www.felixcloutier.com/x86/movq opcode = []byte{0x0f, 0xd6} mandatoryPrefix = 0x66 } else { // https://www.felixcloutier.com/x86/mov rexPrefix |= rexPrefixW opcode = []byte{0x89} } case MOVW: // https://www.felixcloutier.com/x86/mov // Note: Need 0x66 to indicate that the operand size is 16-bit. // https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix mandatoryPrefix = 0x66 opcode = []byte{0x89} case SARL: // https://www.felixcloutier.com/x86/sal:sar:shl:shr modRM |= 0b00_111_000 opcode = []byte{0xd3} isShiftInstruction = true case SARQ: // https://www.felixcloutier.com/x86/sal:sar:shl:shr rexPrefix |= rexPrefixW modRM |= 0b00_111_000 opcode = []byte{0xd3} isShiftInstruction = true case SHLL: // https://www.felixcloutier.com/x86/sal:sar:shl:shr modRM |= 0b00_100_000 opcode = []byte{0xd3} isShiftInstruction = true case SHLQ: // https://www.felixcloutier.com/x86/sal:sar:shl:shr rexPrefix |= rexPrefixW modRM |= 0b00_100_000 opcode = []byte{0xd3} isShiftInstruction = true case SHRL: // https://www.felixcloutier.com/x86/sal:sar:shl:shr modRM |= 0b00_101_000 opcode = []byte{0xd3} isShiftInstruction = true case SHRQ: // https://www.felixcloutier.com/x86/sal:sar:shl:shr rexPrefix |= rexPrefixW modRM |= 0b00_101_000 opcode = []byte{0xd3} isShiftInstruction = true case ROLL: // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror opcode = []byte{0xd3} isShiftInstruction = true case ROLQ: // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror rexPrefix |= rexPrefixW opcode = []byte{0xd3} isShiftInstruction = true case RORL: // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror modRM |= 0b00_001_000 opcode = []byte{0xd3} isShiftInstruction = true case RORQ: // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror rexPrefix |= rexPrefixW opcode = []byte{0xd3} modRM |= 0b00_001_000 isShiftInstruction = true case MOVDQU: // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 mandatoryPrefix = 0xf3 opcode = []byte{0x0f, 0x7f} case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x14} needArg = true case PEXTRW: // https://www.felixcloutier.com/x86/pextrw mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x15} needArg = true case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x16} needArg = true case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq mandatoryPrefix = 0x66 rexPrefix |= rexPrefixW // REX.W opcode = []byte{0x0f, 0x3a, 0x16} needArg = true default: return errorEncodingUnsupported(n) } if !isShiftInstruction { srcReg3Bits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg) rexPrefix |= prefix modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg } else { if n.srcReg != RegCX { return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg)) } } base := buf.Len() code := buf.Append(16)[:0] if mandatoryPrefix != 0 { // https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix code = append(code, mandatoryPrefix) } if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode...) code = append(code, modRM) if sbiExist { code = append(code, sbi) } if displacementWidth != 0 { code = appendConst(code, n.dstConst, displacementWidth) } if needArg { code = append(code, n.arg) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeRegisterToConst(buf asm.Buffer, n *nodeImpl) (err error) { regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM) base := buf.Len() code := buf.Append(10)[:0] switch n.instruction { case CMPL, CMPQ: if n.instruction == CMPQ { prefix |= rexPrefixW } if prefix != rexPrefixNone { code = append(code, prefix) } is8bitConst := fitInSigned8bit(n.dstConst) // https://www.felixcloutier.com/x86/cmp if n.srcReg == RegAX && !is8bitConst { code = append(code, 0x3d) } else { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_111_000 | // CMP with immediate needs "/7" extension. regBits if is8bitConst { code = append(code, 0x83, modRM) } else { code = append(code, 0x81, modRM) } } default: err = errorEncodingUnsupported(n) } if fitInSigned8bit(n.dstConst) { code = append(code, byte(n.dstConst)) } else { code = appendUint32(code, uint32(n.dstConst)) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) finalizeReadInstructionAddressNode(code []byte, n *nodeImpl) (err error) { // Find the target instruction node. targetNode := n for ; targetNode != nil; targetNode = targetNode.next { if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction { targetNode = targetNode.next break } } if targetNode == nil { return errors.New("BUG: target instruction not found for read instruction address") } offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */) if offset >= math.MaxInt32 { return errors.New("BUG: too large offset for LEAQ instruction") } binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset))) return nil } func (a *AssemblerImpl) encodeReadInstructionAddress(buf asm.Buffer, n *nodeImpl) error { dstReg3Bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg) a.readInstructionAddressNodes = append(a.readInstructionAddressNodes, n) // https://www.felixcloutier.com/x86/lea opcode := byte(0x8d) rexPrefix |= rexPrefixW // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding. (dstReg3Bits << 3) // Place the dstReg on ModRM:reg. code := buf.Append(7) code[0] = rexPrefix code[1] = opcode code[2] = modRM binary.LittleEndian.PutUint32(code[3:], 0) // Preserve return nil } func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) { if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE { return a.encodeReadInstructionAddress(buf, n) } rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false) if err != nil { return err } dstReg3Bits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg) rexPrefix |= prefix modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg var mandatoryPrefix byte var opcode []byte var needArg bool switch n.instruction { case ADDL: // https://www.felixcloutier.com/x86/add opcode = []byte{0x03} case ADDQ: // https://www.felixcloutier.com/x86/add rexPrefix |= rexPrefixW opcode = []byte{0x03} case CMPL: // https://www.felixcloutier.com/x86/cmp opcode = []byte{0x39} case CMPQ: // https://www.felixcloutier.com/x86/cmp rexPrefix |= rexPrefixW opcode = []byte{0x39} case LEAQ: // https://www.felixcloutier.com/x86/lea rexPrefix |= rexPrefixW opcode = []byte{0x8d} case MOVBLSX: // https://www.felixcloutier.com/x86/movsx:movsxd opcode = []byte{0x0f, 0xbe} case MOVBLZX: // https://www.felixcloutier.com/x86/movzx opcode = []byte{0x0f, 0xb6} case MOVBQSX: // https://www.felixcloutier.com/x86/movsx:movsxd rexPrefix |= rexPrefixW opcode = []byte{0x0f, 0xbe} case MOVBQZX: // https://www.felixcloutier.com/x86/movzx rexPrefix |= rexPrefixW opcode = []byte{0x0f, 0xb6} case MOVLQSX: // https://www.felixcloutier.com/x86/movsx:movsxd rexPrefix |= rexPrefixW opcode = []byte{0x63} case MOVLQZX: // https://www.felixcloutier.com/x86/mov // Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and // that is semantically equivalent to MOV 32bit to 32bit. opcode = []byte{0x8B} case MOVL: // https://www.felixcloutier.com/x86/mov // Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and // that is semantically equivalent to MOV 32bit to 32bit. if isVectorRegister(n.dstReg) { // https://www.felixcloutier.com/x86/movd:movq opcode = []byte{0x0f, 0x6e} mandatoryPrefix = 0x66 } else { // https://www.felixcloutier.com/x86/mov opcode = []byte{0x8B} } case MOVQ: if isVectorRegister(n.dstReg) { // https://www.felixcloutier.com/x86/movq opcode = []byte{0x0f, 0x7e} mandatoryPrefix = 0xf3 } else { // https://www.felixcloutier.com/x86/mov rexPrefix |= rexPrefixW opcode = []byte{0x8B} } case MOVWLSX: // https://www.felixcloutier.com/x86/movsx:movsxd opcode = []byte{0x0f, 0xbf} case MOVWLZX: // https://www.felixcloutier.com/x86/movzx opcode = []byte{0x0f, 0xb7} case MOVWQSX: // https://www.felixcloutier.com/x86/movsx:movsxd rexPrefix |= rexPrefixW opcode = []byte{0x0f, 0xbf} case MOVWQZX: // https://www.felixcloutier.com/x86/movzx rexPrefix |= rexPrefixW opcode = []byte{0x0f, 0xb7} case SUBQ: // https://www.felixcloutier.com/x86/sub rexPrefix |= rexPrefixW opcode = []byte{0x2b} case SUBSD: // https://www.felixcloutier.com/x86/subsd opcode = []byte{0x0f, 0x5c} mandatoryPrefix = 0xf2 case SUBSS: // https://www.felixcloutier.com/x86/subss opcode = []byte{0x0f, 0x5c} mandatoryPrefix = 0xf3 case UCOMISD: // https://www.felixcloutier.com/x86/ucomisd opcode = []byte{0x0f, 0x2e} mandatoryPrefix = 0x66 case UCOMISS: // https://www.felixcloutier.com/x86/ucomiss opcode = []byte{0x0f, 0x2e} case MOVDQU: // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 mandatoryPrefix = 0xf3 opcode = []byte{0x0f, 0x6f} case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x20} case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x23} case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x25} case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x30} case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x33} case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x38, 0x35} case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x20} needArg = true case PINSRW: // https://www.felixcloutier.com/x86/pinsrw mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0xc4} needArg = true case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x22} needArg = true case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq rexPrefix |= rexPrefixW mandatoryPrefix = 0x66 opcode = []byte{0x0f, 0x3a, 0x22} needArg = true default: return errorEncodingUnsupported(n) } base := buf.Len() code := buf.Append(16)[:0] if mandatoryPrefix != 0 { // https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix code = append(code, mandatoryPrefix) } if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode...) code = append(code, modRM) if sbiExist { code = append(code, sbi) } if displacementWidth != 0 { code = appendConst(code, n.srcConst, displacementWidth) } if needArg { code = append(code, n.arg) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) { regBits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) isFloatReg := isVectorRegister(n.dstReg) switch n.instruction { case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD: if !isFloatReg { return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg)) } default: if isFloatReg { return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg)) } } if n.instruction != MOVQ && !fitIn32bit(n.srcConst) { return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) } else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) { return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) } else if (n.instruction == PSLLD || n.instruction == PSLLQ || n.instruction == PSRLD || n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) { return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) } base := buf.Len() code := buf.Append(32)[:0] isSigned8bitConst := fitInSigned8bit(n.srcConst) switch inst := n.instruction; inst { case ADDQ: // https://www.felixcloutier.com/x86/add rexPrefix |= rexPrefixW if n.dstReg == RegAX && !isSigned8bitConst { code = append(code, rexPrefix, 0x05) } else { modRM := 0b11_000_000 | // Specifying that opeand is register. regBits if isSigned8bitConst { code = append(code, rexPrefix, 0x83, modRM) } else { code = append(code, rexPrefix, 0x81, modRM) } } if isSigned8bitConst { code = append(code, byte(n.srcConst)) } else { code = appendUint32(code, uint32(n.srcConst)) } case ANDQ: // https://www.felixcloutier.com/x86/and rexPrefix |= rexPrefixW if n.dstReg == RegAX && !isSigned8bitConst { code = append(code, rexPrefix, 0x25) } else { modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_100_000 | // AND with immediate needs "/4" extension. regBits if isSigned8bitConst { code = append(code, rexPrefix, 0x83, modRM) } else { code = append(code, rexPrefix, 0x81, modRM) } } if fitInSigned8bit(n.srcConst) { code = append(code, byte(n.srcConst)) } else { code = appendUint32(code, uint32(n.srcConst)) } case TESTQ: // https://www.felixcloutier.com/x86/test rexPrefix |= rexPrefixW if n.dstReg == RegAX && !isSigned8bitConst { code = append(code, rexPrefix, 0xa9) } else { modRM := 0b11_000_000 | // Specifying that operand is register regBits code = append(code, rexPrefix, 0xf7, modRM) } code = appendUint32(code, uint32(n.srcConst)) case MOVL: // https://www.felixcloutier.com/x86/mov if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, 0xb8|regBits) code = appendUint32(code, uint32(n.srcConst)) case MOVQ: // https://www.felixcloutier.com/x86/mov if fitIn32bit(n.srcConst) { if n.srcConst > math.MaxInt32 { if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, 0xb8|regBits) } else { rexPrefix |= rexPrefixW modRM := 0b11_000_000 | // Specifying that opeand is register. regBits code = append(code, rexPrefix, 0xc7, modRM) } code = appendUint32(code, uint32(n.srcConst)) } else { rexPrefix |= rexPrefixW code = append(code, rexPrefix, 0xb8|regBits) code = appendUint64(code, uint64(n.srcConst)) } case SHLQ: // https://www.felixcloutier.com/x86/sal:sar:shl:shr rexPrefix |= rexPrefixW modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_100_000 | // SHL with immediate needs "/4" extension. regBits if n.srcConst == 1 { code = append(code, rexPrefix, 0xd1, modRM) } else { code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst)) } case SHRQ: // https://www.felixcloutier.com/x86/sal:sar:shl:shr rexPrefix |= rexPrefixW modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_101_000 | // SHR with immediate needs "/5" extension. regBits if n.srcConst == 1 { code = append(code, rexPrefix, 0xd1, modRM) } else { code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst)) } case PSLLD: // https://www.felixcloutier.com/x86/psllw:pslld:psllq modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_110_000 | // PSLL with immediate needs "/6" extension. regBits if rexPrefix != rexPrefixNone { code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst)) } else { code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst)) } case PSLLQ: // https://www.felixcloutier.com/x86/psllw:pslld:psllq modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_110_000 | // PSLL with immediate needs "/6" extension. regBits if rexPrefix != rexPrefixNone { code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst)) } else { code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst)) } case PSRLD: // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq // https://www.felixcloutier.com/x86/psllw:pslld:psllq modRM := 0b11_000_000 | // Specifying that operand is register. 0b00_010_000 | // PSRL with immediate needs "/2" extension. regBits if rexPrefix != rexPrefixNone { code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst)) } else { code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst)) } case PSRLQ: // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq modRM := 0b11_000_000 | // Specifying that operand is register. 0b00_010_000 | // PSRL with immediate needs "/2" extension. regBits if rexPrefix != rexPrefixNone { code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst)) } else { code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst)) } case PSRAW, PSRAD: // https://www.felixcloutier.com/x86/psraw:psrad:psraq modRM := 0b11_000_000 | // Specifying that operand is register. 0b00_100_000 | // PSRAW with immediate needs "/4" extension. regBits code = append(code, 0x66) if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } var op byte if inst == PSRAD { op = 0x72 } else { // PSRAW op = 0x71 } code = append(code, 0x0f, op, modRM, byte(n.srcConst)) case PSRLW: // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq modRM := 0b11_000_000 | // Specifying that operand is register. 0b00_010_000 | // PSRLW with immediate needs "/2" extension. regBits code = append(code, 0x66) if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst)) case PSLLW: // https://www.felixcloutier.com/x86/psllw:pslld:psllq modRM := 0b11_000_000 | // Specifying that operand is register. 0b00_110_000 | // PSLLW with immediate needs "/6" extension. regBits code = append(code, 0x66) if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst)) case XORL, XORQ: // https://www.felixcloutier.com/x86/xor if inst == XORQ { rexPrefix |= rexPrefixW } if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } if n.dstReg == RegAX && !isSigned8bitConst { code = append(code, 0x35) } else { modRM := 0b11_000_000 | // Specifying that opeand is register. 0b00_110_000 | // XOR with immediate needs "/6" extension. regBits if isSigned8bitConst { code = append(code, 0x83, modRM) } else { code = append(code, 0x81, modRM) } } if fitInSigned8bit(n.srcConst) { code = append(code, byte(n.srcConst)) } else { code = appendUint32(code, uint32(n.srcConst)) } default: err = errorEncodingUnsupported(n) } buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeMemoryToConst(buf asm.Buffer, n *nodeImpl) (err error) { if !fitIn32bit(n.dstConst) { return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction)) } rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false) if err != nil { return err } // Alias for readability. c := n.dstConst var opcode, constWidth byte switch n.instruction { case CMPL: // https://www.felixcloutier.com/x86/cmp if fitInSigned8bit(c) { opcode = 0x83 constWidth = 8 } else { opcode = 0x81 constWidth = 32 } modRM |= 0b00_111_000 default: return errorEncodingUnsupported(n) } base := buf.Len() code := buf.Append(20)[:0] if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode, modRM) if sbiExist { code = append(code, sbi) } if displacementWidth != 0 { code = appendConst(code, n.srcConst, displacementWidth) } code = appendConst(code, c, constWidth) buf.Truncate(base + len(code)) return } func (a *AssemblerImpl) encodeConstToMemory(buf asm.Buffer, n *nodeImpl) (err error) { rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true) if err != nil { return err } // Alias for readability. inst := n.instruction c := n.srcConst if inst == MOVB && !fitInSigned8bit(c) { return fmt.Errorf("too large load target const %d for MOVB", c) } else if !fitIn32bit(c) { return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction)) } var constWidth, opcode byte switch inst { case MOVB: opcode = 0xc6 constWidth = 8 case MOVL: opcode = 0xc7 constWidth = 32 case MOVQ: rexPrefix |= rexPrefixW opcode = 0xc7 constWidth = 32 default: return errorEncodingUnsupported(n) } base := buf.Len() code := buf.Append(20)[:0] if rexPrefix != rexPrefixNone { code = append(code, rexPrefix) } code = append(code, opcode, modRM) if sbiExist { code = append(code, sbi) } if displacementWidth != 0 { code = appendConst(code, n.dstConst, displacementWidth) } code = appendConst(code, c, constWidth) buf.Truncate(base + len(code)) return } func appendUint32(code []byte, v uint32) []byte { b := [4]byte{} binary.LittleEndian.PutUint32(b[:], uint32(v)) return append(code, b[:]...) } func appendUint64(code []byte, v uint64) []byte { b := [8]byte{} binary.LittleEndian.PutUint64(b[:], uint64(v)) return append(code, b[:]...) } func appendConst(code []byte, v int64, length byte) []byte { switch length { case 8: return append(code, byte(v)) case 32: return appendUint32(code, uint32(v)) default: return appendUint64(code, uint64(v)) } } func (n *nodeImpl) getMemoryLocation(dstMem bool) (p rexPrefix, modRM byte, sbi byte, sbiExist bool, displacementWidth byte, err error) { var baseReg, indexReg asm.Register var offset asm.ConstantValue var scale byte if dstMem { baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale } else { baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale } if !fitIn32bit(offset) { err = errors.New("offset does not fit in 32-bit integer") return } if baseReg == asm.NilRegister && indexReg != asm.NilRegister { // [(index*scale) + displacement] addressing is possible, but we haven't used it for now. err = errors.New("addressing without base register but with index is not implemented") } else if baseReg == asm.NilRegister { modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. sbi, sbiExist = byte(0b00_100_101), true displacementWidth = 32 } else if indexReg == asm.NilRegister { modRM, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) // Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0 // and otherwise [R/M]. withoutDisplacement := offset == 0 && // If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value // is zero since it's not [R/M] operand is not defined for these two registers. // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing baseReg != RegR13 && baseReg != RegBP if withoutDisplacement { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b00_000_000 // Specifying that operand is memory without displacement displacementWidth = 0 } else if fitInSigned8bit(offset) { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement. displacementWidth = 8 } else { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement. displacementWidth = 32 } // For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIP]. // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing // // Thefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement]. // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2 if baseReg == RegSP || baseReg == RegR12 { sbi, sbiExist = byte(0b00_100_100), true } } else { if indexReg == RegSP { err = errors.New("SP cannot be used for SIB index") return } modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. withoutDisplacement := offset == 0 && // For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod). baseReg != RegR13 && baseReg != RegBP if withoutDisplacement { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement displacementWidth = 0 } else if fitInSigned8bit(offset) { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement. displacementWidth = 8 } else { // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement. displacementWidth = 32 } var baseRegBits byte baseRegBits, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) var indexRegBits byte var indexRegPrefix rexPrefix indexRegBits, indexRegPrefix = register3bits(indexReg, registerSpecifierPositionSIBIndex) p |= indexRegPrefix sbi, sbiExist = baseRegBits|(indexRegBits<<3), true switch scale { case 1: sbi |= 0b00_000_000 case 2: sbi |= 0b01_000_000 case 4: sbi |= 0b10_000_000 case 8: sbi |= 0b11_000_000 default: err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale) return } } return } // getRegisterToRegisterModRM does XXXX // // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity. func (n *nodeImpl) getRegisterToRegisterModRM(srcOnModRMReg bool) (rexPrefix, modRM byte, err error) { var reg3bits, rm3bits byte if srcOnModRMReg { reg3bits, rexPrefix = register3bits(n.srcReg, // Indicate that srcReg will be specified by ModRM:reg. registerSpecifierPositionModRMFieldReg) var dstRexPrefix byte rm3bits, dstRexPrefix = register3bits(n.dstReg, // Indicate that dstReg will be specified by ModRM:r/m. registerSpecifierPositionModRMFieldRM) rexPrefix |= dstRexPrefix } else { rm3bits, rexPrefix = register3bits(n.srcReg, // Indicate that srcReg will be specified by ModRM:r/m. registerSpecifierPositionModRMFieldRM) var dstRexPrefix byte reg3bits, dstRexPrefix = register3bits(n.dstReg, // Indicate that dstReg will be specified by ModRM:reg. registerSpecifierPositionModRMFieldReg) rexPrefix |= dstRexPrefix } // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM modRM = 0b11_000_000 | // Specifying that dst operand is register. (reg3bits << 3) | rm3bits return } // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix type rexPrefix = byte // REX prefixes are independent of each other and can be combined with OR. const ( rexPrefixNone rexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix. rexPrefixDefault rexPrefix = 0b0100_0000 rexPrefixW = 0b0000_1000 | rexPrefixDefault // REX.W rexPrefixR = 0b0000_0100 | rexPrefixDefault // REX.R rexPrefixX = 0b0000_0010 | rexPrefixDefault // REX.X rexPrefixB = 0b0000_0001 | rexPrefixDefault // REX.B ) // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed. type registerSpecifierPosition byte const ( registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota registerSpecifierPositionModRMFieldRM registerSpecifierPositionSIBIndex ) var regInfo = [...]struct { bits byte needRex bool }{ RegAX: {bits: 0b000}, RegCX: {bits: 0b001}, RegDX: {bits: 0b010}, RegBX: {bits: 0b011}, RegSP: {bits: 0b100}, RegBP: {bits: 0b101}, RegSI: {bits: 0b110}, RegDI: {bits: 0b111}, RegR8: {bits: 0b000, needRex: true}, RegR9: {bits: 0b001, needRex: true}, RegR10: {bits: 0b010, needRex: true}, RegR11: {bits: 0b011, needRex: true}, RegR12: {bits: 0b100, needRex: true}, RegR13: {bits: 0b101, needRex: true}, RegR14: {bits: 0b110, needRex: true}, RegR15: {bits: 0b111, needRex: true}, RegX0: {bits: 0b000}, RegX1: {bits: 0b001}, RegX2: {bits: 0b010}, RegX3: {bits: 0b011}, RegX4: {bits: 0b100}, RegX5: {bits: 0b101}, RegX6: {bits: 0b110}, RegX7: {bits: 0b111}, RegX8: {bits: 0b000, needRex: true}, RegX9: {bits: 0b001, needRex: true}, RegX10: {bits: 0b010, needRex: true}, RegX11: {bits: 0b011, needRex: true}, RegX12: {bits: 0b100, needRex: true}, RegX13: {bits: 0b101, needRex: true}, RegX14: {bits: 0b110, needRex: true}, RegX15: {bits: 0b111, needRex: true}, } func register3bits( reg asm.Register, registerSpecifierPosition registerSpecifierPosition, ) (bits byte, prefix rexPrefix) { info := regInfo[reg] bits = info.bits if info.needRex { // https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix switch registerSpecifierPosition { case registerSpecifierPositionModRMFieldReg: prefix = rexPrefixR case registerSpecifierPositionModRMFieldRM: prefix = rexPrefixB case registerSpecifierPositionSIBIndex: prefix = rexPrefixX } } return } func fitIn32bit(v int64) bool { return math.MinInt32 <= v && v <= math.MaxUint32 } func fitInSigned8bit(v int64) bool { return math.MinInt8 <= v && v <= math.MaxInt8 } func isVectorRegister(r asm.Register) bool { return RegX0 <= r && r <= RegX15 }