1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Assembler to produce x86-64 instructions. Somewhat influenced by V8 assembler.
18 
19 #ifndef BERBERIS_ASSEMBLER_X86_64_H_
20 #define BERBERIS_ASSEMBLER_X86_64_H_
21 
22 #include <type_traits>  // std::is_same
23 
24 #include "berberis/assembler/common_x86.h"
25 #include "berberis/base/logging.h"
26 #include "berberis/base/macros.h"  // DISALLOW_IMPLICIT_CONSTRUCTORS
27 
28 namespace berberis {
29 
// Forward declaration of the code buffer type taken by the Assembler constructor.
class MachineCode;
31 
32 namespace x86_64 {
33 
34 class Assembler : public AssemblerX86<Assembler> {
35  public:
Assembler(MachineCode * code)36   explicit Assembler(MachineCode* code) : AssemblerX86(code) {}
37 
  // General-purpose registers. The initializer is the register number: the low three bits are
  // what EmitModRM/EmitRegisterInOpcode place into the instruction (num & 0b111), and bit 3
  // (num & 0b1000) is what Rex()/EmitVex() move into the REX or VEX prefix.
  //
  // no_register is the "absent" marker compared against Operand::base and Operand::index in
  // EmitOperandOp. Its value (0x80) has bit 3 clear, so it never contributes a prefix bit.
  static constexpr Register no_register{0x80};
  static constexpr Register rax{0};
  static constexpr Register rcx{1};
  static constexpr Register rdx{2};
  static constexpr Register rbx{3};
  static constexpr Register rsp{4};
  static constexpr Register rbp{5};
  static constexpr Register rsi{6};
  static constexpr Register rdi{7};
  static constexpr Register r8{8};
  static constexpr Register r9{9};
  static constexpr Register r10{10};
  static constexpr Register r11{11};
  static constexpr Register r12{12};
  static constexpr Register r13{13};
  static constexpr Register r14{14};
  static constexpr Register r15{15};

  // Vector registers, numbered the same way; no_xmm_register is the matching "absent" marker.
  static constexpr XMMRegister no_xmm_register{0x80};
  static constexpr XMMRegister xmm0{0};
  static constexpr XMMRegister xmm1{1};
  static constexpr XMMRegister xmm2{2};
  static constexpr XMMRegister xmm3{3};
  static constexpr XMMRegister xmm4{4};
  static constexpr XMMRegister xmm5{5};
  static constexpr XMMRegister xmm6{6};
  static constexpr XMMRegister xmm7{7};
  static constexpr XMMRegister xmm8{8};
  static constexpr XMMRegister xmm9{9};
  static constexpr XMMRegister xmm10{10};
  static constexpr XMMRegister xmm11{11};
  static constexpr XMMRegister xmm12{12};
  static constexpr XMMRegister xmm13{13};
  static constexpr XMMRegister xmm14{14};
  static constexpr XMMRegister xmm15{15};

  // Macroassembler uses these names to support both x86-32 and x86-64 modes.
  // They carry the same register numbers as rax, rcx, rdx and rsp above.
  static constexpr Register gpr_a{0};
  static constexpr Register gpr_c{1};
  static constexpr Register gpr_d{2};
  static constexpr Register gpr_s{4};
79 
80 // Instructions.
81 #include "berberis/assembler/gen_assembler_x86_64-inl.h"  // NOLINT generated file!
82 
  // Historical curiosity: x86-32 mode has Movq for memory-to-xmm operations.
  // x86-64 added another one with a different opcode, but since the two are functionally
  // equivalent, GNU Assembler and Clang use the old one in both 32-bit and 64-bit modes.
  // We do the same.
87 
88   // Unhide Decl(Mem) hidden by Decl(Reg).
89   using AssemblerX86::Decl;
90 
91   // Unhide Decw(Mem) hidden by Decw(Reg).
92   using AssemblerX86::Decw;
93 
94   // Unhide Incl(Mem) hidden by Incl(Reg).
95   using AssemblerX86::Incl;
96 
97   // Unhide Incw(Mem) hidden by Incw(Reg).
98   using AssemblerX86::Incw;
99 
100   // Unhide Movq(Mem, XMMReg) and Movq(XMMReg, Mem) hidden by Movq(Reg, Imm) and many others.
101   using AssemblerX86::Movq;
102 
103   // Unhide Xchgl(Mem, Reg) hidden by modified version below.
104   using AssemblerX86::Xchgl;
105 
106   // Unhide Vmov*(Mem, Reg) hidden by Vmov*(Reg, Reg).
107   using AssemblerX86::Vmovapd;
108   using AssemblerX86::Vmovaps;
109   using AssemblerX86::Vmovdqa;
110   using AssemblerX86::Vmovdqu;
111   using AssemblerX86::Vmovq;
112   using AssemblerX86::Vmovsd;
113   using AssemblerX86::Vmovss;
114 
Xchgl(Register dest,Register src)115   void Xchgl(Register dest, Register src) {
116     // In 32-bit mode "xchgl %eax, %eax" did nothing and was often reused as "nop".
117     //
118     // On x86-64 "xchgl %eax, %eax" clears top half of %eax register, but having single-byte nop
119     // is too convenient, thus, as special exception, 0x90 is not interpreted as "xchgl %eax, %eax",
120     // but was kept as "nop" - thus longer encoding for "xchgl %eax, %eax" must be used.
121 
122     if (IsAccumulator(src) && IsAccumulator(dest)) {
123       Emit16(0xc087);
124     } else {
125       AssemblerX86::Xchgl(dest, src);
126     }
127   }
128 
129   // TODO(b/127356868): decide what to do with these functions when cross-arch assembler is used.
130 
131 #ifdef __amd64__
132 
133   // Unhide Call(Reg), hidden by special version below.
134   using AssemblerX86::Call;
135 
Call(const void * target)136   void Call(const void* target) {
137     // There are no call instruction with properties we need thus we emulate it.
138     // This is what the following code looks like when decoded with objdump (if
139     // target address is 0x123456789abcdef0):
140     //   0: ff 15 02 00 00 00        callq  *0x2(%rip) # 0x8
141     //   6: eb 08                    jmp    0x10
142     //   8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
143     // First we do call - with address taken from last 8 bytes, then we jump over
144     // these 8 bytes.
145     Emit64(0x08eb0000000215ff);
146     Emit64(bit_cast<int64_t>(target));
147   }
148 
149   // Unhide Jcc(Label), hidden by special version below.
150   using AssemblerX86::Jcc;
151 
152   // Make sure only type void* can be passed to function below, not Label* or any other type.
153   template <typename T>
154   auto Jcc(Condition cc, T* target) -> void = delete;
155 
Jcc(Condition cc,const void * target)156   void Jcc(Condition cc, const void* target) {
157     if (cc == Condition::kAlways) {
158       Jmp(target);
159       return;
160     } else if (cc == Condition::kNever) {
161       return;
162     }
163     CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
164     // There are no Jcc instruction with properties we need thus we emulate it.
165     // This is what the following code looks like when decoded with objdump (if
166     // target address is 0x123456789abcdef0):
167     //   0: 75 0e                   jne    0x10
168     //   2: ff 25 00 00 00 00       jmpq   *0x0(%rip) # 0x8
169     //   8: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
170     // We are doing relative jump for the inverted condition (because Jcc could
171     // only jump ±2GiB and in 64 bit mode which is not enough to reach arbitrary
172     // address), then jmpq with address stored right after jmpq.
173     Emit64(0x0000000025ff'0e70 | static_cast<int8_t>(ToReverseCond(cc)));
174     Emit64(bit_cast<int64_t>(target));
175   }
176 
177   // Unhide Jmp(Reg), hidden by special version below.
178   using AssemblerX86::Jmp;
179 
180   // Make sure only type void* can be passed to function below, not Label* or any other type.
181   template <typename T>
182   auto Jmp(T* target) -> void = delete;
183 
184   void Jmp(const void* target) {
185     // There are no jump instruction with properties we need thus we emulate it.
186     // This is what the following code looks like when decoded with objdump (if
187     // target address is 0x123456789abcdef0):
188     //   0: ff 25 00 00 00 00       jmpq   *0x0(%rip) # 0x6
189     //   6: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
190     // We are doing jump to the address stored right after jmpq using %rip-relative
191     // addressing (with offset 0).
192     Emit16(0x25ff);
193     Emit32(0x00000000);
194     Emit64(bit_cast<int64_t>(target));
195   }
196 
197 #endif
198 
199  private:
200   DISALLOW_IMPLICIT_CONSTRUCTORS(Assembler);
201 
202   static Register Accumulator() { return rax; }
203   static bool IsAccumulator(Register reg) { return reg == rax; }
204 
205   struct Register64Bit {
206     explicit constexpr Register64Bit(Register reg) : num(reg.num) {}
207     uint8_t num;
208   };
209 
210   struct Memory64Bit {
211     explicit Memory64Bit(const Operand& op) : operand(op) {}
212     Operand operand;
213   };
214 
215   struct Label64Bit {
216     explicit Label64Bit(const LabelOperand& l) : label(l.label) {}
217     const Label& label;
218   };
219 
  // These types are only used by CmpXchg16b and act similarly to Memory64Bit/Label64Bit there.
  // Being plain aliases, they select exactly the same overloads (e.g. Rex(Memory64Bit)) and
  // satisfy the same IsMemoryOperand/IsLabelOperand checks as their 64-bit counterparts.
  using Memory128Bit = Memory64Bit;
  using Label128Bit = Label64Bit;
223 
224   // Check if a given type is "a register with size" (for EmitInstruction).
225   template <typename ArgumentType>
226   struct IsRegister {
227     static constexpr bool value = std::is_same_v<ArgumentType, Register8Bit> ||
228                                   std::is_same_v<ArgumentType, Register32Bit> ||
229                                   std::is_same_v<ArgumentType, Register64Bit>;
230   };
231 
232   // Check if a given type is "a memory operand with size" (for EmitInstruction).
233   template <typename ArgumentType>
234   struct IsMemoryOperand {
235     static constexpr bool value =
236         std::is_same_v<ArgumentType, Memory32Bit> || std::is_same_v<ArgumentType, Memory64Bit>;
237   };
238 
239   template <typename ArgumentType>
240   struct IsLabelOperand {
241     static constexpr bool value =
242         std::is_same_v<ArgumentType, Label32Bit> || std::is_same_v<ArgumentType, Label64Bit>;
243   };
244 
245   template <typename... ArgumentsTypes>
246   void EmitRex(ArgumentsTypes... arguments) {
247     constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
248     constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
249     static_assert(registers_count + operands_count <= 2,
250                   "Only two-arguments instructions are supported, not VEX or EVEX");
251     uint8_t rex = 0;
252     if constexpr (registers_count == 2) {
253       rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
254             Rex<0b0001>(ArgumentByType<1, IsRegister>(arguments...));
255     } else if constexpr (registers_count == 1 && operands_count == 1) {
256       rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
257             Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
258     } else if constexpr (registers_count == 1) {
259       rex = Rex<0b0001>(ArgumentByType<0, IsRegister>(arguments...));
260     } else if constexpr (operands_count == 1) {
261       rex = Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
262     }
263     if (rex) {
264       Emit8(rex);
265     }
266   }
267 
268   template <uint8_t base_rex, typename ArgumentType>
269   uint8_t Rex(ArgumentType argument) {
270     if (argument.num & 0b1000) {
271       // 64-bit argument requires REX.W bit
272       if (std::is_same_v<ArgumentType, Register64Bit>) {
273         return 0b0100'1000 | base_rex;
274       }
275       return 0b0100'0000 | base_rex;
276     }
277     // 8-bit argument requires REX (even if without any bits).
278     if (std::is_same_v<ArgumentType, Register8Bit> && argument.num > 3) {
279       return 0b0100'0000;
280     }
281     if (std::is_same_v<ArgumentType, Register64Bit>) {
282       return 0b0100'1000;
283     }
284     return 0;
285   }
286 
Rex(Operand operand)287   uint8_t Rex(Operand operand) {
288     // REX.B and REX.X always come from operand.
289     uint8_t rex = ((operand.base.num & 0b1000) >> 3) | ((operand.index.num & 0b1000) >> 2);
290     if (rex) {
291       // We actually need rex byte here.
292       return 0b0100'0000 | rex;
293     } else {
294       return 0;
295     }
296   }
297 
Rex(Memory32Bit operand)298   uint8_t Rex(Memory32Bit operand) { return Rex(operand.operand); }
299 
Rex(Memory64Bit operand)300   uint8_t Rex(Memory64Bit operand) {
301     // 64-bit argument requires REX.W bit - and thus REX itself.
302     return 0b0100'1000 | Rex(operand.operand);
303   }
304 
305   template <typename RegisterType>
306   [[nodiscard]] static bool IsSwapProfitable(RegisterType rm_arg, RegisterType vex_arg) {
307     // In 64bit mode we may use more compact encoding if operand encoded in rm is low register.
308     // Return true if we may achieve that by swapping arguments.
309     return rm_arg.num >= 8 && vex_arg.num < 8;
310   }
311 
312   template <uint8_t byte1,
313             uint8_t byte2,
314             uint8_t byte3,
315             bool reg_is_opcode_extension,
316             typename... ArgumentsTypes>
317   void EmitVex(ArgumentsTypes... arguments) {
318     constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
319     constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
320     constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
321     constexpr auto vvvv_parameter = 2 - reg_is_opcode_extension - operands_count - labels_count;
322     int vvvv = 0;
323     if constexpr (registers_count > vvvv_parameter) {
324       vvvv = ArgumentByType<vvvv_parameter, IsRegister>(arguments...).num;
325     }
326     auto vex2 = byte2 | 0b111'00000;
327     if constexpr (operands_count == 1) {
328       auto operand = ArgumentByType<0, IsMemoryOperand>(arguments...);
329       vex2 ^= (operand.operand.base.num & 0b1000) << 2;
330       vex2 ^= (operand.operand.index.num & 0b1000) << 3;
331       if constexpr (!reg_is_opcode_extension) {
332         vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
333       }
334     } else if constexpr (labels_count == 1) {
335       if constexpr (!reg_is_opcode_extension) {
336         vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
337       }
338     } else if constexpr (registers_count > 0) {
339       if constexpr (reg_is_opcode_extension) {
340         vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 2;
341       } else {
342         vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
343         vex2 ^= (ArgumentByType<1, IsRegister>(arguments...).num & 0b1000) << 2;
344       }
345     }
346     if (byte1 == 0xC4 && (vex2 & 0b0'1'1'11111) == 0b0'1'1'00001 && (byte3 & 0b1'0000'0'00) == 0) {
347       Emit16((0xc5 | ((vex2 & 0b1'0'0'00000) << 8) | (byte3 << 8) |
348               0b0'1111'000'00000000) ^ (vvvv << 11));
349     } else {
350       Emit8(byte1);
351       Emit16((vex2 | (byte3 << 8) | 0b0'1111'000'00000000) ^ (vvvv << 11));
352     }
353   }
354 
355   template <typename ArgumentType>
EmitRegisterInOpcode(uint8_t opcode,ArgumentType argument)356   void EmitRegisterInOpcode(uint8_t opcode, ArgumentType argument) {
357     Emit8(opcode | (argument.num & 0b111));
358   }
359 
360   template <typename ArgumentType1, typename ArgumentType2>
EmitModRM(ArgumentType1 argument1,ArgumentType2 argument2)361   void EmitModRM(ArgumentType1 argument1, ArgumentType2 argument2) {
362     Emit8(0xC0 | ((argument1.num & 0b111) << 3) | (argument2.num & 0b111));
363   }
364 
365   template <typename ArgumentType>
EmitModRM(uint8_t opcode_extension,ArgumentType argument)366   void EmitModRM(uint8_t opcode_extension, ArgumentType argument) {
367     CHECK_LE(opcode_extension, 0b111);
368     Emit8(0xC0 | (opcode_extension << 3) | (argument.num & 0b111));
369   }
370 
371   template <typename ArgumentType>
EmitOperandOp(ArgumentType argument,Operand operand)372   void EmitOperandOp(ArgumentType argument, Operand operand) {
373     EmitOperandOp(static_cast<int>(argument.num & 0b111), operand);
374   }
375 
376   template <size_t kImmediatesSize, typename ArgumentType>
EmitRipOp(ArgumentType argument,const Label & label)377   void EmitRipOp(ArgumentType argument, const Label& label) {
378     EmitRipOp<kImmediatesSize>(static_cast<int>(argument.num) & 0b111, label);
379   }
380 
381   // Emit the ModR/M byte, and optionally the SIB byte and
382   // 1- or 4-byte offset for a memory operand.  Also used to encode
383   // a three-bit opcode extension into the ModR/M byte.
384   void EmitOperandOp(int number, const Operand& addr);
385   // Helper functions to handle various ModR/M and SIB combinations.
386   // Should *only* be called from EmitOperandOp!
387   void EmitIndexDispOperand(int reg, const Operand& addr);
388   template <typename ArgType, void (AssemblerBase::*)(ArgType)>
389   void EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr);
  // Emit ModR/M for rip-addressing.
391   template <size_t kImmediatesSize>
392   void EmitRipOp(int num, const Label& label);
393 
394   friend AssemblerX86<Assembler>;
395 };
396 
397 // This function looks big, but when we are emitting Operand with fixed registers
398 // (which is the most common case) all "if"s below are calculated statically which
399 // makes effective size of that function very small.
400 //
401 // But for this to happen function have to be inline and in header.
EmitOperandOp(int number,const Operand & addr)402 inline void Assembler::EmitOperandOp(int number, const Operand& addr) {
403   // Additional info (register number, etc) is limited to 3 bits.
404   CHECK_LE(unsigned(number), 7);
405 
406   // Reg field must be shifted by 3 bits.
407   int reg = number << 3;
408 
409   // On x86 %rsp cannot be index, only base.
410   CHECK(addr.index != rsp);
411 
412   // If base is not %rsp/r12 and we don't have index, then we don't have SIB byte.
413   // All other cases have "ModR/M" and SIB bytes.
414   if (addr.base != rsp && addr.base != r12 && addr.index == no_register) {
415     // If we have base register then we could use the same logic as for other common cases.
416     if (addr.base != no_register) {
417       EmitBaseIndexDispOperand<uint8_t, &Assembler::Emit8>((addr.base.num & 7) | reg, addr);
418     } else {
419       Emit16(0x2504 | reg);
420       Emit32(addr.disp);
421     }
422   } else if (addr.index == no_register) {
423     // Note: when ModR/M and SIB are used "no index" is encoded as if %rsp is used in place of
424     // index (that's why %rsp couldn't be used as index - see check above).
425     EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(0x2004 | ((addr.base.num & 7) << 8) | reg,
426                                                           addr);
427   } else if (addr.base == no_register) {
428     EmitIndexDispOperand(reg, addr);
429   } else {
430     EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
431         0x04 | (addr.scale << 14) | ((addr.index.num & 7) << 11) | ((addr.base.num & 7) << 8) | reg,
432         addr);
433   }
434 }
435 
EmitIndexDispOperand(int reg,const Operand & addr)436 inline void Assembler::EmitIndexDispOperand(int reg, const Operand& addr) {
437   // We only have index here, no base, use SIB but put %rbp in "base" field.
438   Emit16(0x0504 | (addr.scale << 14) | ((addr.index.num & 7) << 11) | reg);
439   Emit32(addr.disp);
440 }
441 
442 template <size_t kImmediatesSize>
EmitRipOp(int num,const Label & label)443 inline void Assembler::EmitRipOp(int num, const Label& label) {
444   Emit8(0x05 | (num << 3));
445   jumps_.push_back(Jump{&label, pc(), false});
446   Emit32(0xfffffffc - kImmediatesSize);
447 }
448 
449 template <typename ArgType, void (AssemblerBase::*EmitBase)(ArgType)>
EmitBaseIndexDispOperand(int base_modrm_and_sib,const Operand & addr)450 inline void Assembler::EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr) {
451   if (addr.disp == 0 && addr.base != rbp && addr.base != r13) {
452     // We can omit zero displacement only if base isn't %rbp/%r13
453     (this->*EmitBase)(base_modrm_and_sib);
454   } else if (IsInRange<int8_t>(addr.disp)) {
455     // If disp could it in byte then use byte-disp.
456     (this->*EmitBase)(base_modrm_and_sib | 0x40);
457     Emit8(addr.disp);
458   } else {
459     // Otherwise use full-disp.
460     (this->*EmitBase)(base_modrm_and_sib | 0x80);
461     Emit32(addr.disp);
462   }
463 }
464 
Movq(Register dest,int64_t imm64)465 inline void Assembler::Movq(Register dest, int64_t imm64) {
466   if (IsInRange<uint32_t>(imm64)) {
467     // Shorter encoding.
468     Movl(dest, static_cast<uint32_t>(imm64));
469   } else if (IsInRange<int32_t>(imm64)) {
470     // Slightly longer encoding.
471     EmitInstruction<Opcodes<0xc7, 0x00>>(Register64Bit(dest), static_cast<int32_t>(imm64));
472   } else {
473     // Longest encoding.
474     EmitInstruction<Opcodes<0xb8>>(Register64Bit(dest), imm64);
475   }
476 }
477 
Vmovapd(XMMRegister arg0,XMMRegister arg1)478 inline void Assembler::Vmovapd(XMMRegister arg0, XMMRegister arg1) {
479   if (arg0.num < 8 && arg1.num >= 8) {
480     return EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x29>>(VectorRegister128Bit(arg1),
481                                                             VectorRegister128Bit(arg0));
482   }
483   EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x28>>(VectorRegister128Bit(arg0),
484                                                    VectorRegister128Bit(arg1));
485 }
486 
Vmovaps(XMMRegister arg0,XMMRegister arg1)487 inline void Assembler::Vmovaps(XMMRegister arg0, XMMRegister arg1) {
488   if (arg0.num < 8 && arg1.num >= 8) {
489     return EmitInstruction<Opcodes<0xc4, 0x01, 0x00, 0x29>>(VectorRegister128Bit(arg1),
490                                                             VectorRegister128Bit(arg0));
491   }
492   EmitInstruction<Opcodes<0xc4, 0x01, 0x00, 0x28>>(VectorRegister128Bit(arg0),
493                                                    VectorRegister128Bit(arg1));
494 }
495 
Vmovdqa(XMMRegister arg0,XMMRegister arg1)496 inline void Assembler::Vmovdqa(XMMRegister arg0, XMMRegister arg1) {
497   if (arg0.num < 8 && arg1.num >= 8) {
498     return EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x7F>>(VectorRegister128Bit(arg1),
499                                                             VectorRegister128Bit(arg0));
500   }
501   EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x6F>>(VectorRegister128Bit(arg0),
502                                                    VectorRegister128Bit(arg1));
503 }
504 
Vmovdqu(XMMRegister arg0,XMMRegister arg1)505 inline void Assembler::Vmovdqu(XMMRegister arg0, XMMRegister arg1) {
506   if (arg0.num < 8 && arg1.num >= 8) {
507     return EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x7F>>(VectorRegister128Bit(arg1),
508                                                             VectorRegister128Bit(arg0));
509   }
510   EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x6F>>(VectorRegister128Bit(arg0),
511                                                    VectorRegister128Bit(arg1));
512 }
513 
Vmovsd(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)514 inline void Assembler::Vmovsd(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
515   if (arg0.num < 8 && arg2.num >= 8) {
516     return EmitInstruction<Opcodes<0xc4, 0x01, 0x03, 0x11>>(
517         VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
518   }
519   EmitInstruction<Opcodes<0xc4, 0x01, 0x03, 0x10>>(
520       VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
521 }
522 
Vmovss(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)523 inline void Assembler::Vmovss(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
524   if (arg0.num < 8 && arg2.num >= 8) {
525     return EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x11>>(
526         VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
527   }
528   EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x10>>(
529       VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
530 }
531 
Xchgq(Register dest,Register src)532 inline void Assembler::Xchgq(Register dest, Register src) {
533   // We compare output to that from clang and thus want to produce the same code.
534   // 0x48 0x90 is suboptimal encoding for that operation (pure 0x90 does the same
535   // and this is what gcc + gas are producing), but this is what clang <= 8 does.
536   if (IsAccumulator(src) && IsAccumulator(dest)) {
537     Emit8(0x90);
538   } else if (IsAccumulator(src) || IsAccumulator(dest)) {
539     Register other = IsAccumulator(src) ? dest : src;
540     EmitInstruction<Opcodes<0x90>>(Register64Bit(other));
541   } else {
542   // Clang 8 (after r330298) puts dest before src.  We are comparing output
543   // to clang in exhaustive test thus we want to match clang behavior exactly.
544     EmitInstruction<Opcodes<0x87>>(Register64Bit(dest), Register64Bit(src));
545   }
546 }
547 
548 }  // namespace x86_64
549 
550 }  // namespace berberis
551 
552 #endif  // BERBERIS_ASSEMBLER_X86_64_H_
553