Add x64 codegen support for float-to-unsigned-64-bit conversion

nmraz · nmraz · commit eab658b2b5d3 · 2025-07-07T22:44:51.000+03:00
Right now, this only works with the small-pic internal code model
because it needs to access the constant pool.

Note that this pseudo-operation would also benefit from regalloc "soft
ties".
diff --git a/crates/codegen/src/target/x64.rs b/crates/codegen/src/target/x64.rs
@@ -253,6 +253,7 @@ pub enum X64Instr {
     Cvtsi2s(OperandSize, SseFpuPrecision),
     Cvts2si(OperandSize, SseFpuPrecision),
     PseudoUint64ToFloat(SseFpuPrecision),
+    PseudoFloatToUint64Rel(SseFpuPrecision),
     MovGprmXmm(OperandSize),
     /// Load from [rbp + offset]
     MovRRbp {
@@ -307,6 +308,7 @@ impl X64Instr {
             X64Instr::Cvtsi2s(..) => false,
             X64Instr::Cvts2si(..) => false,
             X64Instr::PseudoUint64ToFloat(..) => false,
+            X64Instr::PseudoFloatToUint64Rel(..) => false,
             X64Instr::MovGprmXmm(..) => false,
             X64Instr::MovRRbp { .. } => false,
             X64Instr::MovsRRbp { .. } => false,
@@ -353,6 +355,7 @@ impl X64Instr {
             X64Instr::Cvtsi2s(..) => false,
             X64Instr::Cvts2si(..) => false,
             X64Instr::PseudoUint64ToFloat(..) => true,
+            X64Instr::PseudoFloatToUint64Rel(..) => true,
             X64Instr::MovGprmXmm(..) => false,
             X64Instr::Setcc(..) => false,
             X64Instr::MovRRbp { .. } => false,
diff --git a/crates/codegen/src/target/x64/emit.rs b/crates/codegen/src/target/x64/emit.rs
@@ -4,6 +4,7 @@ use ir::node::FunctionRef;
 use crate::{
     cfg::Block,
     code_buffer::{BufferRelocTarget, CodeBuffer, FixupKind, InstrAnchor, InstrSink, Label},
+    constpool::Constant,
     emit::{EmitContext, EmitInstrData},
     frame::FrameLayout,
     lir::{Instr, PhysReg, PhysRegSet, StackSlot},
@@ -323,6 +324,14 @@ impl MachineEmit for X64Machine {
                 defs[1].as_reg().unwrap(),
                 defs[2].as_reg().unwrap(),
             ),
+            &X64Instr::PseudoFloatToUint64Rel(prec) => emit_float_to_uint64_rel(
+                buffer,
+                prec,
+                defs[0].as_reg().unwrap(),
+                uses[0].as_reg().unwrap(),
+                defs[1].as_reg().unwrap(),
+                defs[2].as_reg().unwrap(),
+            ),
             &X64Instr::MovGprmXmm(op_size) => emit_mov_gprm_xmm(
                 buffer,
                 op_size,
@@ -607,6 +616,63 @@ fn emit_uint64_to_float(
     buffer.bind_label(done);
 }
 
+fn emit_float_to_uint64_rel(
+    buffer: &mut CodeBuffer<X64Fixup>,
+    prec: SseFpuPrecision,
+    dest: PhysReg,
+    src: PhysReg,
+    tmp_xmm1: PhysReg,
+    tmp_xmm2: PhysReg,
+) {
+    // Emit the following, where C_f_1p63 is 2^63 in precision `prec`. Inputs >= 2^63 are rebased
+    // by -2^63 before the signed conversion, and bit 63 of the result is then complemented:
+    //         movs[sd] tmp_xmm1, [rip + C_f_1p63]
+    //         ucomis[sd] src, tmp_xmm1
+    //         jae high_bit_set
+    //         cvts[sd]2si dest, src
+    //         jmp done
+    //     high_bit_set:
+    //         movaps tmp_xmm2, src    (omitted when tmp_xmm2 == src)
+    //         subs[sd] tmp_xmm2, tmp_xmm1
+    //         cvts[sd]2si dest, tmp_xmm2
+    //         btc dest, 63
+    //     done:

+    let high_bit_set = buffer.create_label();
+    let done = buffer.create_label();
+
+    let f_1p63 = get_f_1p63(buffer, prec);
+    emit_movs_r_rm_rip_reloc(buffer, prec, tmp_xmm1, BufferRelocTarget::Constant(f_1p63));
+    emit_ucomi(buffer, prec, src, RegMem::Reg(tmp_xmm1));
+    emit_jcc(buffer, CondCode::Ae, high_bit_set);

+    emit_cvts2si(buffer, OperandSize::S64, prec, dest, RegMem::Reg(src));
+    emit_jmp(buffer, done);

+    buffer.bind_label(high_bit_set);
+    if tmp_xmm2 != src {
+        emit_movaps_r_rm(buffer, tmp_xmm2, RegMem::Reg(src));
+    }
+    emit_sse_fpu_r_rm(
+        buffer,
+        prec,
+        SseFpuBinOp::Sub,
+        tmp_xmm2,
+        RegMem::Reg(tmp_xmm1),
+    );
+    emit_cvts2si(buffer, OperandSize::S64, prec, dest, RegMem::Reg(tmp_xmm2));
+    emit_btc_rm_i(buffer, OperandSize::S64, RegMem::Reg(dest), 63);

+    buffer.bind_label(done);
+}
+
+fn get_f_1p63(buffer: &mut CodeBuffer<X64Fixup>, prec: SseFpuPrecision) -> Constant {
+    match prec { // The literals below are the IEEE 754 bit patterns of 2^63 in each precision.
+        SseFpuPrecision::Single => buffer.get_constant(4, &0x5f000000u32.to_le_bytes()),
+        SseFpuPrecision::Double => buffer.get_constant(8, &0x43e0000000000000u64.to_le_bytes()),
+    }
+}
+
 // Single-instruction emission helpers
 
 fn emit_push(buffer: &mut CodeBuffer<X64Fixup>, reg: PhysReg) {
@@ -960,6 +1026,19 @@ fn emit_setcc_r(buffer: &mut CodeBuffer<X64Fixup>, code: CondCode, dest: PhysReg
     });
 }
 
+fn emit_btc_rm_i(buffer: &mut CodeBuffer<X64Fixup>, op_size: OperandSize, arg: RegMem, imm: u8) {
+    let (rex, modrm_sib) = encode_reg_mem_parts(arg, |rex| {
+        rex.encode_operand_size(op_size);
+        0x7 // `/7` opcode extension: `0F BA /7 ib` is `btc r/m, imm8`.
+    });
+    buffer.instr(|sink| {
+        rex.emit(sink);
+        sink.emit(&[0xf, 0xba]); // Two-byte opcode; the operation is selected by the `/7` above.
+        modrm_sib.emit(sink);
+        sink.emit(&[imm]); // imm8: index of the bit to complement.
+    });
+}
+
 fn emit_movsx_r_rm(buffer: &mut CodeBuffer<X64Fixup>, width: ExtWidth, dest: PhysReg, src: RegMem) {
     let (opcode, op_size): (&[u8], _) = match width {
         ExtWidth::Ext8_32 => (&[0xf, 0xbe], OperandSize::S32),
diff --git a/crates/codegen/src/target/x64/lower.rs b/crates/codegen/src/target/x64/lower.rs
@@ -12,7 +12,10 @@ use valmatch::match_value;
 use crate::{
     cfg::Block,
     isel::{IselContext, MachineIselError, ParamLoc},
-    lir::{DefOperand, PhysReg, PhysRegSet, RegClass, UseOperand, VirtReg},
+    lir::{
+        DefOperand, DefOperandConstraint, OperandPos, PhysReg, PhysRegSet, RegClass, UseOperand,
+        VirtReg,
+    },
     machine::MachineLower,
     num_utils::{is_sint, is_uint},
     target::x64::{CompoundCondCode, SseFpuBinOp, SseFpuCmpCode, SseFpuPrecision},
@@ -203,7 +206,7 @@ impl MachineLower for X64Machine {
             NodeKind::SintToFloat => emit_cvtsi2s(ctx, node),
             NodeKind::UintToFloat => select_uinttofloat(ctx, node),
             NodeKind::FloatToSint => emit_cvts2si(ctx, node),
-            NodeKind::FloatToUint => select_floattouint(ctx, node)?,
+            NodeKind::FloatToUint => select_floattouint(self, ctx, node)?,
             NodeKind::PtrOff => select_alu(ctx, node, AluBinOp::Add),
             &NodeKind::Load(mem_size) => select_load(ctx, node, mem_size),
             &NodeKind::Store(mem_size) => select_store(ctx, node, mem_size),
@@ -550,6 +553,7 @@ fn select_uinttofloat(ctx: &mut IselContext<'_, '_, X64Machine>, node: Node) {
 }
 
 fn select_floattouint(
+    machine: &X64Machine,
     ctx: &mut IselContext<'_, '_, X64Machine>,
     node: Node,
 ) -> Result<(), MachineIselError> {
@@ -569,7 +573,24 @@ fn select_floattouint(
                 &[UseOperand::any(input)],
             );
         }
-        Type::I64 => return Err(MachineIselError),
+        Type::I64 => {
+            if machine.config.internal_code_model == CodeModel::SmallPic {
+                let tmp_xmm1 = ctx.create_temp_vreg(RC_XMM);
+                let tmp_xmm2 = ctx.create_temp_vreg(RC_XMM);
+
+                ctx.emit_instr(
+                    X64Instr::PseudoFloatToUint64Rel(SseFpuPrecision::Double),
+                    &[
+                        DefOperand::any_reg(output),
+                        DefOperand::new(tmp_xmm1, DefOperandConstraint::AnyReg, OperandPos::Early),
+                        DefOperand::any_reg(tmp_xmm2),
+                    ],
+                    &[UseOperand::any_reg(input)],
+                );
+            } else {
+                return Err(MachineIselError);
+            }
+        }
         _ => unreachable!(),
     }
 
diff --git a/crates/filetests/cases/codegen/floattouint.spdr b/crates/filetests/cases/codegen/floattouint.spdr
@@ -12,3 +12,25 @@ func @floattouint32:i32(f64) {
     %2:i32 = floattouint %1
     return %0, %2
 }
+
+func @floattouint64:i64(f64) {
+    # check: function `floattouint64`:
+    # nextln: 000000: 55                              push rbp
+    # nextln: 000001: 48 89 e5                        mov rbp, rsp
+    # nextln: 000004: f2 0f 10 0d 00 00 00 00         movsd xmm1, qword ptr [rip]  # RELOC_PC32 -> @<CP> + -4
+    # nextln: 00000c: 66 0f 2e c1                     ucomisd xmm0, xmm1
+    # nextln: 000010: 0f 83 0a 00 00 00               jae 0x20
+    # nextln: 000016: f2 48 0f 2d c0                  cvtsd2si rax, xmm0
+    # nextln: 00001b: e9 0e 00 00 00                  jmp 0x2e
+    # nextln: 000020: f2 0f 5c c1                     subsd xmm0, xmm1
+    # nextln: 000024: f2 48 0f 2d c0                  cvtsd2si rax, xmm0
+    # nextln: 000029: 48 0f ba f8 3f                  btc rax, 0x3f
+    # nextln: 00002e: 5d                              pop rbp
+    # nextln: 00002f: c3                              ret
+    # nextln: <CP>:
+    # nextln: 000000: 00 00 00 00 00 00 e0 43
+
+    %0:ctrl, %1:f64 = entry
+    %2:i64 = floattouint %1
+    return %0, %2
+}
diff --git a/crates/filetests/cases/isel/floattouint.spdr b/crates/filetests/cases/isel/floattouint.spdr
@@ -10,3 +10,14 @@ func @floattouint32:i32(f64) {
     %2:i32 = floattouint %1
     return %0, %2
 }
+
+func @floattouint64:i64(f64) {
+    # check: function `floattouint64`:
+    # nextln:       block0[%1:xmm($$xmm0)]:
+    # nextln: 0000:      %0:gpr(reg)[late], %2:xmm(reg)[early], %3:xmm(reg)[late] = PseudoFloatToUint64Rel(Double) %1(reg)[early]
+    # nextln: 0001:      Ret %0($$rax)[early]
+
+    %0:ctrl, %1:f64 = entry
+    %2:i64 = floattouint %1
+    return %0, %2
+}