diff --git a/src/engine/Metrics.v3 b/src/engine/Metrics.v3 index d6c0982a1..64b4f361e 100644 --- a/src/engine/Metrics.v3 +++ b/src/engine/Metrics.v3 @@ -31,6 +31,13 @@ component Metrics { "Number of functions successfully compiled by single-pass compiler"); def spc_time_per_byte = r("spc:time_per_byte", spc_time_us, spc_in_bytes, "Ratio of compile time per input bytecode byte"); + def spc_static_calls = m("spc:static_calls", "calls", "Number of call instructions encountered by single-pass compiler"); + def spc_static_inlined_calls = m("spc:static_inlined_calls", "calls", "Number of direct call sites inlined by single-pass compiler"); + // XXX does not include inlined whamm probes, but does include inlining within a whamm inline + def spc_dynamic_calls = m("spc:dynamic_calls", "calls", "Number of call instructions executed at runtime"); + def spc_dynamic_inlined_calls = m("spc:dynamic_inlined_calls", "calls", "Number of inlined call sites executed at runtime"); + def spc_static_remat = m("spc:static_remat", "sites", "Number of sites where stack frame reconstruction is emitted"); + def spc_dynamic_remat = m("spc:dynamic_remat", "sites", "Number of stack frame reconstructions executed at runtime"); // Metrics from executing wasm code. 
def start_time_us = t("start:time_us", "Time taken to execute wasm module start function(s)."); diff --git a/src/engine/Tuning.v3 b/src/engine/Tuning.v3 index 9595ef8bb..56c1ab2c1 100644 --- a/src/engine/Tuning.v3 +++ b/src/engine/Tuning.v3 @@ -59,7 +59,10 @@ component SpcTuning { var intrinsifyOperandProbe = true; // recognize and optimize OperandProbes var compileWhammModules = true; // compile whamm module, if possible var intrinsifyWhammProbe = true; // recognize and optimize WhammProbes - var inlineSmallFunc = true; // inline small functions, currently only applicable for whamm probes + var inlineWhammProbe = true; // inline whamm probe functions + var maxInlineBytecodeSize = 100; // max bytecode size to inline + var maxInlineParams = 10; // max parameters to inline + var maxInlineDepth = 1; // max inlining nesting depth def probeCallFreesRegs = true; // probe calls frees registers in abstract state def runtimeCallFreesRegs = true; // runtime calls frees registers in abstract state var intrinsifyMemoryProbes = true; diff --git a/src/engine/compiler/CompilerOptions.v3 b/src/engine/compiler/CompilerOptions.v3 index 797310309..b1af562ed 100644 --- a/src/engine/compiler/CompilerOptions.v3 +++ b/src/engine/compiler/CompilerOptions.v3 @@ -27,8 +27,14 @@ component CompilerOptions { .onSet(fun v => void(SpcTuning.intrinsifyMemoryProbes = v)); group.newBoolOption("compile-whamm-modules", true, "Compile whamm monitor modules before execution.") .onSet(fun v => void(SpcTuning.compileWhammModules = v)); - group.newBoolOption("inline-whamm-probes", true, "Inline small whamm probes.") - .onSet(fun v => void(SpcTuning.inlineSmallFunc = v)); + group.newBoolOption("inline-whamm-probes", true, "Inline whamm probes.") + .onSet(fun v => void(SpcTuning.inlineWhammProbe = v)); + group.newIntOption("inline-max-bytecode-size", SpcTuning.maxInlineBytecodeSize, "Maximum number of bytecodes in a function that can be inlined.") + .onSet(fun v => void(SpcTuning.maxInlineBytecodeSize = 
v)); + group.newIntOption("inline-max-params", SpcTuning.maxInlineParams, "Maximum number of parameters of a function that can be inlined.") + .onSet(fun v => void(SpcTuning.maxInlineParams = v)); + group.newIntOption("inline-max-depth", SpcTuning.maxInlineDepth, "Maximum inlining nesting depth.") + .onSet(fun v => void(SpcTuning.maxInlineDepth = v)); } def printHelp(out: TraceBuilder) { diff --git a/src/engine/compiler/MacroAssembler.v3 b/src/engine/compiler/MacroAssembler.v3 index d3868cf1e..7a39cb971 100644 --- a/src/engine/compiler/MacroAssembler.v3 +++ b/src/engine/compiler/MacroAssembler.v3 @@ -21,8 +21,11 @@ class MasmLabel(create_pos: int) { class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { var unimplemented: void -> void; // function to call for unimplemented bytecodes var trap_labels: Array; // maps TrapReason to a label - var source_locs: Vector<(int, int)>; // list of (offset, source_loc) pairs + var source_locs: Vector<(int, List<(int, int)>)>; // list of (offset, [(fid, pc)]) + var inline_ctx: List<(int, int)>; // list of (fid, pc) var source_loc: int = -1; // current source location, if any + var current_fid: int = -1; // current function id, if any + var newTrapLabel: TrapReason -> MasmLabel; var embeddedRefOffsets: Vector; var offsets = Target.getOffsets(); @@ -80,7 +83,15 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { def recordSourceLoc(offset: int) { if (source_loc < 0) return; if (source_locs == null) source_locs = Vector.new(); - source_locs.put(offset, source_loc); + + source_locs.put(offset, List.new((current_fid, source_loc), inline_ctx)); + } + def pushInlineContext(fid: int) { + inline_ctx = List.new((fid, source_loc), inline_ctx); + } + def popInlineContext() { + if (inline_ctx != null) // TODO(review): confirm null suffices as the empty-context sentinel, or whether a more specific check is needed 
+ inline_ctx = inline_ctx.tail; } def at(src: int) -> this { source_loc = src; @@ -336,6 +347,7 @@ class MacroAssembler(valuerep: Tagging, regConfig: RegConfig) { def emit_call_runtime_Probe_instr(); def emit_call_runtime_getFrameAccessorMetaRef(); def emit_increment_CountProbe(tmp: Reg, probe: CountProbe, increment: u64); + def emit_inc_metric(metric: Metric); def emit_call_OperandProbe_i_v_fire(probe: OperandProbe_i_v, value_reg: Reg); def emit_call_MemoryReadProbe_fire(probe: MemoryReadProbe); def emit_call_runtime_cast(); diff --git a/src/engine/compiler/SinglePassCompiler.v3 b/src/engine/compiler/SinglePassCompiler.v3 index 0b5dc2dc0..0e33e1786 100644 --- a/src/engine/compiler/SinglePassCompiler.v3 +++ b/src/engine/compiler/SinglePassCompiler.v3 @@ -1,6 +1,17 @@ // Copyright 2022 Ben L. Titzer. All rights reserved. // See LICENSE for details of Apache 2.0 license. +// Inlining TODO +// - reuse tail call logic in abstract stack move +// - inline intra-module calls only +// - revert change that always swaps membase and instance in whamm +// - whamm, keep track of swapping instance/membase as part of SpcFrame +// - FIXME whamm bugs, wrong value created in abstract state +// - benchmark inlining improvements +// - handle OSR/dynamic tier-up/tier down? +// - pre-bump rsp +// - compilation hints proposal + // Describes the register and frame configuration for the single-pass compiler. class SpcExecEnv { // Frame information. @@ -78,7 +89,6 @@ def KIND_REF_U64 = SpcConsts.KIND_REF_U64; // Compiles Wasm bytecode to machine code in a single pass via a MacroAssembler. 
class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAlloc, extensions: Extension.set, limits: Limits) extends BytecodeVisitor { - var it = BytecodeIterator.new(); def instrTracer = if(Trace.compiler, InstrTracer.new()); def config = masm.regConfig; def regs = xenv; @@ -93,10 +103,17 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Abstract state of the value stack def state = SpcState.new(regAlloc); // Other state - def trap_labels = Vector<(TrapReason, MasmLabel)>.new(); + def trap_labels = Vector<(TrapReason, MasmLabel, Array)>.new(); + + def it = BytecodeIterator.new(); + + // Frame state (refers to the fields in the top SpcFrame in SpcState) var module: Module; var func: FuncDecl; var sig: SigDecl; + var num_locals: int; + var local_base_sp: u31; // TODO could use a Range so that all offsets are from 0 again + var success = true; var osr_pc: int; var osr_offset: int; @@ -106,14 +123,11 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var ret_label: MasmLabel; var last_probe = 0; var skip_to_end: bool; + var is_whamm_inlined: bool; // compiling an inlined Whamm probe? + var whamm_probe_ctl_base: u31; // ctl_stack.top when Whamm probe compilation started // XXX: hack var handler_dest_info = Vector.new(); - // when function is inlined, we continue using caller's abstract state, and - // push callee's params/locals as needed, thus we need to track the base sp of the locals - // in the current context. - var local_base_sp: u31 = 0; - var is_inlined = false; // tracks the last masm writer offset to generate instruction trace for each bytecode. 
var codegen_offset: u64 = 0; @@ -124,6 +138,54 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.newTrapLabel = newTrapLabel; // trap labels are per-pc } + // Manage inlined frames stack + def pushFrame(frame: SpcFrame) { + if (func != null) + masm.pushInlineContext(func.func_index); + + def current = state.frame_stack.peek(); + if (current != null) + current.pc = it.pc; + state.frame_stack.push(frame); + // Update cached copies from new top frame + it.reset(frame.func).at(frame.pc, -1); + module = frame.module; + func = frame.func; + sig = frame.sig; + num_locals = frame.num_locals; + local_base_sp = frame.local_base_sp; + } + def popFrame() -> SpcFrame { + masm.popInlineContext(); + + var frame = state.frame_stack.pop(); + // Restore cached copies from new top frame + var current = state.frame_stack.peek(); + it.reset(current.func).at(current.pc, -1); + it.current(); // position codeptr.pos past the instruction at current.pc so it.next() advances correctly + module = current.module; + func = current.func; + sig = current.sig; + num_locals = current.num_locals; + local_base_sp = current.local_base_sp; + return frame; + } + def isInlined() -> bool { + return state.frame_stack.top > 1; + } + def inlineDepth() -> int { + return state.frame_stack.top - 1; + } + def snapshotFrames() -> Array { + var frames = Array.new(state.frame_stack.top); + for (i < state.frame_stack.top) { + var f = state.frame_stack.elems[i]; + var pc = if(i == state.frame_stack.top - 1, it.pc, f.pc); + frames[i] = SpcFrame.new(f.func, f.sig, f.module, f.local_base_sp, f.ctl_base_sp, f.num_locals, pc); + } + return frames; + } + def gen(module: Module, func: FuncDecl, err: ErrorGen) -> bool { this.osr_pc = -1; this.err = err; @@ -141,10 +203,6 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var before_data_bytes = masm.curDataBytes(); // Reset internal state. 
- this.module = module; - this.func = func; - it.reset(func); - sig = func.sig; regAlloc.clear(); trap_labels.resize(0); success = true; @@ -153,10 +211,22 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl handler_dest_info.clear(); handler_dest_info.resize(func.handlers.handler_dests.length); + // Reset frame state + this.module = null; + this.func = null; + this.sig = null; + this.num_locals = 0; + this.local_base_sp = 0; + // Initialize parameters, locals, and first control stack entry. var end_label = masm.newLabel(func.cur_bytecode.length); - state.reset(sig, end_label); - state.num_locals = func.num_slots(); + state.reset(func.sig, end_label); + + // Push initial frame for top-level function + state.frame_stack.clear(); + var initial_frame = SpcFrame.new(func, func.sig, module, 0, 0, func.num_slots(), 0); + + pushFrame(initial_frame); // Emit prologue, which allocates the frame and initializes various registers. emitPrologue(); @@ -179,6 +249,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (Trace.compiler) traceOpcodeAndStack(false); last_probe = 0; masm.source_loc = it.pc; + masm.current_fid = func.func_index; it.dispatch(this); unrefRegs(); if (Trace.compiler && Trace.asm) { @@ -195,9 +266,22 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Emit trap labels. for (i < trap_labels.length) { var e = trap_labels[i]; - masm.bindLabel(e.1); - masm.emit_mov_m_i(xenv.pc_slot, e.1.create_pos); - masm.emit_jump_to_trap_at(e.0); + var reason = e.0; + var label = e.1; + var frames = e.2; + + masm.bindLabel(label); + + // If trap occurred in inlined function, reconstruct frames + if (frames.length > 1) { + unrefRegs(); + emitReconstructStackFrames(frames); + } else { + // otherwise, put pc as before + masm.emit_mov_m_i(xenv.pc_slot, label.create_pos); + } + + masm.emit_jump_to_trap_at(reason); } // Emit handler stubs. 
@@ -337,7 +421,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // Clear FrameAccessor masm.emit_mov_m_l(frame.accessor_slot, 0); // XXX: value kind // Clear inlined whamm instance - if (SpcTuning.inlineSmallFunc && SpcTuning.intrinsifyWhammProbe) { + if (SpcTuning.inlineWhammProbe && SpcTuning.intrinsifyWhammProbe) { masm.emit_mov_m_l(frame.inlined_instance_slot, 0); } @@ -377,7 +461,22 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (last_probe == 0) return; var probe = Instrumentation.getLocalProbe(module, func.func_index, last_probe); last_probe = 0; + + // Reconstruct inlined frames before emitting probe (if needed) + var reconstructed_space = 0; + if (isInlined()) { + var frames = snapshotFrames(); + unrefRegs(); + reconstructed_space = emitReconstructStackFrames(frames); + } + emitProbe0(it.pc, probe); + + // Clean up reconstructed frames after the probe returns + if (reconstructed_space > 0) { + masm.emit_addw_r_i(regs.sp, reconstructed_space); + } + if (Trace.compiler) traceOpcodeAndStack(true); } def emitProbe0(pc: int, probe: Probe) { @@ -460,42 +559,192 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (!spillMode.free_regs) state.emitRestoreAll(resolver); } + // Emit code for an inlined regular function call + def emitInlinedCall(callee_func: FuncDecl, whamm: WhammProbe) { + var sig = callee_func.sig; + var params_count = u32.view(sig.params.length); + var results_count = u32.view(sig.results.length); + var orig_sp = state.sp; + + // Arguments are already on stack + // Stack: [..., arg0, arg1, ..., argN] <- sp + // We want callee's local 0 = arg0, so: + var new_local_base_sp: u31 = u31.view(orig_sp - params_count); + var new_ctl_base_sp = u31.view(state.ctl_stack.top); + + var num_locals = callee_func.num_slots(); + + // Push an implicit block for the head of the function + var end_label = masm.newLabel(callee_func.cur_bytecode.length); + 
state.pushBlock(sig.params, sig.results, end_label); + + var m: Module = module; + + // Whamm probe configuration + if (whamm != null) { + def whamm_sig = whamm.sig; + def whamm_wf = WasmFunction.!(whamm.func); + def whamm_instance = whamm_wf.instance; + def whamm_func_decl = whamm_wf.decl; + + m = whamm_instance.module; + new_local_base_sp = u31.view(state.sp) - u31.view(whamm_sig.length); // XXX + } + + // Create and push frame for inlined function + var callee_frame = SpcFrame.new(callee_func, sig, // XXX correct for whamm probes? + m, new_local_base_sp, new_ctl_base_sp, num_locals, 0); + + pushFrame(callee_frame); + + // Emit function entry probe, if any. + // XXX expensive because frame materialization required + if (whamm == null && !FeatureDisable.entryProbes && func.entry_probed) { + var probe = Instrumentation.getLocalProbe(module, callee_func.func_index, 0); + + // Reconstruct inlined frames before emitting probe + var reconstructed_space = 0; + if (isInlined()) { + var frames = snapshotFrames(); + unrefRegs(); + reconstructed_space = emitReconstructStackFrames(frames); + } + emitProbe0(0, probe); + // Clean up reconstructed frames after the call returns + if (reconstructed_space > 0) { + masm.emit_addw_r_i(regs.sp, reconstructed_space); + } + } + + // Allocate callee's non-parameter locals + it.dispatchLocalDecls(this); + + // Compile callee's bytecode + if (Trace.compiler) Trace.OUT.puts(" Start inlined function body").ln(); + while (it.more() && success) { + if (Trace.compiler) traceOpcodeAndStack(false); + last_probe = 0; + masm.source_loc = it.pc; + masm.current_fid = func.func_index; + it.dispatch(this); + if (Trace.compiler && Trace.asm) { + OUT.puts("JIT code: "); + masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); + codegen_offset = masm.curCodeBytes(); + OUT.ln(); + } + unrefRegs(); + if (Debug.compiler) checkRegAlloc(); + it.next(); + if (skip_to_end) doSkipToEndOfBlock(); + } + if (Trace.compiler) Trace.OUT.puts(" End inlined 
function body").ln(); + + // Check if the inlined function is unreachable (e.g., ended with UNREACHABLE, RETURN, THROW) + var inlined_reachable = state.ctl_stack.peek().reachable; + + // Restore caller context by popping frame + popFrame(); // Automatically restores cached fields + + // Note: Control stack cleanup (popping implicit BLOCK) is handled by visit_END + + // If inlined function is unreachable, no results to clean up + if (!inlined_reachable) { + if (Trace.compiler) { + Trace.OUT.puts(" Inlined function unreachable, skipping result cleanup").ln(); + Trace.OUT.put3(" state.sp=%d, new_local_base_sp=%d, callee_slots=%d", + state.sp, new_local_base_sp, state.sp - new_local_base_sp).ln(); + } + // Drop all callee state (params + locals, no results) + var callee_slots = state.sp - new_local_base_sp; + if (callee_slots > 0) dropN(u32.view(callee_slots)); + if (Trace.compiler) Trace.OUT.put1(" After dropN: state.sp=%d", state.sp).ln(); + setUnreachable(); + return; + } + + // Clean up stack: + // Before: [..., arg0, arg1, ..., argN, local0, local1, ..., localM, result0, ..., resultK] + // After: [..., result0, ..., resultK] + + var total_callee_slots = state.sp - new_local_base_sp; // All callee state + var slots_to_drop = total_callee_slots - results_count; + + // for whamm probes, results_count SHOULD be zero + if (slots_to_drop > 0 && results_count > 0) { + // Need to move results down over parameters and locals + for (i < results_count) { + var result_slot = state.sp - results_count + u32.view(i); + var target_slot = new_local_base_sp + u32.view(i); + if (Trace.compiler) { + Trace.OUT.put3(" Moving result %d: slot %d -> slot %d", i, result_slot, target_slot).ln(); + } + if (result_slot != target_slot) { + var rv = state.state[result_slot]; + if (Trace.compiler) { + Trace.OUT.put2(" rv: flags=%x, const=%d", rv.flags, rv.const).ln(); + } + if (rv.inReg()) { + regAlloc.reassign(rv.reg, int.!(result_slot), int.!(target_slot)); + } else { + // Move in memory 
(rarely needed if results are in regs) + resolver.addMove((target_slot, rv), (result_slot, rv)); + } + state.state[target_slot] = rv; + } else { + // Result already in the right place + if (Trace.compiler) Trace.OUT.puts(" (already in place)").ln(); + } + } + resolver.emitMoves(); + + // Drop everything above results + for (slot = new_local_base_sp + results_count; slot < state.sp; slot++) { + unrefSlot(slot); + } + state.sp = new_local_base_sp + results_count; + } else if (slots_to_drop > 0) { + // No results, just drop everything + if (Trace.compiler) Trace.OUT.put1("dropping %d slots\n", slots_to_drop); + dropN(u32.view(slots_to_drop)); + } + // If slots_to_drop <= 0, results are already in the right place + + if (Trace.compiler) { + Trace.OUT.put1(" Inlined call complete, sp=%d", state.sp).ln(); + } + } + // saves the overhead of using a runtime call by directly invoking the wasm function associated with the whamm probe def emitWhammProbe(probe: WhammProbe) { // set up args and push to frame slots. 
var whamm_sig = probe.sig; var inline_config = InlineConfig(false, false, false); - var new_local_base_sp = 0; var orig_sp = state.sp; var callee_func = WasmFunction.!(probe.func); - if (SpcTuning.inlineSmallFunc) { - inline_config = InlineConfig(probe.spc_swap_membase, probe.spc_swap_instance, probe.spc_inline_func); + if (SpcTuning.inlineWhammProbe) { + inline_config = InlineConfig(false, false, probe.spc_inline_func); if (!probe.inline_heuristic_checked) { inline_config = funcCanInline(callee_func.decl); probe.inline_heuristic_checked = true; - probe.spc_swap_instance = inline_config.swap_instance; - probe.spc_swap_membase = inline_config.swap_membase; probe.spc_inline_func = inline_config.can_inline; } - if (inline_config.swap_instance) { // push whamm instance onto abstract stack directly - masm.emit_mov_r_Instance(regs.scratch, callee_func.instance); - masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, regs.scratch); - } + // FIXME Always swap instance into inlined_instance_slot + masm.emit_mov_r_Instance(regs.scratch, callee_func.instance); + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, regs.scratch); - // overwrite mem0_base with whamm instance's memory base, restore from frame slot later - if (inline_config.swap_membase) { + // FIXME Always swap membase into inlined_mem0_base_slot + if (callee_func.instance.memories.length > 0) { var membase = callee_func.instance.memories[0].getMemBase64(); masm.emit_mov_r_l(regs.mem0_base, i64.view(membase)); - masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base); } + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, regs.mem0_base); } if (!inline_config.can_inline) { state.emitSaveAll(resolver, probeSpillMode); - } else { - new_local_base_sp = int.view(state.sp); } for (i < whamm_sig.length) {
visit_LOCAL_GET(u31.view(index)); } else { - masm.emit_mov_m_m(ValueKind.REF, slot_addr, masm.slotAddr(index)); + masm.emit_mov_m_m(state.state[index].kind(), slot_addr, masm.slotAddr(index)); } kind = state.state[index].kind().code; } @@ -633,48 +882,13 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl var whamm_module = whamm_instance.module; var whamm_func_decl = callee_func.decl; if (inline_config.can_inline) { - var prev_it = it; - it = BytecodeIterator.new().reset(whamm_func_decl); - var orig_module = module; - - // prepare spc for inlining - this.local_base_sp = u31.view(new_local_base_sp); - this.module = whamm_module; - this.func = whamm_func_decl; - this.sig = whamm_func_decl.sig; - - // inline codegen - it.dispatchLocalDecls(this); - this.is_inlined = true; - if (Trace.compiler) Trace.OUT.puts("Start compiling inlined whamm probe").ln(); - while (it.more() && success) { - if (Trace.compiler) traceOpcodeAndStack(false); - last_probe = 0; - masm.source_loc = it.pc; - it.dispatch(this); - if (Trace.compiler && Trace.asm) { - OUT.puts("JIT code: "); - masm.printCodeBytes(OUT, codegen_offset, masm.curCodeBytes()); - codegen_offset = masm.curCodeBytes(); - OUT.ln(); - } - unrefRegs(); - if (Debug.compiler) checkRegAlloc(); - it.next(); + is_whamm_inlined = true; + emitInlinedCall(whamm_func_decl, probe); + is_whamm_inlined = false; + // Restore mem0_base after probe + if (module.memories.length > 0) { + masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); } - if (Trace.compiler) Trace.OUT.puts("Finished compiling inlined whamm probe").ln(); - - // restore spc after inlining - it = prev_it; - this.local_base_sp = 0; - this.is_inlined = false; - this.module = orig_module; - this.func = it.func; - this.sig = it.func.sig; - masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); - - // clear callee params/locals from abstract state - dropN(state.sp - orig_sp); } else { var vsp_reg = 
allocTmpFixed(ValueKind.REF, regs.vsp); var func_reg = allocTmpFixed(ValueKind.REF, regs.func_arg); @@ -770,37 +984,38 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl setUnreachable(); } def visit_END() { - if (!this.is_inlined) { - var ctl_top = state.ctl_stack.peek(); - if (ctl_top.opcode == Opcode.LOOP.code) { - state.ctl_stack.pop(); - if (!ctl_top.reachable) setUnreachable(); - } else if (ctl_top.opcode == Opcode.IF.code) { - // simulate empty if-true block - state.emitFallthru(resolver); - masm.emit_br(ctl_top.label); - masm.bindLabel(ctl_top.else_label); - state.doElse(); - ctl_top.opcode = Opcode.ELSE.code; - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - state.ctl_stack.pop(); - } else if (ctl_top.opcode == Opcode.RETURN.code) { - state.emitFallthru(resolver); - masm.bindLabel(ctl_top.label); - state.resetToMerge(ctl_top); - emitProbe(); - if (ctl_top.merge_count > 1) emitReturn(ctl_top); - state.ctl_stack.pop(); - } + var frame = state.frame_stack.peek(); + var is_implicit_function_block = isInlined() && state.ctl_stack.top == frame.ctl_base_sp + 1; + + var ctl_top = state.ctl_stack.peek(); + if (ctl_top.opcode == Opcode.LOOP.code) { + state.ctl_stack.pop(); + if (!ctl_top.reachable) setUnreachable(); + } else if (ctl_top.opcode == Opcode.IF.code) { + // simulate empty if-true block + state.emitFallthru(resolver); + masm.emit_br(ctl_top.label); + masm.bindLabel(ctl_top.else_label); + state.doElse(); + ctl_top.opcode = Opcode.ELSE.code; + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.BLOCK.code || ctl_top.opcode == Opcode.ELSE.code) { + 
state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); + state.ctl_stack.pop(); + } else if (ctl_top.opcode == Opcode.RETURN.code) { + state.emitFallthru(resolver); + masm.bindLabel(ctl_top.label); + state.resetToMerge(ctl_top); emitProbe(); + if (ctl_top.merge_count > 1) emitReturn(ctl_top); + state.ctl_stack.pop(); } + emitProbe(); } def visit_BR(depth: u31) { var target = state.getControl(depth); @@ -837,6 +1052,29 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } def visitCallDirect(op: Opcode, index: u31, tailCall: bool) { var func = module.functions[index]; + if (op == Opcode.CALL) Metrics.spc_static_calls.val++; + + // Try inlining for intra-module, non-tail calls + if (!tailCall && shouldInline(this, func)) { + if (Trace.compiler) Trace.OUT.put2("Inlining call to func #%d (%d bytes)", index, func.orig_bytecode.length).ln(); + if (op == Opcode.CALL) { + Metrics.spc_static_inlined_calls.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_inlined_calls); + masm.emit_inc_metric(Metrics.spc_dynamic_calls); + } + emitInlinedCall(func, null); + return; + } + if (op == Opcode.CALL) masm.emit_inc_metric(Metrics.spc_dynamic_calls); + // Reconstruct inlined frames before the call, skip whamm probes + var reconstructed_space = 0; + if (isInlined() && !is_whamm_inlined) { + var frames = snapshotFrames(); + unrefRegs(); + reconstructed_space = emitReconstructStackFrames(frames); + } + + // Existing non-inlined call path var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); // Load the instance (which must happen before frame is unwound). 
var vsp_reg = allocTmpFixed(ValueKind.REF, regs.vsp); @@ -849,6 +1087,12 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_v3_Array_elem_r_ri(ValueKind.REF, func_reg, func_reg, func.func_index); emitCallToReg(func.sig, func_reg, vsp_reg, tmp, func.imp != null, tailCall); + + // Clean up reconstructed frames after the call returns + if (reconstructed_space > 0) { + masm.emit_addw_r_i(regs.sp, reconstructed_space); + } + } def emitCallToReg(sig: SigDecl, func_reg: Reg, vsp_reg: Reg, tmp: Reg, checkHostCall: bool, tailCall: bool) { var retpt = masm.newLabel(it.pc), wasmcall_label = masm.newLabel(it.pc); @@ -856,6 +1100,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl if (tailCall) emitMoveTailCallArgs(sig); // transfer tail call args else state.emitSaveAll(resolver, SpillMode.SAVE_AND_FREE_REGS); // spill entire value stack + // Compute the value stack pointer. emit_compute_vsp(vsp_reg, state.sp); if (checkHostCall) { @@ -878,12 +1123,16 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_jump_r(tmp); setUnreachable(); } else { + masm.emit_call_r(tmp); masm.bindLabel(retpt); + emit_reload_regs(); state.popArgsAndPushResults(sig); } } + // TODO this is basically the same logic as "returning" from an inlined function definition + // - don't adjust frame def emitMoveTailCallArgs(sig: SigDecl) { var p = sig.params, count = u32.!(p.length); var base = state.sp - count; @@ -1913,12 +2162,21 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + // XXX expensive operation: + // - either SPC needs to be refactored so that runtime calls can be avoided, or + // - do a lazy reconstruction only if it traps (where?) 
+ var inlined_space = 0; + if (canTrap && isInlined()) + inlined_space = emitReconstructStackFrames(snapshotFrames()); masm.emit_get_curstack(regs.runtime_arg0); masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); emit_load_instance(regs.runtime_arg1); masm.emit_mov_r_i(regs.runtime_arg2, arg1); masm.emit_call_runtime_op(op); + if (inlined_space > 0) + masm.emit_addw_r_i(regs.sp, inlined_space); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -1930,6 +2188,10 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl state.emitSaveAll(resolver, runtimeSpillMode); emit_compute_vsp(regs.vsp, state.sp); masm.emit_store_curstack_vsp(regs.vsp); + // Reconstruct stack frames across runtime calls that might (Wasm-level) trap. + var inlined_space = 0; + if (canTrap && isInlined()) + inlined_space = emitReconstructStackFrames(snapshotFrames()); masm.emit_get_curstack(regs.runtime_arg0); masm.emit_v3_set_X86_64Stack_rsp_r_r(regs.runtime_arg0, regs.sp); masm.emit_push_X86_64Stack_rsp_r_r(regs.runtime_arg0); @@ -1937,6 +2199,8 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_mov_r_i(regs.runtime_arg2, arg1); masm.emit_mov_r_i(regs.runtime_arg3, arg2); masm.emit_call_runtime_op(op); + if (inlined_space > 0) + masm.emit_addw_r_i(regs.sp, inlined_space); masm.emit_get_curstack(regs.scratch); masm.emit_pop_X86_64Stack_rsp_r_r(regs.scratch); dropN(args); @@ -2054,9 +2318,92 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl masm.emit_addw_r_i(regs.sp, frame.frameSize); masm.emit_ret(); } + def getTrapStubIp(reason: TrapReason) -> long; + def getSpcInlinedFrameIp() -> long; + // Emit code to materialize stack frames for each inlined function. + // The frames array is in the same order as frame_stack: outermost first, innermost last. 
+ def emitReconstructStackFrames(frames: Array) -> int { + Metrics.spc_static_remat.val++; + masm.emit_inc_metric(Metrics.spc_dynamic_remat); + def real_frame = frames[0]; + masm.emit_mov_m_i(xenv.pc_slot, real_frame.pc); + + // Use inlined frame stub IP as return address for all reconstructed frames + var return_addr = getSpcInlinedFrameIp(); + var total_space = 0; + + // load instance + var inst_reg = allocTmp(ValueKind.REF); + //emit_load_instance(inst_reg); + masm.emit_mov_r_m(ValueKind.REF, inst_reg, frame.instance_slot); + var mem_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, mem_reg, frame.mem0_base_slot); + // Load instance.functions + def func_reg = allocTmp(ValueKind.REF); + masm.emit_v3_Instance_functions_r_r(func_reg, inst_reg); + // use same vfp for all frames + def vfp_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, vfp_reg, frame.vfp_slot); + var wasm_func_reg = allocTmp(ValueKind.REF); + + var inl_inst_reg: Reg, inl_mem0_reg: Reg; + if (is_whamm_inlined) { + inl_inst_reg = allocTmp(ValueKind.REF); + inl_mem0_reg = allocTmp(ValueKind.REF); + masm.emit_mov_r_m(ValueKind.REF, inl_inst_reg, frame.inlined_instance_slot); + masm.emit_mov_r_m(ValueKind.REF, inl_mem0_reg, frame.inlined_mem0_base_slot); + } + + // Process the inlined frames (skip the outermost which already exists on native stack) + for (i = 1; i < frames.length; i++) { + var frame_info = frames[i]; + + // Push inlined frame stub IP as return address + masm.emit_subw_r_i(regs.sp, 8); + masm.emit_mov_m_l(MasmAddr(regs.sp, 0), return_addr); + total_space += 8; + + // Allocate concrete stack frame for inlined function + masm.emit_subw_r_i(regs.sp, frame.frameSize); + total_space += frame.frameSize; + + // get functions[func_index] and save into frame + masm.emit_v3_Array_elem_r_ri(ValueKind.REF, wasm_func_reg, func_reg, frame_info.func.func_index); + masm.emit_mov_m_r(ValueKind.REF, frame.wasm_func_slot, wasm_func_reg); + + // Save instance to 
frame.instance_slot + masm.emit_mov_m_r(ValueKind.REF, frame.instance_slot, inst_reg); + + // Save mem0 base + masm.emit_mov_m_r(ValueKind.REF, frame.mem0_base_slot, mem_reg); + + // use same vfp for all frames + // TODO different vfps? + masm.emit_mov_m_r(ValueKind.REF, frame.vfp_slot, vfp_reg); + + // Save PC to frame.pc_slot + masm.emit_mov_m_i(frame.pc_slot, frame_info.pc); // exact pc of each frame + + // Clear FrameAccessor and inlined_instance_slot + masm.emit_mov_m_l(frame.accessor_slot, 0); + + // if an inlined whamm probe, also grab inlined slots + if (is_whamm_inlined) { + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_instance_slot, inl_inst_reg); + masm.emit_mov_m_r(ValueKind.REF, frame.inlined_mem0_base_slot, inl_mem0_reg); + } else { + masm.emit_mov_m_l(frame.inlined_instance_slot, 0); + masm.emit_mov_m_l(frame.inlined_mem0_base_slot, 0); + } + } + + return total_space; + } def newTrapLabel(reason: TrapReason) -> MasmLabel { var label = masm.newLabel(it.pc); - trap_labels.put(reason, label); + // Snapshot frame_stack for trap reconstruction + var frames = snapshotFrames(); + trap_labels.put((reason, label, frames)); return label; } def unsupported() { @@ -2153,7 +2500,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl // XXX: recompute VFP from VSP - #slots? 
masm.emit_mov_r_m(ValueKind.REF, regs.vfp, frame.vfp_slot); if (module.memories.length > 0) { - if (is_inlined) { + if (is_whamm_inlined) { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.inlined_mem0_base_slot); } else { masm.emit_mov_r_m(ValueKind.REF, regs.mem0_base, frame.mem0_base_slot); @@ -2161,7 +2508,7 @@ class SinglePassCompiler(xenv: SpcExecEnv, masm: MacroAssembler, regAlloc: RegAl } } def emit_load_instance(reg: Reg) { - if (is_inlined) { // inline compilation + if (is_whamm_inlined) { masm.emit_mov_r_m(ValueKind.REF, reg, frame.inlined_instance_slot); } else { masm.emit_mov_r_m(ValueKind.REF, reg, frame.instance_slot); @@ -2590,6 +2937,19 @@ type SpcVal(flags: byte, reg: Reg, const: int) #unboxed { } } +// A frame in the compilation frame stack (for inlining). +class SpcFrame { + var func: FuncDecl; + var sig: SigDecl; + var module: Module; + var local_base_sp: u31; // Base index into SpcState.state + var ctl_base_sp: u31; // Base index into SpcState.ctl_stack + var num_locals: int; + var pc: int; // XXX pc only initialized during snapshots + + new(func, sig, module, local_base_sp, ctl_base_sp, num_locals, pc) {} +} + // An entry in the abstract control stack. class SpcControl { var opcode: u16; @@ -2626,12 +2986,15 @@ def isNotZero = int.!=(0, _); def trueToOne(z: bool) -> int { return if(z, 1, 0); } // Contains both the abstract control and abstract value stack. +// TODO keep track of which instance state is cached? (cache immutable things, globals) +// - clear cache on snapshot class SpcState(regAlloc: RegAlloc) { - // Abstract state of the value stack + // Abstract state of the value stack (shared across frames) var state = Array.new(INITIAL_VALUE_STACK_SIZE); var sp: u32; var ctl_stack = ArrayStack.new(); - var num_locals: int; + // Frame management (for inlining) + var frame_stack = ArrayStack.new(); // Reset the state for starting a new function. 
def reset(sig: SigDecl, ret_label: MasmLabel) { @@ -2680,12 +3043,14 @@ class SpcState(regAlloc: RegAlloc) { target.merge_count = 1; var valcount = u32.view(target.params.length); var popcount = 0u; + var num_locals = frame_stack.peek().num_locals; target.merge_state = doFirstTransferAndGetMerge(valcount, popcount, resolver); resolver.emitMoves(); - if (Trace.compiler) trace0(" loop merge = ", target.merge_state, target.merge_state.length); + if (Trace.compiler) trace0(" loop merge = ", target.merge_state, target.merge_state.length, num_locals); resetTo(sp, target.merge_state); } def doFirstTransferAndGetMerge(valcount: u32, popcount: u32, resolver: SpcMoveResolver) -> Array { + var num_locals = frame_stack.peek().num_locals; return SpcMerger.new(state, num_locals, sp, regAlloc, resolver).createMerge(valcount, popcount); } // Get a merge that contains the current stack up to {val_stack_top} plus {args}, all stored in memory. @@ -2751,8 +3116,9 @@ class SpcState(regAlloc: RegAlloc) { if (target.merge_count == 0) { target.merge_count = 1; var popcount = sp - valcount - target.val_stack_top; + var num_locals = frame_stack.peek().num_locals; target.merge_state = doFirstTransferAndGetMerge(valcount, popcount, resolver); - if (Trace.compiler) trace0(" merge = ", target.merge_state, target.merge_state.length); + if (Trace.compiler) trace0(" merge = ", target.merge_state, target.merge_state.length, num_locals); } else { target.merge_count = 2; // XXX: allow matching constants in merges @@ -2889,14 +3255,17 @@ class SpcState(regAlloc: RegAlloc) { state = Arrays.grow(state, nlength); } def trace() { - trace0("", state, int.!(sp)); + var num_locals = frame_stack.peek().num_locals; + trace0("", state, int.!(sp), num_locals); } - def trace0(str: string, vals: Array, sp: int) { + def trace0(str: string, vals: Array, sp: int, num_locals: int) { OUT.puts(str); OUT.puts("|"); - for (i < num_locals) vals[i].renderTrace(OUT, regAlloc.poolMap.regSet); + // Don't assume vals.length >= 
num_locals (e.g., for implicit function blocks in inlined functions) + var trace_limit = if(num_locals < vals.length, num_locals, vals.length); + for (i < trace_limit) vals[i].renderTrace(OUT, regAlloc.poolMap.regSet); OUT.puts(" |"); - for (i = num_locals; i < sp; i++) vals[i].renderTrace(OUT, regAlloc.poolMap.regSet); + for (i = num_locals; i < sp && i < vals.length; i++) vals[i].renderTrace(OUT, regAlloc.poolMap.regSet); OUT.ln(); } } @@ -3168,11 +3537,45 @@ class MoveNode { var dstNext: MoveNode; // next in a list of successors } +// Determine if a regular function call should be inlined +def shouldInline(spc: SinglePassCompiler, func: FuncDecl) -> bool { + if (Trace.compiler) OUT.put1("deciding on inlining call to func #%d: ", func.func_index); + + // Don't inline imported functions + if (func.imp != null) { + if (Trace.compiler) OUT.puts("NO (imported)\n"); + return false; + } + + // Don't inline if we're already inlining (prevent unbounded recursion) + if (spc.inlineDepth() >= SpcTuning.maxInlineDepth) { + if (Trace.compiler) OUT.puts("NO (max inline depth exceeded)\n"); + return false; + } + + // Don't inline recursive calls (detect caller == callee) + // TODO enable inlining recursively + if (func == spc.func) { + if (Trace.compiler) OUT.puts("NO (recursive)\n"); + return false; + } + + // Check general inlining heuristic + var config = regularFuncCanInline(func); + if (!config.can_inline) { + if (Trace.compiler) OUT.puts("NO (failed heuristic)\n"); + return false; + } + + if (Trace.compiler) OUT.puts("YES\n"); + return true; +} + // checks function bytecode to see if it can be inlined based on // simple heuristics: length <= 50 and straightline code. 
def funcCanInline(decl: FuncDecl) -> InlineConfig { var default = InlineConfig(false, false, false); - if (decl.orig_bytecode.length > 50 || decl.sig.params.length > 10) return default; + if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; var bi = BytecodeIterator.new().reset(decl); var swap_instance = false; var swap_membase = false; @@ -3201,6 +3604,28 @@ def funcCanInline(decl: FuncDecl) -> InlineConfig { type InlineConfig(swap_membase: bool, swap_instance: bool, can_inline: bool); +// checks if a regular (non-probe) function can be inlined based on +// simple heuristics: length <= 50 and straightline code. +def regularFuncCanInline(decl: FuncDecl) -> InlineConfig { + var default = InlineConfig(false, false, false); + if (decl.orig_bytecode.length > SpcTuning.maxInlineBytecodeSize || decl.sig.params.length > SpcTuning.maxInlineParams) return default; + var bi = BytecodeIterator.new().reset(decl); + while (bi.more()) { + var op = bi.current(); + match (op) { + // Do not support return-like instructions + RETURN, RETURN_CALL, RETURN_CALL_INDIRECT, RETURN_CALL_REF => return default; + // Cannot handle exception handling instructions. + TRY, CATCH, THROW, RETHROW, THROW_REF, DELEGATE, CATCH_ALL, TRY_TABLE => return default; + // Cannot handle stack switching instructions. + CONT_NEW, CONT_BIND, SUSPEND, RESUME, RESUME_THROW, RESUME_THROW_REF, SWITCH => return default; + _ => ; + } + bi.next(); + } + return InlineConfig(false, false, true); +} + // Used to record the entry point of exception/suspension handlers. Jumping to {stub_label} allows // control transfer to its corresponding handler without falling back to fast-int. 
// diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 181aa9696..835db433e 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -801,6 +801,11 @@ class X86_64MacroAssembler extends MacroAssembler { def emit_jump_r(reg: Reg) { asm.ijmp_r(G(reg)); } + def emit_inc_metric(metric: Metric) { + if (!metric.enabled) return; + asm.movq_r_l(scratch, Pointer.atField(metric.val) - Pointer.NULL); + asm.q.inc_m(scratch.plus(0)); + } def emit_increment_CountProbe(tmp: Reg, probe: CountProbe, increment: u64) { var r1 = G(tmp); var refOffset = asm.movq_r_p(r1, long.view(Pointer.atObject(probe))); @@ -1700,4 +1705,4 @@ def TRUNC_i32_f64_u = FloatTrunc.new(false, true, false); def TRUNC_i64_f32_s = FloatTrunc.new(true, false, true); def TRUNC_i64_f32_u = FloatTrunc.new(true, false, false); def TRUNC_i64_f64_s = FloatTrunc.new(true, true, true); -def TRUNC_i64_f64_u = FloatTrunc.new(true, true, false); \ No newline at end of file +def TRUNC_i64_f64_u = FloatTrunc.new(true, true, false); diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 04ad0875f..5b0fe13e4 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -34,6 +34,13 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { mmasm.trap_stubs = TRAPS_STUB; } + def getTrapStubIp(reason: TrapReason) -> long { + return mmasm.trap_stubs.getIpForReason(reason) - Pointer.NULL; + } + def getSpcInlinedFrameIp() -> long { + return INLINED_FRAME_STUB.start - Pointer.NULL; + } + private def visitCompareI(asm: X86_64Assembler, cond: X86_64Cond) -> bool { var b = pop(), a = popReg(); if (b.isConst()) asm.cmp_r_i(G(a.reg), b.const); @@ -1133,6 +1140,7 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { } def ucontext_rip_offset = 168; +def ucontext_rsp_offset = 160; def SIGFPE = 8; def SIGBUS = 10; def 
SIGSEGV = 11; @@ -1175,7 +1183,7 @@ class X86_64SpcCode extends RiUserCode { RiGc.rescanRoot(sp + X86_64InterpreterFrame.wasm_func.offset); RiGc.rescanRoot(sp + X86_64InterpreterFrame.instance.offset); RiGc.rescanRoot(sp + X86_64InterpreterFrame.accessor.offset); - if (SpcTuning.inlineSmallFunc && SpcTuning.intrinsifyWhammProbe) { + if (SpcTuning.inlineWhammProbe && SpcTuning.intrinsifyWhammProbe) { RiGc.rescanRoot(sp + X86_64InterpreterFrame.stp.offset); // TODO: define X86_64SpcFrame and use dedicated slot } } @@ -1185,7 +1193,7 @@ class X86_64SpcCode extends RiUserCode { class X86_64SpcModuleCode extends X86_64SpcCode { def mapping: Mapping; var codeEnd: int; // for dynamically adding code to the end - var sourcePcs: Vector<(int, int)>; + var sourcePcs: Vector<(int, List<(int, int)>)>; var embeddedRefOffsets: Vector; new(mapping) super("spc-module", mapping.range.start, mapping.range.end) { @@ -1229,35 +1237,77 @@ class X86_64SpcModuleCode extends X86_64SpcCode { return false; } // Updates the siginfo's {ucontext} to set the handler %rip and to write the PC of the fault location - // into the stack frame for the handler. + // into the stack frame for the handler, along with any inlined frames at the program location. private def updateUContextToTrapsStub(ucontext: Pointer, reason: TrapReason) { var p_rip = ucontext + ucontext_rip_offset; var p_rsp = RiOs.getSp(ucontext); if (!RiRuntime.inStackRedZone(p_rsp)) { // Update the current PC in the JIT frame, if it is accessible. 
var ip = p_rip.load(); - var pc = lookupPc(ip, false); - (p_rsp + X86_64InterpreterFrame.curpc.offset).store(pc); + var inline_ctx = lookupPc(ip, false); + if (inline_ctx == null) { + // old case: lookupPc failed (returned -1) + (p_rsp + X86_64InterpreterFrame.curpc.offset).store(-1); + } else if (inline_ctx.tail == null) { + // old case: store pc directly into stack + (p_rsp + X86_64InterpreterFrame.curpc.offset).store(inline_ctx.head.1); + } else { + // new case: reconstruct inlined frames + p_rsp = reconstructInlinedFramesForTrap(p_rsp, inline_ctx); + (ucontext + ucontext_rsp_offset).store(p_rsp); + } } var handler_ip = TRAPS_STUB.getIpForReason(reason); (p_rip).store(handler_ip); } - // Look up the source {pc} of a location {i} in this code. Returns {-1} if no exact entry is found. + // Reconstructs inlined interpreter frames for an inlined hardware trap context. + // Returns the new rsp to write into the ucontext (top of stack). + private def reconstructInlinedFramesForTrap(r_rsp: Pointer, inline_ctx: List<(int, int)>) -> Pointer { + def frames: Array<(int, int)> = Lists.toArray(inline_ctx); + def outer = frames[frames.length - 1]; + def inlined = frames[0 ... (frames.length - 1)]; + def count = inlined.length; + + // set outermost pc in the real frame + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(outer.1); + + // Read instance from the real outer frame (shared across all inlined frames). + var instance = (r_rsp + X86_64InterpreterFrame.instance.offset).load(); + + // Push inlined frames + for (i = count - 1; i >= 0; i--) { + var fid = inlined[i].0; + var pc = inlined[i].1; + + r_rsp += -8; + r_rsp.store(INLINED_FRAME_STUB.start); + + r_rsp += -X86_64InterpreterFrame.size; // move rsp? 
+ // write func, pc, frame accessor + var wasm_func = WasmFunction.!(instance.functions[fid]); + (r_rsp + X86_64InterpreterFrame.wasm_func.offset).store(wasm_func); + (r_rsp + X86_64InterpreterFrame.curpc.offset).store(pc); + (r_rsp + X86_64InterpreterFrame.accessor.offset).store(null); + } + return r_rsp; + } + // Look up the inline context for a location {ip} in this code. Returns {null} if no exact + // entry is found. The returned list head is the innermost (fid, pc), tail leads to outermost caller. // Return addresses are treated differently than other addresses in the code. - def lookupPc(ip: Pointer, isRetAddr: bool) -> int { + def lookupPc(ip: Pointer, isRetAddr: bool) -> List<(int, int)> { if (Trace.compiler) Trace.OUT.put2("SpcCode.lookupPc(0x%x, ret=%z)", (ip - Pointer.NULL), isRetAddr).ln(); - if (sourcePcs == null) return -1; - if (!mapping.range.contains(ip)) return -1; + if (sourcePcs == null) return null; + if (!mapping.range.contains(ip)) return null; var offset = ip - mapping.range.start - if(isRetAddr, 1); // XXX: use binary search for looking up source PCs in SPC code if (Trace.compiler) Trace.OUT.put1(" looking for offset=%d", offset).ln(); for (i < sourcePcs.length) { var entry = sourcePcs[i]; - if (Trace.compiler) Trace.OUT.put2(" (offset=%d, pc=%d)", entry.0, entry.1).ln(); + if (Trace.compiler) Trace.OUT.put1(" (offset=%d, ctx=[...])", entry.0).ln(); // TODO print full context if (offset == entry.0) return entry.1; } - return -1; + return null; } // Appends code to the end of this module. def appendCode(masm: X86_64MacroAssembler) -> Pointer { @@ -1307,6 +1357,13 @@ class X86_64SpcTrapsStub extends X86_64SpcCode { } } +// Marker for reconstructed inlined frames in stack traces. +// Used as the return address for frames materialized during frame reconstruction. 
+class X86_64SpcInlinedFrame extends X86_64SpcCode { + new() super("inlined-frame", Pointer.NULL, Pointer.NULL) { } +} + + // The lazy-compile stub needs special handling in the Virgil runtime because it has // a frame that stores the function being compiled. class X86_64SpcCompileStub extends RiUserCode { @@ -1348,6 +1405,9 @@ def LAZY_COMPILE_STUB = X86_64PreGenStub.new("spc-lazy-compile", X86_64SpcCompil def TIERUP_COMPILE_STUB = X86_64PreGenStub.new("spc-tierup-compile", X86_64SpcCompileStub.new("tierup"), genTierUpCompileStub); def TRAPS_STUB = X86_64SpcTrapsStub.new(); def TRAPS_PREGEN = X86_64PreGenStub.new("spc-trap", TRAPS_STUB, genTrapsStub); +def INLINED_FRAME_STUB = X86_64SpcInlinedFrame.new(); +def INLINED_FRAME_PREGEN = X86_64PreGenStub.new("spc-inlined-frame", INLINED_FRAME_STUB, genSpcInlinedFrame); + def genSpcEntryFunc(ic: X86_64InterpreterCode, w: DataWriter) { if (SpcTuning.disable) return; @@ -1452,6 +1512,10 @@ def genTrapsStub(ic: X86_64InterpreterCode, w: DataWriter) { w.skipN(skip); } } +def genSpcInlinedFrame(ic: X86_64InterpreterCode, w: DataWriter) { + var masm = X86_64MacroAssembler.new(w, X86_64MasmRegs.CONFIG); + masm.emit_intentional_crash(); // do not execute this +} def codePointer(f: P -> R) -> Pointer { return CiRuntime.unpackClosure(f).0; } @@ -1478,7 +1542,10 @@ component X86_64Spc { return addr; } def estimateCodeSizeFor(decl: FuncDecl) -> int { - return 60 + decl.orig_bytecode.length * 20; // TODO: huge overestimate + var bc = decl.orig_bytecode.length; + var size = 60 + bc * 20; + if (SpcTuning.maxInlineDepth > 0) size = size << byte.view(SpcTuning.maxInlineDepth + 1); + return size; } private def lazyCompile(wf: WasmFunction) -> (WasmFunction, Pointer, Throwable) { // The global stub simply consults the execution strategy. 
diff --git a/src/engine/x86-64/X86_64Stack.v3 b/src/engine/x86-64/X86_64Stack.v3 index a47b8665a..dacb2687a 100644 --- a/src/engine/x86-64/X86_64Stack.v3 +++ b/src/engine/x86-64/X86_64Stack.v3 @@ -145,6 +145,7 @@ class X86_64Stack extends WasmStack { x: X86_64InterpreterCode => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64SpcModuleCode => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64SpcTrapsStub => if (f != null && !f(retip, code, pos, param)) return (true, pos); + x: X86_64SpcInlinedFrame => if (f != null && !f(retip, code, pos, param)) return (true, pos); x: X86_64ReturnParentStub => { if (stack.parent == null || !continue_to_parent) { if (Trace.stack && Debug.stack) Trace.OUT.puts("walk finished").ln(); @@ -954,9 +955,14 @@ class X86_64FrameAccessor(stack: X86_64Stack, sp: Pointer, decl: FuncDecl) exten var ip = readIp(); var code = RiRuntime.findUserCode(ip); match (code) { - x: X86_64SpcModuleCode => cached_pc = x.lookupPc(ip, true); + x: X86_64SpcModuleCode => { + def chain = x.lookupPc(ip, true); + cached_pc = if(chain != null, chain.head.1, -1); + } x: X86_64InterpreterCode => cached_pc = X86_64Interpreter.computePCFromFrame(sp); x: X86_64SpcTrapsStub => cached_pc = (sp + X86_64InterpreterFrame.curpc.offset).load(); + x: X86_64SpcInlinedFrame => cached_pc = (sp + X86_64InterpreterFrame.curpc.offset).load(); + // An inlined frame reads the PC from the reconstructed frame, while regular code looks it up. 
_ => cached_pc = -1; } return cached_pc; @@ -982,6 +988,9 @@ class X86_64FrameAccessor(stack: X86_64Stack, sp: Pointer, decl: FuncDecl) exten match (code) { x: X86_64InterpreterCode => ; x: X86_64SpcCode => ; + x: X86_64SpcInlinedFrame => ; + // if quantity of inline frame is ever held in the frame, we can use that + // to skip some of the depth-searching process _ => return depth; } depth++; @@ -1069,6 +1078,8 @@ class X86_64FrameAccessor(stack: X86_64Stack, sp: Pointer, decl: FuncDecl) exten return X86_64SpcCode.?(code); } + // XXX inlined frames may be unwound without the function actually returning, + // i.e. shrinkwrapping around out-calls private def checkNotUnwound() { if (isUnwound()) System.error("FrameAccessorError", "frame has been unwound"); } diff --git a/test/wizeng/failures.jvm b/test/wizeng/failures.jvm new file mode 100644 index 000000000..7c5348aae --- /dev/null +++ b/test/wizeng/failures.jvm @@ -0,0 +1,6 @@ +wizeng/inline_test_arithmetic.wasm +wizeng/inline_test_locals_control.wasm +wizeng/inline_test_nesting.wasm +wizeng/inline_trap_memory.wasm +wizeng/inline_trap_tableoob.wasm +wizeng/inline_trap_unreachable.wasm diff --git a/test/wizeng/failures.v3i b/test/wizeng/failures.v3i new file mode 100644 index 000000000..7c5348aae --- /dev/null +++ b/test/wizeng/failures.v3i @@ -0,0 +1,6 @@ +wizeng/inline_test_arithmetic.wasm +wizeng/inline_test_locals_control.wasm +wizeng/inline_test_nesting.wasm +wizeng/inline_trap_memory.wasm +wizeng/inline_trap_tableoob.wasm +wizeng/inline_trap_unreachable.wasm diff --git a/test/wizeng/failures.x86-linux b/test/wizeng/failures.x86-linux new file mode 100644 index 000000000..7c5348aae --- /dev/null +++ b/test/wizeng/failures.x86-linux @@ -0,0 +1,6 @@ +wizeng/inline_test_arithmetic.wasm +wizeng/inline_test_locals_control.wasm +wizeng/inline_test_nesting.wasm +wizeng/inline_trap_memory.wasm +wizeng/inline_trap_tableoob.wasm +wizeng/inline_trap_unreachable.wasm diff --git a/test/wizeng/inline_test_arithmetic.wasm 
b/test/wizeng/inline_test_arithmetic.wasm new file mode 100644 index 000000000..c2bc63912 Binary files /dev/null and b/test/wizeng/inline_test_arithmetic.wasm differ diff --git a/test/wizeng/inline_test_arithmetic.wasm.exit b/test/wizeng/inline_test_arithmetic.wasm.exit new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/wizeng/inline_test_arithmetic.wasm.exit @@ -0,0 +1 @@ +0 diff --git a/test/wizeng/inline_test_arithmetic.wasm.flags b/test/wizeng/inline_test_arithmetic.wasm.flags new file mode 100644 index 000000000..d6ad59add --- /dev/null +++ b/test/wizeng/inline_test_arithmetic.wasm.flags @@ -0,0 +1 @@ +--mode=jit --metrics=spc*calls --inline-max-depth=1 diff --git a/test/wizeng/inline_test_arithmetic.wasm.out b/test/wizeng/inline_test_arithmetic.wasm.out new file mode 100644 index 000000000..816760310 --- /dev/null +++ b/test/wizeng/inline_test_arithmetic.wasm.out @@ -0,0 +1,4 @@ +spc:static_calls : 8 calls +spc:static_inlined_calls : 8 calls +spc:dynamic_calls : 8 calls +spc:dynamic_inlined_calls : 8 calls diff --git a/test/wizeng/inline_test_locals_control.wasm b/test/wizeng/inline_test_locals_control.wasm new file mode 100644 index 000000000..ae4a04817 Binary files /dev/null and b/test/wizeng/inline_test_locals_control.wasm differ diff --git a/test/wizeng/inline_test_locals_control.wasm.exit b/test/wizeng/inline_test_locals_control.wasm.exit new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/wizeng/inline_test_locals_control.wasm.exit @@ -0,0 +1 @@ +0 diff --git a/test/wizeng/inline_test_locals_control.wasm.flags b/test/wizeng/inline_test_locals_control.wasm.flags new file mode 100644 index 000000000..d6ad59add --- /dev/null +++ b/test/wizeng/inline_test_locals_control.wasm.flags @@ -0,0 +1 @@ +--mode=jit --metrics=spc*calls --inline-max-depth=1 diff --git a/test/wizeng/inline_test_locals_control.wasm.out b/test/wizeng/inline_test_locals_control.wasm.out new file mode 100644 index 000000000..e9a0b0cfd --- 
/dev/null +++ b/test/wizeng/inline_test_locals_control.wasm.out @@ -0,0 +1,4 @@ +spc:static_calls : 10 calls +spc:static_inlined_calls : 10 calls +spc:dynamic_calls : 10 calls +spc:dynamic_inlined_calls : 10 calls diff --git a/test/wizeng/inline_test_nesting.wasm b/test/wizeng/inline_test_nesting.wasm new file mode 100644 index 000000000..78a259bd8 Binary files /dev/null and b/test/wizeng/inline_test_nesting.wasm differ diff --git a/test/wizeng/inline_test_nesting.wasm.exit b/test/wizeng/inline_test_nesting.wasm.exit new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/wizeng/inline_test_nesting.wasm.exit @@ -0,0 +1 @@ +0 diff --git a/test/wizeng/inline_test_nesting.wasm.flags b/test/wizeng/inline_test_nesting.wasm.flags new file mode 100644 index 000000000..cada94339 --- /dev/null +++ b/test/wizeng/inline_test_nesting.wasm.flags @@ -0,0 +1 @@ +--mode=jit --metrics=spc*calls --inline-max-depth=2 diff --git a/test/wizeng/inline_test_nesting.wasm.out b/test/wizeng/inline_test_nesting.wasm.out new file mode 100644 index 000000000..0f54fe191 --- /dev/null +++ b/test/wizeng/inline_test_nesting.wasm.out @@ -0,0 +1,4 @@ +spc:static_calls : 19 calls +spc:static_inlined_calls : 17 calls +spc:dynamic_calls : 11 calls +spc:dynamic_inlined_calls : 10 calls diff --git a/test/wizeng/inline_trap_memory.wasm b/test/wizeng/inline_trap_memory.wasm new file mode 100644 index 000000000..f03a3642c Binary files /dev/null and b/test/wizeng/inline_trap_memory.wasm differ diff --git a/test/wizeng/inline_trap_memory.wasm.exit b/test/wizeng/inline_trap_memory.wasm.exit new file mode 100644 index 000000000..ace9d0362 --- /dev/null +++ b/test/wizeng/inline_trap_memory.wasm.exit @@ -0,0 +1 @@ +255 diff --git a/test/wizeng/inline_trap_memory.wasm.flags b/test/wizeng/inline_trap_memory.wasm.flags new file mode 100644 index 000000000..85dace623 --- /dev/null +++ b/test/wizeng/inline_trap_memory.wasm.flags @@ -0,0 +1 @@ +--mode=jit --inline-max-depth=1 diff --git 
a/test/wizeng/inline_trap_memory.wasm.out b/test/wizeng/inline_trap_memory.wasm.out new file mode 100644 index 000000000..cf539a562 --- /dev/null +++ b/test/wizeng/inline_trap_memory.wasm.out @@ -0,0 +1,3 @@ + +3 + +5 + !trap[MEMORY_OOB] diff --git a/test/wizeng/inline_trap_tableoob.wasm b/test/wizeng/inline_trap_tableoob.wasm new file mode 100644 index 000000000..8fc6cdad8 Binary files /dev/null and b/test/wizeng/inline_trap_tableoob.wasm differ diff --git a/test/wizeng/inline_trap_tableoob.wasm.exit b/test/wizeng/inline_trap_tableoob.wasm.exit new file mode 100644 index 000000000..ace9d0362 --- /dev/null +++ b/test/wizeng/inline_trap_tableoob.wasm.exit @@ -0,0 +1 @@ +255 diff --git a/test/wizeng/inline_trap_tableoob.wasm.flags b/test/wizeng/inline_trap_tableoob.wasm.flags new file mode 100644 index 000000000..85dace623 --- /dev/null +++ b/test/wizeng/inline_trap_tableoob.wasm.flags @@ -0,0 +1 @@ +--mode=jit --inline-max-depth=1 diff --git a/test/wizeng/inline_trap_tableoob.wasm.out b/test/wizeng/inline_trap_tableoob.wasm.out new file mode 100644 index 000000000..44b4678e4 --- /dev/null +++ b/test/wizeng/inline_trap_tableoob.wasm.out @@ -0,0 +1,3 @@ + +3 + +3 + !trap[TABLE_OOB] diff --git a/test/wizeng/inline_trap_unreachable.wasm b/test/wizeng/inline_trap_unreachable.wasm new file mode 100644 index 000000000..ae5ab00da Binary files /dev/null and b/test/wizeng/inline_trap_unreachable.wasm differ diff --git a/test/wizeng/inline_trap_unreachable.wasm.exit b/test/wizeng/inline_trap_unreachable.wasm.exit new file mode 100644 index 000000000..ace9d0362 --- /dev/null +++ b/test/wizeng/inline_trap_unreachable.wasm.exit @@ -0,0 +1 @@ +255 diff --git a/test/wizeng/inline_trap_unreachable.wasm.flags b/test/wizeng/inline_trap_unreachable.wasm.flags new file mode 100644 index 000000000..85dace623 --- /dev/null +++ b/test/wizeng/inline_trap_unreachable.wasm.flags @@ -0,0 +1 @@ +--mode=jit --inline-max-depth=1 diff --git a/test/wizeng/inline_trap_unreachable.wasm.out 
b/test/wizeng/inline_trap_unreachable.wasm.out new file mode 100644 index 000000000..0445b36ca --- /dev/null +++ b/test/wizeng/inline_trap_unreachable.wasm.out @@ -0,0 +1,3 @@ + +3 + +1 + !trap[UNREACHABLE]