Compare commits

...

10 Commits

Author SHA1 Message Date
ef6cd851f7 remove unnecessary labels 2025-12-10 11:42:41 +01:00
557c98917c support lto 2025-12-10 11:40:24 +01:00
c32cd74628 syscall tracing skeleton 2025-12-10 10:51:52 +01:00
a8f55f6d63 replace greedy strategy with a configurable count strategy 2025-12-09 07:51:16 +01:00
8d907f071c convert Patcher to a global singleton
Migrates Patcher state to global variables and uses std.once for initialization.
This prepares for future syscall tracing, where flicken need static access to
the patching context across the runtime.
2025-12-09 07:07:22 +01:00
9d4f325a2c enable lto for release builds 2025-12-08 15:07:57 +01:00
0788dd30f2 allow greedy allocation for faster patching 2025-12-08 15:03:44 +01:00
49ae70ec2c try other allocation for relocation overflow 2025-12-08 09:56:00 +01:00
1922669c53 exclusive upper bound 2025-12-08 09:54:29 +01:00
434681eeb8 minor 2025-12-04 12:09:17 +01:00
5 changed files with 337 additions and 165 deletions

View File

@@ -35,6 +35,7 @@ pub fn build(b: *std.Build) !void {
.root_module = mod,
});
exe.pie = true;
exe.lto = if (optimize == .Debug) .none else .full;
b.installArtifact(exe);
const run_step = b.step("run", "Run the app");
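// Alternative sketch: expose the toggle set on `exe.lto` above as a user option instead of
// keying purely off the optimize mode (the option name "lto" and `want_lto` are illustrative,
// not part of this change):
const want_lto = b.option(bool, "lto", "Enable link-time optimization") orelse
    (optimize != .Debug);
exe.lto = if (want_lto) .full else .none;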

View File

@@ -6,6 +6,7 @@ const mem = std.mem;
const posix = std.posix;
const zydis = @import("zydis").zydis;
const dis = @import("disassembler.zig");
const syscalls = @import("syscalls.zig");
const log = std.log.scoped(.patcher);
const AddressAllocator = @import("AddressAllocator.zig");
@@ -17,15 +18,11 @@ const Range = @import("Range.zig");
const assert = std.debug.assert;
const page_size = 4096;
const page_size = std.heap.pageSize();
const jump_rel32: u8 = 0xe9;
const jump_rel32_size = 5;
const jump_rel8: u8 = 0xeb;
const jump_rel8_size = 2;
const max_ins_bytes = 15;
// Based on the paper 'x86-64 Instruction Usage among C/C++ Applications' by 'Akshintala et al.'
// it's '4.25' bytes, so 4 is good enough. (https://oscarlab.github.io/papers/instrpop-systor19.pdf)
const avg_ins_bytes = 4;
// TODO: Find an invalid instruction to use.
// const invalid: u8 = 0xaa;
@@ -33,42 +30,53 @@ const int3: u8 = 0xcc;
const nop: u8 = 0x90;
// Prefixes for Padded Jumps (Tactic T1)
const prefix_fs: u8 = 0x64;
const prefix_gs: u8 = 0x65;
const prefix_ss: u8 = 0x36;
const prefixes = [_]u8{ prefix_fs, prefix_gs, prefix_ss };
const Patcher = @This();
gpa: mem.Allocator,
flicken: std.StringArrayHashMapUnmanaged(Flicken) = .empty,
address_allocator: AddressAllocator = .empty,
/// Tracks the base addresses of pages we have mmap'd for Flicken.
allocated_pages: std.AutoHashMapUnmanaged(u64, void) = .empty,
pub fn init(gpa: mem.Allocator) !Patcher {
var flicken: std.StringArrayHashMapUnmanaged(Flicken) = .empty;
try flicken.ensureTotalCapacity(gpa, 8);
flicken.putAssumeCapacity("nop", .{ .name = "nop", .bytes = &.{} });
return .{
.gpa = gpa,
.flicken = flicken,
const prefixes = [_]u8{
// prefix_fs,
0x64,
// prefix_gs,
0x65,
// prefix_ss,
0x36,
};
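// Illustrative encoding (sketch): prepending one of these prefixes to the 5-byte `E9 <rel32>`
// jump yields e.g. `65 E9 <rel32>` (6 bytes). The prefix is ignored by the CPU for a near jump,
// but it shifts where the four displacement bytes land, which is the extra freedom the punning
// tactics below use to satisfy byte constraints imposed by neighboring instructions.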
}
pub fn deinit(patcher: *Patcher) void {
_ = patcher;
var syscall_flicken_bytes = [13]u8{
0x49, 0xBB, // mov r11
0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, // 8byte immediate
0x41, 0xff, 0xd3, // call r11
};
pub var gpa: mem.Allocator = undefined;
pub var flicken_templates: std.StringArrayHashMapUnmanaged(Flicken) = .empty;
pub var address_allocator: AddressAllocator = .empty;
/// Tracks the base addresses of pages we have mmap'd for Flicken.
pub var allocated_pages: std.AutoHashMapUnmanaged(u64, void) = .empty;
pub var mutex: std.Thread.Mutex = .{};
var init_once = std.once(initInner);
pub fn init() void {
init_once.call();
}
fn initInner() void {
gpa = std.heap.page_allocator;
flicken_templates.ensureTotalCapacity(
std.heap.page_allocator,
page_size / @sizeOf(Flicken),
) catch @panic("failed initializing patcher");
flicken_templates.putAssumeCapacity("nop", .{ .name = "nop", .bytes = &.{} });
mem.writeInt(u64, syscall_flicken_bytes[2..][0..8], @intFromPtr(&syscalls.syscall_entry), .little);
flicken_templates.putAssumeCapacity("syscall", .{ .name = "syscall", .bytes = &syscall_flicken_bytes });
}
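// Usage note (sketch): `init` may be called from any entry point; `std.once` guarantees that
// `initInner` runs exactly once, so repeated calls are effectively free:
//
//     Patcher.init(); // first call populates the globals
//     Patcher.init(); // later calls return immediately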
/// Flicken name and bytes have to stay valid for as long as they are used. If a trampoline with
/// the same name is already registered, it gets overwritten.
/// NOTE: The names "nop" and "syscall" are reserved and always have the IDs 0 and 1.
pub fn addFlicken(patcher: *Patcher, trampoline: Flicken) !FlickenId {
pub fn addFlicken(trampoline: Flicken) !FlickenId {
assert(!mem.eql(u8, "nop", trampoline.name));
try patcher.flicken.ensureUnusedCapacity(patcher.gpa, 1);
assert(!mem.eql(u8, "syscall", trampoline.name));
try flicken_templates.ensureUnusedCapacity(gpa, 1);
errdefer comptime unreachable;
const gop = patcher.flicken.getOrPutAssumeCapacity(trampoline.name);
const gop = flicken_templates.getOrPutAssumeCapacity(trampoline.name);
if (gop.found_existing) {
log.warn("addTrampoline: Overwriting existing trampoline: {s}", .{trampoline.name});
}
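// Usage sketch for the new global API (the flicken name "trace", `trace_stub`, and
// `registerTraceFlicken` are hypothetical; per the doc comment above, the bytes must stay
// valid for as long as the flicken is in use):
var trace_stub = [_]u8{ 0x90, 0x90 }; // statically-lived placeholder trampoline body
fn registerTraceFlicken() !FlickenId {
    Patcher.init();
    return addFlicken(.{ .name = "trace", .bytes = &trace_stub });
}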
@@ -93,6 +101,8 @@ pub const FlickenId = enum(u64) {
/// It also needs special handling when constructing the patches, because it's different for
/// each instruction.
nop = 0,
/// Calls the syscall tracing entry point (`syscalls.syscall_entry`) in place of the original
/// `syscall` instruction.
syscall = 1,
_,
};
@@ -169,18 +179,28 @@ pub const Statistics = struct {
}
};
pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
/// Scans a memory region for instructions that require patching and applies the patches
/// using a hierarchy of tactics (Direct/Punning -> Successor Eviction -> Neighbor Eviction).
///
/// The region is processed Back-to-Front to ensure that modifications (punning) only
/// constrain instructions that have already been processed or are locked.
pub fn patchRegion(region: []align(page_size) u8) !void {
// For now just do a coarse lock.
// TODO: should we make this more fine-grained?
mutex.lock();
defer mutex.unlock();
{
// Block the region, such that we don't try to allocate there anymore.
const start: i64 = @intCast(@intFromPtr(region.ptr));
try patcher.address_allocator.block(
patcher.gpa,
try address_allocator.block(
gpa,
.{ .start = start, .end = start + @as(i64, @intCast(region.len)) },
page_size,
);
}
var arena_impl = std.heap.ArenaAllocator.init(patcher.gpa);
var arena_impl = std.heap.ArenaAllocator.init(gpa);
const arena = arena_impl.allocator();
defer arena_impl.deinit();
@@ -200,11 +220,12 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
const offset = instruction.address - @intFromPtr(region.ptr);
instruction_starts.set(offset);
const should_patch = instruction.instruction.mnemonic == zydis.ZYDIS_MNEMONIC_SYSCALL or
const is_syscall = instruction.instruction.mnemonic == zydis.ZYDIS_MNEMONIC_SYSCALL;
const should_patch = is_syscall or
instruction.instruction.attributes & zydis.ZYDIS_ATTRIB_HAS_LOCK > 0;
if (should_patch) {
const request: PatchRequest = .{
.flicken = .nop,
.flicken = if (is_syscall) .syscall else .nop,
.offset = offset,
.size = instruction.instruction.length,
.bytes = region[offset..],
@@ -234,7 +255,7 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
}
last_offset = request.offset;
if (@as(u64, @intFromEnum(request.flicken)) >= patcher.flicken.count()) {
if (@as(u64, @intFromEnum(request.flicken)) >= flicken_templates.count()) {
const fmt = dis.formatBytes(request.bytes[0..request.size]);
log.err(
"patchRegion: Usage of undefined flicken in request {f} for instruction: {s}",
@@ -269,7 +290,7 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
}
}
if (try patcher.attemptDirectOrPunning(
if (try attemptDirectOrPunning(
request,
arena,
&locked_bytes,
@@ -279,7 +300,7 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
continue :requests;
}
if (try patcher.attemptSuccessorEviction(
if (try attemptSuccessorEviction(
request,
arena,
&locked_bytes,
@@ -289,7 +310,7 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
continue :requests;
}
if (try patcher.attemptNeighborEviction(
if (try attemptNeighborEviction(
request,
arena,
&locked_bytes,
@@ -323,7 +344,6 @@ pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void {
}
fn attemptDirectOrPunning(
patcher: *Patcher,
request: PatchRequest,
arena: mem.Allocator,
locked_bytes: *std.DynamicBitSetUnmanaged,
@@ -333,7 +353,7 @@ fn attemptDirectOrPunning(
const flicken: Flicken = if (request.flicken == .nop)
.{ .name = "nop", .bytes = request.bytes[0..request.size] }
else
patcher.flicken.entries.get(@intFromEnum(request.flicken)).value;
flicken_templates.entries.get(@intFromEnum(request.flicken)).value;
var pii = PatchInstructionIterator.init(
request.bytes,
@@ -346,9 +366,9 @@ fn attemptDirectOrPunning(
// mapped. While harmless (it becomes an unused executable page), it is technically a
// memory leak. A future fix should track "current attempt" pages separately and unmap
// them on failure.
while (pii.next(&patcher.address_allocator)) |allocated_range| {
while (pii.next(.{ .count = 256 })) |allocated_range| {
try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(allocated_range));
patcher.ensureRangeWritable(
ensureRangeWritable(
allocated_range,
pages_made_writable,
) catch |err| switch (err) {
@@ -366,7 +386,7 @@ fn attemptDirectOrPunning(
else => return err,
};
try patcher.address_allocator.block(patcher.gpa, allocated_range, 0);
try address_allocator.block(gpa, allocated_range, 0);
const lock_size = jump_rel32_size + pii.num_prefixes;
locked_bytes.setRangeValue(
.{ .start = request.offset, .end = request.offset + lock_size },
@@ -374,7 +394,7 @@ fn attemptDirectOrPunning(
);
if (request.size >= 5) {
assert(pii.num_prefixes == 0);
// assert(pii.num_prefixes == 0);
stats.jump += 1;
} else {
stats.punning[pii.num_prefixes] += 1;
@@ -385,7 +405,6 @@ fn attemptDirectOrPunning(
}
fn attemptSuccessorEviction(
patcher: *Patcher,
request: PatchRequest,
arena: mem.Allocator,
locked_bytes: *std.DynamicBitSetUnmanaged,
@@ -421,7 +440,7 @@ fn attemptSuccessorEviction(
succ_request.size,
succ_flicken.size(),
);
while (succ_pii.next(&patcher.address_allocator)) |succ_range| {
while (succ_pii.next(.{ .count = 16 })) |succ_range| {
// Ensure bytes match original before retry.
assert(mem.eql(
u8,
@@ -430,7 +449,7 @@ fn attemptSuccessorEviction(
));
try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(succ_range));
patcher.ensureRangeWritable(
ensureRangeWritable(
succ_range,
pages_made_writable,
) catch |err| switch (err) {
@@ -452,17 +471,17 @@ fn attemptSuccessorEviction(
const flicken: Flicken = if (request.flicken == .nop)
.{ .name = "nop", .bytes = request.bytes[0..request.size] }
else
patcher.flicken.entries.get(@intFromEnum(request.flicken)).value;
flicken_templates.entries.get(@intFromEnum(request.flicken)).value;
var orig_pii = PatchInstructionIterator.init(
request.bytes,
request.size,
flicken.size(),
);
while (orig_pii.next(&patcher.address_allocator)) |orig_range| {
while (orig_pii.next(.{ .count = 16 })) |orig_range| {
if (succ_range.touches(orig_range)) continue;
try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(orig_range));
patcher.ensureRangeWritable(
ensureRangeWritable(
orig_range,
pages_made_writable,
) catch |err| switch (err) {
@@ -480,8 +499,8 @@ fn attemptSuccessorEviction(
else => return err,
};
try patcher.address_allocator.block(patcher.gpa, succ_range, 0);
try patcher.address_allocator.block(patcher.gpa, orig_range, 0);
try address_allocator.block(gpa, succ_range, 0);
try address_allocator.block(gpa, orig_range, 0);
const lock_size = request.size + jump_rel32_size + succ_pii.num_prefixes;
locked_bytes.setRangeValue(
.{ .start = request.offset, .end = request.offset + lock_size },
@@ -501,7 +520,6 @@ fn attemptSuccessorEviction(
}
fn attemptNeighborEviction(
patcher: *Patcher,
request: PatchRequest,
arena: mem.Allocator,
locked_bytes: *std.DynamicBitSetUnmanaged,
@@ -509,56 +527,48 @@ fn attemptNeighborEviction(
instruction_starts: *const std.DynamicBitSetUnmanaged,
stats: *Statistics,
) !bool {
// Iterate valid neighbors.
// Neighbors must be within [-128, 127] range for a short jump.
// Valid neighbors must be within [-128, 127] range for a short jump.
// Since we patch back-to-front, we only look at neighbors *after* the current instruction
// (higher address) to avoid evicting an instruction we haven't processed/patched yet.
// Short jump is 2 bytes (EB xx). Target is IP + 2 + xx.
// So min offset is +2 (xx=0). Max offset is +2+127 = +129.
const start_offset = request.offset + 2;
const end_offset = @min(
start_offset + 128, // 2 + 128
start_offset + 128,
request.bytes.len + request.offset,
);
neighbor: for (start_offset..end_offset) |neighbor_offset| {
if (!instruction_starts.isSet(neighbor_offset)) continue;
// Found a candidate victim instruction.
// We must access it relative to the request bytes slice.
const victim_bytes_all = request.bytes[neighbor_offset - request.offset ..];
// Disassemble to get size.
// PERF: We could also search for the next set bit in instruction_starts
const victim_instr = dis.disassembleInstruction(victim_bytes_all) orelse continue;
const victim_size = victim_instr.instruction.length;
const victim_bytes = victim_bytes_all[0..victim_size];
// Check locks for victim.
for (0..victim_size) |i| {
if (locked_bytes.isSet(neighbor_offset + i)) {
continue :neighbor;
}
}
// Save original bytes to revert.
// Save original bytes to revert if constraints cannot be solved.
var victim_orig_bytes: [15]u8 = undefined;
@memcpy(victim_orig_bytes[0..victim_size], victim_bytes);
// OUTER LOOP: J_Patch
// Iterate possible offsets 'k' inside the victim for the patch jump.
// J_Patch is 5 bytes. It can extend beyond victim.
for (1..victim_size) |k| {
// Check if short jump from P reaches V+k
var k: u8 = 1;
while (k < victim_size) : (k += 1) {
const target: i64 = @intCast(neighbor_offset + k);
const source: i64 = @intCast(request.offset + 2);
const disp = target - source;
if (disp > 127 or disp < -128) continue; // Should be covered by loop bounds, but be safe.
if (disp > 127 or disp < -128) continue;
const patch_flicken: Flicken = if (request.flicken == .nop)
.{ .name = "nop", .bytes = request.bytes[0..request.size] }
else
patcher.flicken.entries.get(@intFromEnum(request.flicken)).value;
flicken_templates.entries.get(@intFromEnum(request.flicken)).value;
// Constraints for J_Patch:
// Bytes [0 .. victim_size - k] are free (inside victim).
@@ -569,19 +579,18 @@ fn attemptNeighborEviction(
patch_flicken.size(),
);
while (patch_pii.next(&patcher.address_allocator)) |patch_range| {
while (patch_pii.next(.{ .count = 16 })) |patch_range| {
// J_Patch MUST NOT use prefixes, because it's punned inside J_Victim.
// Adding prefixes would shift J_Patch relative to J_Victim, making constraints harder.
if (patch_pii.num_prefixes > 0) break;
try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(patch_range));
patcher.ensureRangeWritable(patch_range, pages_made_writable) catch |err| switch (err) {
ensureRangeWritable(patch_range, pages_made_writable) catch |err| switch (err) {
error.MappingAlreadyExists => continue,
else => return err,
};
// Tentatively write J_Patch to memory to set constraints for J_Victim.
// We must perform the write logic manually because applyPatch assumes request struct.
// We only need to write the bytes of J_Patch that land inside the victim.
{
const jmp_target = patch_range.start;
@@ -602,15 +611,15 @@ fn attemptNeighborEviction(
var victim_pii = PatchInstructionIterator.init(
victim_bytes_all,
@intCast(k),
k,
victim_flicken.size(),
);
while (victim_pii.next(&patcher.address_allocator)) |victim_range| {
while (victim_pii.next(.{ .count = 16 })) |victim_range| {
if (patch_range.touches(victim_range)) continue;
try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(victim_range));
patcher.ensureRangeWritable(victim_range, pages_made_writable) catch |err| switch (err) {
ensureRangeWritable(victim_range, pages_made_writable) catch |err| switch (err) {
error.MappingAlreadyExists => continue,
else => return err,
};
@@ -620,55 +629,48 @@ fn attemptNeighborEviction(
// 1. Write Patch Trampoline (J_Patch target)
{
const trampoline: [*]u8 = @ptrFromInt(patch_range.getStart(u64));
@memcpy(trampoline, patch_flicken.bytes);
var reloc_info: ?RelocInfo = null;
if (request.flicken == .nop) {
const instr = dis.disassembleInstruction(patch_flicken.bytes).?;
try relocateInstruction(
instr,
@intCast(patch_range.start),
trampoline[0..patch_flicken.bytes.len],
);
reloc_info = .{
.instr = dis.disassembleInstruction(patch_flicken.bytes).?,
.old_addr = @intFromPtr(request.bytes.ptr),
};
}
// Jmp back from Patch Trampoline to original code (after request)
trampoline[patch_flicken.bytes.len] = jump_rel32;
const ret_addr: i64 = @intCast(@intFromPtr(&request.bytes[request.size]));
const from = patch_range.end;
const jmp_back_disp: i32 = @intCast(ret_addr - from);
mem.writeInt(i32, trampoline[patch_flicken.bytes.len + 1 ..][0..4], jmp_back_disp, .little);
commitTrampoline(
trampoline,
patch_flicken.bytes,
reloc_info,
@intFromPtr(request.bytes.ptr) + request.size,
) catch |err| switch (err) {
error.RelocationOverflow => continue,
else => return err,
};
}
// 2. Write Victim Trampoline (J_Victim target)
{
const trampoline: [*]u8 = @ptrFromInt(victim_range.getStart(u64));
@memcpy(trampoline, victim_orig_bytes[0..victim_size]);
// Relocate victim instruction
const instr = dis.disassembleInstruction(victim_orig_bytes[0..victim_size]).?;
try relocateInstruction(
instr,
@intCast(victim_range.start),
trampoline[0..victim_size],
);
// Jmp back from Victim Trampoline to original code (after victim)
trampoline[victim_size] = jump_rel32;
const ret_addr: i64 = @intCast(@intFromPtr(&victim_bytes_all[victim_size]));
const from = victim_range.end;
const jmp_back_disp: i32 = @intCast(ret_addr - from);
mem.writeInt(i32, trampoline[victim_size + 1 ..][0..4], jmp_back_disp, .little);
commitTrampoline(
trampoline,
victim_orig_bytes[0..victim_size],
.{
.instr = dis.disassembleInstruction(victim_orig_bytes[0..victim_size]).?,
.old_addr = @intFromPtr(victim_bytes_all.ptr),
},
@intFromPtr(victim_bytes_all.ptr) + victim_size,
) catch |err| switch (err) {
error.RelocationOverflow => continue,
else => return err,
};
}
// 3. Write J_Victim (overwrites head of J_Patch which is fine, we just used it for constraints)
applyPatch(
// Create a fake request for the victim part
.{
.flicken = .nop, // Irrelevant, unused by applyPatch for jump writing
.offset = neighbor_offset,
.size = @intCast(victim_size),
.bytes = victim_bytes_all,
},
victim_flicken, // Unused by applyPatch for jump writing
victim_range,
// 3. Write J_Victim (overwrites head of J_Patch which is fine)
commitJump(
victim_bytes_all.ptr,
@intCast(victim_range.start),
victim_pii.num_prefixes,
) catch unreachable; // Should fit because we allocated it
k, // Total size for padding is limited to k to preserve J_Patch tail
);
// 4. Write J_Short at request
request.bytes[0] = jump_rel8;
@@ -678,8 +680,8 @@ fn attemptNeighborEviction(
}
// 5. Locking
try patcher.address_allocator.block(patcher.gpa, patch_range, 0);
try patcher.address_allocator.block(patcher.gpa, victim_range, 0);
try address_allocator.block(gpa, patch_range, 0);
try address_allocator.block(gpa, victim_range, 0);
locked_bytes.setRangeValue(
.{ .start = request.offset, .end = request.offset + request.size },
@@ -706,6 +708,10 @@ fn attemptNeighborEviction(
return false;
}
/// Applies a standard patch (T1/B1/B2) where the instruction is replaced by a jump to a trampoline.
///
/// This handles the logic of writing the trampoline content (including relocation) and
/// overwriting the original instruction with a `JMP` (plus prefixes/padding).
fn applyPatch(
request: PatchRequest,
flicken: Flicken,
@@ -713,51 +719,78 @@ fn applyPatch(
num_prefixes: u8,
) !void {
const flicken_addr: [*]u8 = @ptrFromInt(allocated_range.getStart(u64));
const flicken_slice = flicken_addr[0..flicken.size()];
const jump_to_offset: i32 = blk: {
const from: i64 = @intCast(@intFromPtr(&request.bytes[
num_prefixes + jump_rel32_size
]));
const to = allocated_range.start;
break :blk @intCast(to - from);
};
const jump_back_offset: i32 = blk: {
const from = allocated_range.end;
const to: i64 = @intCast(@intFromPtr(&request.bytes[request.size]));
break :blk @intCast(to - from);
};
// The jumps have to be in the opposite direction.
assert(math.sign(jump_to_offset) * math.sign(jump_back_offset) < 0);
// Write to the trampoline first, because for the `nop` flicken `flicken.bytes` points to
// `request.bytes` which we overwrite in the next step.
@memcpy(flicken_addr, flicken.bytes);
// Commit Trampoline
var reloc_info: ?RelocInfo = null;
if (request.flicken == .nop) {
const instr_bytes = request.bytes[0..request.size];
const instr = dis.disassembleInstruction(instr_bytes).?;
reloc_info = .{
.instr = dis.disassembleInstruction(request.bytes[0..request.size]).?,
.old_addr = @intFromPtr(request.bytes.ptr),
};
}
const ret_addr = @intFromPtr(request.bytes.ptr) + request.size;
try commitTrampoline(flicken_addr, flicken.bytes, reloc_info, ret_addr);
// Commit Jump (Patch)
commitJump(request.bytes.ptr, @intCast(allocated_range.start), num_prefixes, request.size);
}
const RelocInfo = struct {
instr: dis.BundledInstruction,
old_addr: u64,
};
/// Helper to write code into a trampoline.
///
/// It copies the original bytes (or flicken content), relocates any RIP-relative instructions
/// to be valid at the new address, and appends a jump back to the instruction stream.
fn commitTrampoline(
trampoline_ptr: [*]u8,
content: []const u8,
reloc_info: ?RelocInfo,
return_addr: u64,
) !void {
@memcpy(trampoline_ptr[0..content.len], content);
if (reloc_info) |info| {
try relocateInstruction(
instr,
@intCast(allocated_range.start),
flicken_slice[0..request.size],
info.instr,
@intFromPtr(trampoline_ptr),
trampoline_ptr[0..content.len],
);
}
flicken_slice[flicken.bytes.len] = jump_rel32;
const jump_back_location = flicken_slice[flicken.bytes.len + 1 ..][0..4];
mem.writeInt(i32, jump_back_location, jump_back_offset, .little);
@memcpy(request.bytes[0..num_prefixes], prefixes[0..num_prefixes]);
request.bytes[num_prefixes] = jump_rel32;
mem.writeInt(
i32,
request.bytes[num_prefixes + 1 ..][0..4],
jump_to_offset,
.little,
);
// Pad remaining with int3.
// Write jump back
trampoline_ptr[content.len] = jump_rel32;
const jump_src = @intFromPtr(trampoline_ptr) + content.len + jump_rel32_size;
const jump_disp: i32 = @intCast(@as(i64, @intCast(return_addr)) - @as(i64, @intCast(jump_src)));
mem.writeInt(i32, trampoline_ptr[content.len + 1 ..][0..4], jump_disp, .little);
}
/// Helper to overwrite an instruction with a jump to a trampoline.
///
/// It handles writing optional prefixes (padding), the `0xE9` opcode, the relative offset,
/// and fills any remaining bytes of the original instruction with `INT3` to prevent
/// execution of garbage bytes.
fn commitJump(
from_ptr: [*]u8,
to_addr: u64,
num_prefixes: u8,
total_size: usize,
) void {
const prefixes_slice = from_ptr[0..num_prefixes];
@memcpy(prefixes_slice, prefixes[0..num_prefixes]);
from_ptr[num_prefixes] = jump_rel32;
const jump_src = @intFromPtr(from_ptr) + num_prefixes + jump_rel32_size;
const jump_disp: i32 = @intCast(@as(i64, @intCast(to_addr)) - @as(i64, @intCast(jump_src)));
mem.writeInt(i32, from_ptr[num_prefixes + 1 ..][0..4], jump_disp, .little);
const patch_end_index = num_prefixes + jump_rel32_size;
if (patch_end_index < request.size) {
@memset(request.bytes[patch_end_index..request.size], int3);
if (patch_end_index < total_size) {
@memset(from_ptr[patch_end_index..total_size], int3);
}
}
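// Illustrative result: for a 7-byte instruction patched with `num_prefixes = 1`, this leaves
// `65 E9 xx xx xx xx CC` behind, i.e. one prefix, the 5-byte jump, and a single `int3` filler
// byte so no tail bytes of the old instruction remain executable.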
@@ -780,7 +813,6 @@ fn touchedPageCount(range: Range) u32 {
/// Ensure `range` is mapped R|W. Assumes `pages_made_writable` has enough free capacity.
fn ensureRangeWritable(
patcher: *Patcher,
range: Range,
pages_made_writable: *std.AutoHashMapUnmanaged(u64, void),
) !void {
@@ -792,7 +824,7 @@ fn ensureRangeWritable(
// If the page is already writable, skip it.
if (pages_made_writable.get(page_addr)) |_| continue;
// If we mapped it already we have to do mprotect, else mmap.
const gop = try patcher.allocated_pages.getOrPut(patcher.gpa, page_addr);
const gop = try allocated_pages.getOrPut(gpa, page_addr);
if (gop.found_existing) {
const ptr: [*]align(page_size) u8 = @ptrFromInt(page_addr);
try posix.mprotect(ptr[0..page_size], protection);
@@ -810,8 +842,8 @@ fn ensureRangeWritable(
// (executable, OS, dynamic loader,...) allocated something there.
// We block this so we don't try this page again in the future,
// saving a bunch of syscalls.
try patcher.address_allocator.block(
patcher.gpa,
try address_allocator.block(
gpa,
.{ .start = @intCast(page_addr), .end = @intCast(page_addr + page_size) },
page_size,
);
@@ -835,6 +867,7 @@ const PatchInstructionIterator = struct {
num_prefixes: u8,
pli: PatchLocationIterator,
valid_range: Range,
allocated_count: u64,
fn init(
bytes: []const u8,
@@ -851,12 +884,26 @@ const PatchInstructionIterator = struct {
.num_prefixes = 0,
.pli = pli,
.valid_range = valid_range,
.allocated_count = 0,
};
}
pub const Strategy = union(enum) {
/// Iterates through all possible ranges.
/// Useful for finding the optimal allocation (fewest prefixes).
exhaustive: void,
/// Limits the search to `count` allocation attempts per valid constraint range found by the
/// PatchLocationIterator.
///
/// This acts as a heuristic to prevent worst-case performance (scanning every byte of a 2GB
/// gap) while still offering better density than a purely greedy approach. A count of 1 is
/// equivalent to a greedy strategy.
count: u64,
};
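// Usage sketch: `pii.next(.{ .count = 256 })` tries at most 256 candidate ranges per constraint
// window before moving on to the next window, `.{ .count = 1 }` reproduces the old greedy
// behavior, and `.exhaustive` walks every admissible start address.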
fn next(
pii: *PatchInstructionIterator,
address_allocator: *AddressAllocator,
strategy: Strategy,
) ?Range {
const State = enum {
allocation,
@@ -870,11 +917,23 @@ const PatchInstructionIterator = struct {
pii.valid_range,
)) |allocated_range| {
assert(allocated_range.size() == pii.flicken_size);
pii.allocated_count += 1;
// Advancing the valid range, such that the next call to `findAllocation` won't
// find the same range again.
switch (strategy) {
.exhaustive => pii.valid_range.start = allocated_range.start + 1,
.count => |c| {
if (pii.allocated_count >= c) {
pii.valid_range.start = pii.valid_range.end;
pii.allocated_count = 0;
} else {
pii.valid_range.start = allocated_range.start + 1;
}
},
}
return allocated_range;
} else {
pii.allocated_count = 0;
continue :blk .range;
}
},

View File

@@ -55,7 +55,7 @@ pub fn touches(range: Range, other: Range) bool {
pub fn compare(lhs: Range, rhs: Range) std.math.Order {
assert(lhs.end >= lhs.start);
assert(rhs.end >= rhs.start);
return if (lhs.start > rhs.end) .gt else if (lhs.end < rhs.start) .lt else .eq;
return if (lhs.start >= rhs.end) .gt else if (lhs.end <= rhs.start) .lt else .eq;
}
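// Sketch of the new ordering semantics (assuming `start`/`end` are the only required fields and
// that `end` is exclusive, per the "exclusive upper bound" commit): abutting ranges now compare
// as ordered rather than as overlapping.
test "abutting ranges are ordered, not equal" {
    const a: Range = .{ .start = 0, .end = 4 };
    const b: Range = .{ .start = 4, .end = 8 };
    try std.testing.expectEqual(std.math.Order.lt, compare(a, b));
    try std.testing.expectEqual(std.math.Order.gt, compare(b, a));
}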
pub fn getStart(range: Range, T: type) T {

View File

@@ -32,8 +32,6 @@ const help =
const UnfinishedReadError = error{UnfinishedRead};
var patcher: Patcher = undefined;
pub fn main() !void {
// Parse arguments
var arg_index: u64 = 1; // Skip own name
@@ -52,10 +50,10 @@ pub fn main() !void {
}
// Initialize patcher
patcher = try Patcher.init(std.heap.page_allocator); // TODO: allocator
Patcher.init();
// Block the first 64k to avoid mmap_min_addr (EPERM) issues on Linux.
// TODO: read it from `/proc/sys/vm/mmap_min_addr` instead.
try patcher.address_allocator.block(patcher.gpa, .{ .start = 0, .end = 0x10000 }, 0);
try Patcher.address_allocator.block(Patcher.gpa, .{ .start = 0, .end = 0x10000 }, 0);
// Map file into memory
const file = try lookupFile(mem.sliceTo(std.os.argv[arg_index], 0));
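// Sketch for the mmap_min_addr TODO above: a container-scope helper that reads the real limit
// (the name `readMmapMinAddr` and the 0x10000 fallback are illustrative; this reuses the `std`,
// `mem`, and `posix` imports already present in this file):
fn readMmapMinAddr() u64 {
    const fallback: u64 = 0x10000;
    const fd = posix.open("/proc/sys/vm/mmap_min_addr", .{}, 0) catch return fallback;
    defer posix.close(fd);
    var buf: [32]u8 = undefined;
    const n = posix.read(fd, &buf) catch return fallback;
    const trimmed = mem.trim(u8, buf[0..n], " \n");
    return std.fmt.parseInt(u64, trimmed, 10) catch fallback;
}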
@@ -207,7 +205,7 @@ fn loadStaticElf(ehdr: elf.Header, file_reader: *std.fs.File.Reader) !usize {
const protections = elfToMmapProt(phdr.p_flags);
if (protections & posix.PROT.EXEC > 0) {
log.info("Patching executable segment", .{});
try patcher.patchRegion(ptr);
try Patcher.patchRegion(ptr);
}
try posix.mprotect(ptr, protections);
}

src/syscalls.zig (new file, 114 additions)
View File

@@ -0,0 +1,114 @@
const std = @import("std");
const linux = std.os.linux;
/// Represents the stack layout pushed by `syscall_entry` before calling the handler.
pub const UserRegs = extern struct {
padding: u64, // Result of `sub $8, %rsp` for alignment
rflags: u64,
rax: u64,
rbx: u64,
rcx: u64,
rdx: u64,
rsi: u64,
rdi: u64,
rbp: u64,
r8: u64,
r9: u64,
r10: u64,
r11: u64,
r12: u64,
r13: u64,
r14: u64,
r15: u64,
};
/// The main entry point for intercepted syscalls.
///
/// This function is called from `syscall_entry` with a pointer to the saved registers.
/// It effectively emulates the syscall instruction while allowing for interception.
export fn syscall_handler(regs: *UserRegs) void {
// TODO: Handle signals (masking) to prevent re-entrancy issues if we touch global state.
// TODO: Handle `clone` specially because the child thread wakes up with a fresh stack
// and cannot pop the registers we saved here.
const sys_nr = regs.rax;
const sys: linux.SYS = @enumFromInt(sys_nr);
const arg1 = regs.rdi;
const arg2 = regs.rsi;
const arg3 = regs.rdx;
const arg4 = regs.r10;
const arg5 = regs.r8;
const arg6 = regs.r9;
std.debug.print("Got syscall {s}\n", .{@tagName(sys)});
// For now, we just pass through everything.
// In the future, we will switch on `sys` to handle mmap, mprotect, etc.
const result = std.os.linux.syscall6(sys, arg1, arg2, arg3, arg4, arg5, arg6);
// Write result back to the saved RAX so it is restored to the application.
regs.rax = result;
}
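// Sketch of the future dispatch mentioned above (a hypothetical helper; the `.mmap` arm is only
// an illustration and still forwards to the real syscall):
fn dispatchSyscall(regs: *UserRegs) void {
    const sys: linux.SYS = @enumFromInt(regs.rax);
    regs.rax = switch (sys) {
        .mmap => blk: {
            // Example interception point: observe the requested length before forwarding.
            std.debug.print("mmap request of {d} bytes\n", .{regs.rsi});
            break :blk linux.syscall6(sys, regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9);
        },
        else => linux.syscall6(sys, regs.rdi, regs.rsi, regs.rdx, regs.r10, regs.r8, regs.r9),
    };
}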
/// Assembly trampoline that saves state and calls the Zig handler.
pub fn syscall_entry() callconv(.naked) void {
asm volatile (
\\ # Respect the Red Zone (128 bytes)
\\ sub $128, %rsp
\\
\\ # Save all GPRs that must be preserved or are arguments
\\ push %r15
\\ push %r14
\\ push %r13
\\ push %r12
\\ push %r11
\\ push %r10
\\ push %r9
\\ push %r8
\\ push %rbp
\\ push %rdi
\\ push %rsi
\\ push %rdx
\\ push %rcx
\\ push %rbx
\\ push %rax
\\ pushfq # Save Flags
\\
\\ # Align stack
\\ # Current pushes: 16 * 8 = 128 bytes.
\\ # Red zone sub: 128 bytes.
\\ # Trampoline call pushed ret addr: 8 bytes.
\\ # Total misalign: 8 bytes. We need 16-byte alignment for 'call'.
\\ sub $8, %rsp
\\
\\ # Pass pointer to regs (current rsp) as 1st argument (rdi) and call handler.
\\ mov %rsp, %rdi
\\ call syscall_handler
\\
\\ # Restore State
\\ add $8, %rsp
\\ popfq
\\ pop %rax
\\ pop %rbx
\\ pop %rcx
\\ pop %rdx
\\ pop %rsi
\\ pop %rdi
\\ pop %rbp
\\ pop %r8
\\ pop %r9
\\ pop %r10
\\ pop %r11
\\ pop %r12
\\ pop %r13
\\ pop %r14
\\ pop %r15
\\
\\ # Restore Red Zone and Return
\\ add $128, %rsp
\\ ret
:
// TODO: can we somehow use %[handler] in the assembly instead?
// Right now this is only here so that LTO does not discard the `syscall_handler` function.
: [handler] "i" (syscall_handler),
);
}