From 633c313513daaf43a962c191075f407b8da96aeb Mon Sep 17 00:00:00 2001 From: Pascal Zittlau Date: Mon, 9 Mar 2026 11:04:15 +0100 Subject: [PATCH] constraint solving --- build.zig | 2 +- src/AddressAllocator.zig | 1358 +++++++++++++++++++------- src/PatchLocationIterator.zig | 447 --------- src/Patcher.zig | 1695 ++++++++++++++++----------------- src/Range.zig | 49 +- src/Statistics.zig | 46 + src/backend.zig | 49 + src/loader.zig | 94 ++ src/main.zig | 180 ++-- src/relocation.zig | 98 ++ src/syscalls.zig | 11 +- 11 files changed, 2228 insertions(+), 1801 deletions(-) delete mode 100644 src/PatchLocationIterator.zig create mode 100644 src/Statistics.zig create mode 100644 src/backend.zig create mode 100644 src/loader.zig create mode 100644 src/relocation.zig diff --git a/build.zig b/build.zig index 152ecdf..5331dc9 100644 --- a/build.zig +++ b/build.zig @@ -51,7 +51,7 @@ pub fn build(b: *std.Build) !void { try compileTestApplications(b, target, optimize, false, true); try compileTestApplications(b, target, optimize, true, true); - const exe_tests = b.addTest(.{ .root_module = mod }); + const exe_tests = b.addTest(.{ .root_module = mod, .use_llvm = true }); const run_exe_tests = b.addRunArtifact(exe_tests); const test_step = b.step("test", "Run tests"); test_step.dependOn(b.getInstallStep()); diff --git a/src/AddressAllocator.zig b/src/AddressAllocator.zig index e879788..326468c 100644 --- a/src/AddressAllocator.zig +++ b/src/AddressAllocator.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const math = std.math; const mem = std.mem; const sort = std.sort; const testing = std.testing; @@ -12,431 +13,1116 @@ const AddressAllocator = @This(); /// The **sorted** list of `Range`s that are blocked. 
ranges: std.ArrayListUnmanaged(Range) = .empty, +child_allocator: mem.Allocator, -pub const empty = AddressAllocator{}; +// TODO: we should likely create an init function that blocks the entire negative address space +pub fn init(child_allocator: mem.Allocator) !AddressAllocator { + var aa: AddressAllocator = .{ .child_allocator = child_allocator }; -pub fn deinit(address_allocator: *AddressAllocator, gpa: mem.Allocator) void { - address_allocator.ranges.deinit(gpa); + const ranges = try child_allocator.alloc(Range, std.heap.pageSize() / @sizeOf(Range)); + aa.ranges = .initBuffer(ranges); + + aa.block(.fromSlice(Range, ranges)) catch unreachable; + + return aa; +} + +pub fn deinit(self: *AddressAllocator) void { + self.ranges.deinit(self.child_allocator); +} + +pub fn allocator(self: *AddressAllocator) mem.Allocator { + return .{ + .ptr = self, + .vtable = &.{ + .alloc = alloc, + .resize = resize, + .remap = remap, + .free = free, + }, + }; +} + +fn alloc(ctx: *anyopaque, n: usize, alignment: std.mem.Alignment, ra: usize) ?[*]u8 { + const self: *AddressAllocator = @ptrCast(@alignCast(ctx)); + + const ptr = self.child_allocator.rawAlloc(n, alignment, ra) orelse return null; + self.block(.fromPtr(ptr, n)) catch @panic("OOM"); + return ptr; +} + +fn resize( + ctx: *anyopaque, + buf: []u8, + alignment: std.mem.Alignment, + new_len: usize, + ret_addr: usize, +) bool { + const self: *AddressAllocator = @ptrCast(@alignCast(ctx)); + + const success = self.child_allocator.rawResize(buf, alignment, new_len, ret_addr); + if (success) { + self.block(.fromPtr(buf.ptr, new_len)) catch @panic("OOM"); + } + return success; +} + +fn remap( + context: *anyopaque, + memory: []u8, + alignment: std.mem.Alignment, + new_len: usize, + return_address: usize, +) ?[*]u8 { + const self: *AddressAllocator = @ptrCast(@alignCast(context)); + + const ptr = self.child_allocator.rawRemap(memory, alignment, new_len, return_address) orelse + return null; + + if (ptr != memory.ptr) { // new memory 
location + self.unblock(.fromSlice(u8, memory)) catch @panic("OOM"); + } + self.block(.fromPtr(ptr, new_len)) catch @panic("OOM"); + return ptr; +} + +fn free( + ctx: *anyopaque, + buf: []u8, + alignment: std.mem.Alignment, + ret_addr: usize, +) void { + const self: *AddressAllocator = @ptrCast(@alignCast(ctx)); + + self.unblock(.fromSlice(u8, buf)) catch @panic("OOM"); + return self.child_allocator.rawFree(buf, alignment, ret_addr); } /// Block a range to not be used by the `allocate` function. This function will always succeed, if /// there is enough memory available. -pub fn block( - address_allocator: *AddressAllocator, - gpa: mem.Allocator, - range: Range, - alignment: u64, -) !void { - assert(address_allocator.isSorted()); - defer assert(address_allocator.isSorted()); - - const aligned_range = if (alignment != 0) range.alignTo(alignment) else range; - assert(aligned_range.contains(range)); - if (aligned_range.size() == 0) return; +pub fn block(self: *AddressAllocator, range: Range) !void { + if (range.size() == 0) return; // Find the correct sorted position to insert the new range. const insert_idx = sort.lowerBound( Range, - address_allocator.ranges.items, - aligned_range, - Range.compare, + self.ranges.items, + range, + Range.compareTouching, ); log.debug( - "block: range: {f}, alignment: {}, aligned_range: {f}, insert_idx: {}", - .{ range, alignment, aligned_range, insert_idx }, + "block: range: {f}, insert_idx: {}", + .{ range, insert_idx }, ); - // If the new range is the greatest one OR if the entry at `insert_idx` is greater than the - // new range, we can just insert. - if (insert_idx == address_allocator.ranges.items.len or - address_allocator.ranges.items[insert_idx].compare(aligned_range) == .gt) + // If we don't overlap any existing one, we just insert. 
+ if (insert_idx == self.ranges.items.len or + self.ranges.items[insert_idx].compareTouching(range) == .gt) { - log.debug("block: New range inserted", .{}); - return address_allocator.ranges.insert(gpa, insert_idx, aligned_range); + return self.ranges.insert(self.child_allocator, insert_idx, range); } errdefer comptime unreachable; - assert(address_allocator.ranges.items.len > 0); + assert(self.ranges.items.len > 0); - // Now `insert_idx` points to the first entry, that touches `aligned_range`. - assert(address_allocator.ranges.items[insert_idx].touches(aligned_range)); - if (insert_idx > 1 and address_allocator.ranges.items.len > 1) { - assert(!address_allocator.ranges.items[insert_idx - 1].touches(aligned_range)); + // Now `insert_idx` points to the first entry, that touches `range`. + const first = &self.ranges.items[insert_idx]; + assert(first.touches(range)); + if (insert_idx > 0 and self.ranges.items.len > 0) { + assert(!self.ranges.items[insert_idx - 1].touches(range)); } - log.debug("block: `aligned_range` touches at least one existing range.", .{}); + log.debug("block: `range` touches at least one existing range.", .{}); - // NOTE: We merge entries that touch eachother to speedup future traversals. - // There are a few cases how to handle the merging: - // 1. `aligned_range` is contained by the existing range. Then we have to do nothing and can - // return early. - // 2. `aligned_range` contains the existing range. Then we have to overwrite `start` and `end`. - // 3. The existing range is before `aligned_range`. Set `existing.end` to `aligned_range.end`. - // 4. The existing range is after `aligned_range`. Set `existing.start` to `aligned.start`. - // After we have done this to the first range that touches, we will loop over the other ones - // that touch and just have to apply rule 4 repeatedly. 
- const first = &address_allocator.ranges.items[insert_idx]; - if (first.contains(aligned_range)) { - log.debug("block: Existing range at index {} contains new range. No-op", .{insert_idx}); + first.start = @min(first.start, range.start); + first.end = @max(first.end, range.end); + + // Merge any following overlapping ranges into this one. + // NOTE: We "iterate" through the slice by removing unneeded items and moving all following ones + // back by one. That's why we always look at `insert_idx + 1`. + while (insert_idx + 1 < self.ranges.items.len and + self.ranges.items[insert_idx + 1].touches(range)) + { + const neighbor = self.ranges.items[insert_idx + 1]; + assert(range.end >= neighbor.start); + assert(range.start <= neighbor.start); + first.end = @max(first.end, neighbor.end); + _ = self.ranges.orderedRemove(insert_idx + 1); + } +} + +pub fn unblock( + self: *AddressAllocator, + range: Range, +) !void { + + // Find the correct sorted position to remove the range. + var remove_idx = sort.lowerBound( + Range, + self.ranges.items, + range, + Range.compareOverlapping, + ); + log.debug( + "unblock: range: {f}, remove_idx: {}", + .{ range, remove_idx }, + ); + // If we don't overlap any existing one, we just return. 
+ if (remove_idx == self.ranges.items.len or + self.ranges.items[remove_idx].compareOverlapping(range) == .gt) + { + log.debug("unblock: Range to unblock overlaps nothing", .{}); + for (self.ranges.items) |r| { + assert(!r.overlaps(range)); + } return; - } else if (aligned_range.contains(first.*)) { - log.debug( - "block: New range contains existing range at index {}: {f} -> {f}", - .{ insert_idx, first, aligned_range }, - ); - first.* = aligned_range; - } else if (aligned_range.start <= first.end and aligned_range.end >= first.end) { - assert(aligned_range.start > first.start); - log.debug( - "block: Adjusting range end at index {}: {} -> {}", - .{ insert_idx, first.end, aligned_range.end }, - ); - first.*.end = aligned_range.end; - } else if (aligned_range.end >= first.start and aligned_range.start <= first.start) { - assert(aligned_range.end < first.end); - log.debug( - "block: Adjusting range start at index {}: {} -> {}", - .{ insert_idx, first.start, aligned_range.start }, - ); - first.*.start = aligned_range.start; + } + assert(self.ranges.items.len > 0); + + // Now `remove_idx` points to the first entry, that touches `range`. + const first = &self.ranges.items[remove_idx]; + assert(first.touches(range)); + if (remove_idx > 0 and self.ranges.items.len > 0) { + assert(!self.ranges.items[remove_idx - 1].overlaps(range)); + } + log.debug("unblock: `range` touches at least one existing range.", .{}); + + // We have multiple cases for the first touching range: + // + // [ range to unblock ] + // 0 [ first ] -> split + // + // [ range to unblock ] + // 1 [ first ] + // 1 [ first ] -> change start + // + // [ range to unblock ] + // 2 [ first ] + // 2 [ first ] + // 2 [ first ] -> remove + // + // [ range to unblock ] + // 3 [ first ] + // 3 [ first ] -> change end + // + // If it's cases 0 or 1 the operation is finished because we can't overlap another one. 
For cases 2 + // and 3 we will have to remove the following ranges until we arrive at one of the following cases: + // 1. + // [ range to unblock ] + // [ last ] + // 2. + // [ range to unblock ] + // [ last ] + // + if (first.start < range.start and first.end > range.end) { + const old_end = first.end; + first.end = range.start; + try self.ranges.insert(self.child_allocator, remove_idx + 1, .{ + .start = range.end, + .end = old_end, + }); + return; + } else if (first.start >= range.start and first.start < range.end and first.end > range.end) { + first.start = range.end; + return; + } else if (first.start >= range.start and first.end <= range.end) { + _ = self.ranges.orderedRemove(remove_idx); + } else if (first.start < range.start and first.end > range.start and first.end <= range.end) { + first.end = range.start; + remove_idx += 1; } else { unreachable; } - // TODO: comment why we do this - if (insert_idx >= address_allocator.ranges.items.len - 1) return; + // NOTE: We "iterate" through the slice by removing unneeded items and moving all following ones + // back by one. That's why we always look at `insert_idx + 1`. + while (remove_idx < self.ranges.items.len) { + const next_range = &self.ranges.items[remove_idx]; + if (next_range.start >= range.end) break; - var neighbor = &address_allocator.ranges.items[insert_idx + 1]; - var i: u64 = 0; - while (neighbor.touches(aligned_range)) { - assert(aligned_range.end >= neighbor.start); - assert(aligned_range.start <= neighbor.start); - - if (neighbor.end > first.end) { - log.debug( - "block: Merging neighbor range at index {}: {} -> {}.", - .{ insert_idx + 1, first.end, neighbor.end }, - ); - first.end = neighbor.end; - } - const removed = address_allocator.ranges.orderedRemove(insert_idx + 1); - log.debug("block: Removed merged range: {f}", .{removed}); - i += 1; - } - log.debug("block: Removed {} ranges.", .{i}); -} - -/// Allocate and block a `Range` of size `size` which will lie inside the given `valid_range`. 
If no -/// allocation of the given size is possible, return `null`. -pub fn allocate( - address_allocator: *AddressAllocator, - gpa: mem.Allocator, - size: u64, - valid_range: Range, -) !?Range { - const range = address_allocator.findAllocation(size, valid_range) orelse return null; - try address_allocator.block(gpa, range, 0); - return range; -} - -/// Find a free `Range` of size `size` within `valid_range` without blocking it. -pub fn findAllocation( - address_allocator: *AddressAllocator, - size: u64, - valid_range: Range, -) ?Range { - log.debug("findAllocation: Allocating size {} in range {f}", .{ size, valid_range }); - if (valid_range.size() < size) return null; - if (size == 0) return null; - const size_i: i64 = @intCast(size); - - const start_idx = sort.lowerBound( - Range, - address_allocator.ranges.items, - valid_range, - Range.compare, - ); - // `candidate_start` tracks the beginning of the current free region being examined. - var candidate_start = valid_range.start; - // If the range before the start index overlaps with our search start, we have to adjust. - if (start_idx > 0) { - const prev = address_allocator.ranges.items[start_idx - 1]; - if (prev.end > candidate_start) { - candidate_start = prev.end; - } - } - for (address_allocator.ranges.items[start_idx..]) |reserved| { - if (candidate_start >= valid_range.end) { - log.debug("findAllocation: Searched past the valid range.", .{}); + if (next_range.end <= range.end) { + _ = self.ranges.orderedRemove(remove_idx); + } else { + next_range.start = range.end; break; } + } +} - // The potential allocation gap is before the current reserved block. - if (candidate_start < reserved.start) { - // Determine the actual available portion of the gap within our search `range`. 
- const gap_end = @min(reserved.start, valid_range.end); - if (gap_end >= candidate_start + size_i) { - const new_range = Range{ - .start = candidate_start, - .end = candidate_start + size_i, - }; - log.debug("findAllocation: Found free gap: {f}", .{new_range}); - return new_range; +test "fuzz against bitset" { + const iterations = 64 * 1024; + const size = 1024; + + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); + + var bitset_ref = try std.bit_set.DynamicBitSetUnmanaged.initEmpty(testing.allocator, size); + defer bitset_ref.deinit(testing.allocator); + + var prng = std.Random.DefaultPrng.init(testing.random_seed); + const random = prng.random(); + + var expected_ranges = try std.ArrayListUnmanaged(Range).initCapacity(testing.allocator, size / 2); + defer expected_ranges.deinit(testing.allocator); + + var bitset_temp = try std.bit_set.DynamicBitSetUnmanaged.initEmpty(testing.allocator, size); + defer bitset_temp.deinit(testing.allocator); + + for (0..iterations) |_| { + const is_block = random.boolean(); + const start = random.intRangeLessThan(usize, 0, size); + const len = random.intRangeAtMost(usize, 1, size - start); + const end = start + len; + + const range = Range{ .start = @intCast(start), .end = @intCast(end) }; + + if (is_block) { + try aa.block(range); + bitset_ref.setRangeValue(.{ .start = start, .end = end }, true); + } else { + try aa.unblock(range); + bitset_ref.setRangeValue(.{ .start = start, .end = end }, false); + } + + bitset_temp.unsetAll(); + for (aa.ranges.items) |r| { + bitset_temp.setRangeValue(.{ .start = @intCast(r.start), .end = @intCast(r.end) }, true); + } + try testing.expect(bitset_ref.eql(bitset_temp)); + } +} + +/// An internal iterator that cleanly yields unblocked memory holes. 
+const HoleIterator = struct { + ranges: []const Range, + valid_range: Range, + size: i64, + candidate_start: i64, + idx: usize, + + fn init(aa: *const AddressAllocator, valid_range: Range, size: u64) HoleIterator { + const start_idx = sort.lowerBound( + Range, + aa.ranges.items, + valid_range, + Range.compareOverlapping, + ); + return .{ + .ranges = aa.ranges.items, + .valid_range = valid_range, + .size = @intCast(size), + .candidate_start = valid_range.start, + .idx = start_idx, + }; + } + + fn next(self: *HoleIterator) ?Range { + while (self.idx < self.ranges.len) { + const reserved = self.ranges[self.idx]; + if (self.candidate_start >= self.valid_range.end) return null; + + if (self.candidate_start < reserved.start) { + const hole_end = @min(reserved.start, self.valid_range.end); + const hole_start = self.candidate_start; + self.candidate_start = reserved.end; + + if (hole_end >= hole_start + self.size) { + return Range{ .start = hole_start, .end = hole_end }; + } + } else { + self.candidate_start = @max(self.candidate_start, reserved.end); + } + self.idx += 1; + } + + if (self.candidate_start < self.valid_range.end) { + const hole_start = self.candidate_start; + const hole_end = self.valid_range.end; + self.candidate_start = self.valid_range.end; // Mark done to prevent infinite loops + if (hole_end >= hole_start + self.size) { + return Range{ .start = hole_start, .end = hole_end }; } } - // The gap was not large enough. Move the candidate start past the current reserved block - // for the next iteration. - candidate_start = @max(candidate_start, reserved.end); + return null; } - // Check the remaining space at the end of the search range. 
- if (valid_range.end >= candidate_start + size_i) { - const new_range = Range{ - .start = candidate_start, - .end = candidate_start + size_i, - }; - log.debug("findAllocation: Found free gap at end: {f}", .{new_range}); - return new_range; + test { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); + + try aa.block(.{ .start = 100, .end = 200 }); + try aa.block(.{ .start = 300, .end = 400 }); + + var it = HoleIterator.init(&aa, .{ .start = 0, .end = 500 }, 10); + + try testing.expectEqual(Range{ .start = 0, .end = 100 }, it.next().?); + try testing.expectEqual(Range{ .start = 200, .end = 300 }, it.next().?); + try testing.expectEqual(Range{ .start = 400, .end = 500 }, it.next().?); + try testing.expectEqual(null, it.next()); + } +}; + +const Constraint = struct { + min_rel: i32, + max_rel: i32, + mask: u32, + pattern: u32, +}; + +/// Solves a single 32-bit relative jump constraint in O(1) time. +/// +/// Returns the smallest `rel32` such that +/// - `min_rel <= rel32 <= max_rel` and +/// - `(rel32 & mask) == pattern` +/// +/// Context: +/// During "Instruction Punning", we overwrite an instruction with a 5-byte jump (`E9 xx xx xx xx`). +/// If the original instruction is smaller than 5 bytes, our jump offset (`xx xx xx xx`) will spill +/// into the next instruction. To prevent crashing, the spilled bytes must form the successor +/// instruction. This restricts certain bits/bytes of our `rel32` offset to fixed values. +/// +/// The algorithm uses a bit-twiddling hack to isolate the "free" (unmasked) bits, increment them as +/// a single continuous integer, and map them back around the fixed "pattern" bits, completely +/// avoiding loops over the search space. 
+/// +/// Visualization of the bit-twiddling constraint logic: +/// ------------------------------------------------------------------------- +/// Mask: 1111 1111 0000 0000 1111 1111 0000 0000 (1 = Locked bits) +/// Pattern: 0000 0000 0000 0000 1110 1001 0000 0000 (The forced values) +/// Free: 0000 0000 1111 1111 0000 0000 1111 1111 (~Mask) +/// +/// Current Candidate: [ Fixed A ] [ Free 1 ] [ Fixed B ] [ Free 0 ] +/// +/// If `Current Candidate < min_rel`, we add 1 to the "Free" bits. +/// The hack `(((candidate & free) | mask) + 1) & free` allows the arithmetic carry to jump over the +/// fixed bits without corrupting them: +/// +/// Next Valid Val: [ Fixed A ][ Free 1 + carry ] [ Fixed B ] [ Free 0 + 1 ] +/// ------------------------------------------------------------------------- +fn solveRelativeConstraint(c: Constraint) ?i32 { + log.debug( + "solveRelative: min: {x}, max: {x}, mask: {x}, pattern: {x}", + .{ c.min_rel, c.max_rel, c.mask, c.pattern }, + ); + assert((c.pattern & ~c.mask) == 0); + if (c.min_rel > c.max_rel) return null; + + // Force the pattern onto the current minimum value + var candidate: u32 = (@as(u32, @bitCast(c.min_rel)) & ~c.mask) | c.pattern; + log.debug(" candidate (init): {x}", .{candidate}); + + // If forcing the pattern made the value smaller than min_rel, we must increment the "free" bits + // to find the next valid higher number. 
+ if (@as(i32, @bitCast(candidate)) < c.min_rel) { + if (~c.mask == 0) { + log.debug(" failed: fully constrained", .{}); + return null; + } + + const incremented_free = (((candidate & ~c.mask) | c.mask) +% 1) & ~c.mask; + assert(incremented_free & c.mask == 0); // All constrained bits are 0 + candidate = incremented_free | c.pattern; + log.debug(" candidate (incr): {x}", .{candidate}); } - log.debug("findAllocation: No suitable gap found.", .{}); + const result: i32 = @bitCast(candidate); + if (result >= c.min_rel and result <= c.max_rel) { + log.debug(" success: {x}", .{result}); + return result; + } + log.debug(" failed: result {x} out of bounds", .{result}); return null; } -fn isSorted(address_allocator: *const AddressAllocator) bool { - return sort.isSorted(Range, address_allocator.ranges.items, {}, isSortedInner); +test "solveRelativeConstraint basic" { + try testing.expectEqual(100, solveRelativeConstraint(.{ + .min_rel = 100, + .max_rel = 200, + .mask = 0, + .pattern = 0, + })); } -fn isSortedInner(_: void, lhs: Range, rhs: Range) bool { - return switch (lhs.compare(rhs)) { - .lt => true, - .gt => false, - .eq => unreachable, + +test "solveRelativeConstraint aligned" { + try testing.expectEqual(0x10E8, solveRelativeConstraint(.{ + .min_rel = 0x1000, + .max_rel = 0x2000, + .mask = 0xFF, + .pattern = 0xE8, + })); + try testing.expectEqual(0x10E8, solveRelativeConstraint(.{ + .min_rel = 0x10E8, + .max_rel = 0x2000, + .mask = 0xFF, + .pattern = 0xE8, + })); + try testing.expectEqual(0x11E8, solveRelativeConstraint(.{ + .min_rel = 0x10E9, + .max_rel = 0x2000, + .mask = 0xFF, + .pattern = 0xE8, + })); +} + +test "solveRelativeConstraint negative" { + try testing.expectEqual(@as(i32, @bitCast(@as(u32, 0xFFFFF0E8))), solveRelativeConstraint(.{ + .min_rel = -0x1000, + .max_rel = 0, + .mask = 0xFF, + .pattern = 0xE8, + })); +} + +test "solveRelativeConstraint impossible" { + try testing.expectEqual(null, solveRelativeConstraint(.{ + .min_rel = 0x1000, + .max_rel = 
0x10E7, + .mask = 0xFF, + .pattern = 0xE8, + })); + try testing.expectEqual(null, solveRelativeConstraint(.{ + .min_rel = 0x10000000, + .max_rel = 0x11000000, + .mask = 0xFFFFFFFF, + .pattern = 0x12345678, + })); +} + +test "solveRelativeConstraint overflow" { + try testing.expectEqual(0x12345678, solveRelativeConstraint(.{ + .min_rel = 0x10000000, + .max_rel = 0x20000000, + .mask = 0xFFFFFFFF, + .pattern = 0x12345678, + })); + + try testing.expectEqual(null, solveRelativeConstraint(.{ + .min_rel = 2147483640, + .max_rel = 2147483647, + .mask = 0xFF, + .pattern = 0x00, + })); +} + +pub const Request = struct { + source: u64, + size: u64, + valid_range: Range, + mask: u32 = 0, + pattern: u32 = 0, +}; + +/// Finds the first free range of `size` bytes within `valid_range` that also satisfies the relative +/// 32-bit jump constraints `mask` and `pattern` from `jump_source`. +/// Runs in `O(|H| + log(#R))` for +/// - `H` being the set of holes in the valid range and +/// - `#R` being the number of ranges in the AddressAllocator. 
+pub fn findAllocation( + self: *AddressAllocator, + r: Request, +) ?Range { + if (r.valid_range.size() < r.size) return null; + if (r.size == 0) return null; + + var it = HoleIterator.init(self, r.valid_range, r.size); + while (it.next()) |hole| { + log.debug("findAllocation: Hole: {f}", .{hole}); + const bounds = getRelativeBounds(hole, @intCast(r.size), r.source) orelse continue; + const rel32 = solveRelativeConstraint(.{ + .min_rel = bounds.min, + .max_rel = bounds.max, + .mask = r.mask, + .pattern = r.pattern, + }) orelse continue; + + const start = @as(i64, @intCast(r.source)) + rel32; + const end = start + @as(i64, @intCast(r.size)); + + assert(end - start == r.size); + assert(start >= r.valid_range.start); + assert(end <= r.valid_range.end); + return .{ .start = start, .end = end }; + } + + return null; +} + +fn getRelativeBounds(hole: Range, size: i64, source: u64) ?struct { min: i32, max: i32 } { + if (hole.end - hole.start < size) return null; + + const offset_to_min = hole.start - @as(i64, @intCast(source)); + const offset_to_max = (hole.end - size) - @as(i64, @intCast(source)); + + const min_rel = @max(offset_to_min, math.minInt(i32)); + const max_rel = @min(offset_to_max, math.maxInt(i32)); + if (min_rel > max_rel) return null; + + return .{ + .min = @intCast(min_rel), + .max = @intCast(max_rel), }; } -test "block basic" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findConstrainedAllocation" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); + try aa.block(.{ .start = 0x1000, .end = 0x2000 }); + try aa.block(.{ .start = 0x3000, .end = 0x4000 }); - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 
200, .end = 300 }, aa.ranges.items[1]); - try testing.expectEqual(2, aa.ranges.items.len); + try testing.expectEqual( + Range{ .start = 0x00AA, .end = 0x00BA }, + aa.findAllocation(.{ + .size = 0x10, + .valid_range = .{ .start = 0x0000, .end = 0x4000 }, + .source = 0, + .mask = 0xFF, + .pattern = 0xAA, + }), + ); + + try testing.expectEqual( + Range{ .start = 0x20AA, .end = 0x20BA }, + aa.findAllocation(.{ + .size = 0x10, + .valid_range = .{ .start = 0x1000, .end = 0x4000 }, + .source = 0, + .mask = 0xFF, + .pattern = 0xAA, + }), + ); + + try testing.expectEqual( + null, + aa.findAllocation(.{ + .size = 0x10, + .valid_range = .{ .start = 0x2000, .end = 0x8000 }, + .source = 0, + .mask = 0xFFFF, + .pattern = 0xAAAA, + }), + ); + + try testing.expectEqual( + Range{ .start = 0x40AA, .end = 0x50AA }, + aa.findAllocation(.{ + .size = 0x1000, + .valid_range = .{ .start = 0x2000, .end = 0x8000 }, + .source = 0, + .mask = 0xFF, + .pattern = 0xAA, + }), + ); } -test "block in hole" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +pub const CoupledResult = struct { + rel1: i32, + rel2: i32, +}; - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); +/// Attempts to find a joint bit-pattern that satisfies two overlapping jump constraints. +/// +/// Context: +/// In tactics like Successor Eviction, we overwrite two adjacent instructions with 5-byte jumps (J1 +/// and J2). If the distance between them is less than 5 bytes, their physical bytes overlap in +/// memory. +/// +/// `k` represents the physical distance (in bytes) between the start of J1 and J2 (1 <= k <= 4). +/// Because x86_64 uses Little-Endian representation, the Most Significant Bytes (MSB) of J1's +/// relative offset (`rel1`) physically overlap with the Least Significant Bytes (LSB) of J2's +/// relative offset (`rel2`). 
+/// +/// Furthermore, J2's opcode (`0xE9`) falls squarely inside the bytes of `rel1`. +/// +/// Memory Layout & Endianness Overlap (Example where K = 2): +/// ----------------------------------------------------------------------------------- +/// Memory Offset: +0 +1 +2 +3 +4 +5 +6 +/// J1 Bytes: [0xE9] [ X0 ] [ X1 ] [ X2 ] [ X3 ] +/// J2 Bytes: [0xE9] [ Y0 ] [ Y1 ] [ Y2 ] [ Y3 ] +/// +/// Consequences for `rel1` (X) and `rel2` (Y): +/// 1. Opcode Constraint: `X1` MUST exactly equal `0xE9`. +/// 2. Shared Bytes (S): `X2` MUST exactly equal `Y0`. +/// `X3` MUST exactly equal `Y1`. +/// ----------------------------------------------------------------------------------- +/// +/// Algorithm ("The Squeeze"): +/// Iterating possibly billions of combinations of X and Y is too slow. Instead, we use the +/// constraints of the memory layout: +/// +/// `rel1` is constrained to a physical memory hole `[min1, max1]`. Because memory holes are usually +/// small (e.g., 4KB), the Most Significant Bytes of `rel1` (which are exactly our Shared Bytes 'S') +/// are heavily restricted. +/// +/// There are usually only a few possible values for S: +/// 1. We extract the possible values for S from `min1..max1`. +/// 2. We apply S as a strict constraint on the lower bytes of `rel2`. +/// 3. We delegate the remaining independent bits (X0, Y2 and Y3) to the `solveRelativeConstraint`. +/// +/// Parameters: +/// `k`: The physical byte offset of J2 relative to J1 (1 <= k <= 4). +/// `min1`, `max1`: The valid rel32 hardware bounds for J1. +/// `min2`, `max2`: The valid rel32 hardware bounds for J2. +/// `mask1`, `pattern1`: The original byte constraints on J1. +/// `mask2`, `pattern2`: The original byte constraints on J2. 
+pub fn solveCoupledConstraint( + k: u8, + c1: Constraint, + c2: Constraint, +) ?CoupledResult { + log.debug("solveCoupled: k={}", .{k}); + log.debug(" C1: min={x} max={x} mask={x} pat={x}", .{ c1.min_rel, c1.max_rel, c1.mask, c1.pattern }); + log.debug(" C2: min={x} max={x} mask={x} pat={x}", .{ c2.min_rel, c2.max_rel, c2.mask, c2.pattern }); + assert(k >= 1); + assert(k <= 4); - try aa.block(testing.allocator, .{ .start = 400, .end = 500 }, 0); - try testing.expectEqual(2, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 400, .end = 500 }, aa.ranges.items[1]); + // The opcode for J2 (0xE9) physically falls inside rel32 of J1 at byte index `k - 1` of rel1. + const e9_shift = @as(u5, @intCast(k - 1)) * 8; + const e9_mask = @as(u32, 0xFF) << e9_shift; - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); - try testing.expectEqual(3, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 200, .end = 300 }, aa.ranges.items[1]); - try testing.expectEqual(Range{ .start = 400, .end = 500 }, aa.ranges.items[2]); + if ((c1.mask & e9_mask) != 0 and (c1.pattern & e9_mask) != (@as(u32, 0xE9) << e9_shift)) { + log.debug(" failed: opcode 0xE9 conflict in C1", .{}); + return null; // Caller's pattern conflicts with the mandatory J2 opcode + } + const c_mask1 = c1.mask | e9_mask; + const c_pattern1 = (c1.pattern & ~e9_mask) | (@as(u32, 0xE9) << e9_shift); + + if (k == 4) { + // J1 is completely resolved just with the 0xE9 constraint applied above. 
+ log.debug(" Fast path K=4", .{}); + const rel1 = solveRelativeConstraint(.{ + .min_rel = c1.min_rel, + .max_rel = c1.max_rel, + .mask = c_mask1, + .pattern = c_pattern1, + }) orelse return null; + const rel2 = solveRelativeConstraint(.{ + .min_rel = c2.min_rel, + .max_rel = c2.max_rel, + .mask = c2.mask, + .pattern = c2.pattern, + }) orelse return null; + return .{ .rel1 = rel1, .rel2 = rel2 }; + } + + // Determine the bitwise shift and mask for the Shared Bytes (S) + const s_shift = @as(u5, @intCast(k)) * 8; + const num_shared = @as(u5, @intCast(4 - k)); + const s_mask = (@as(u32, 1) << (num_shared * 8)) - 1; + + log.debug(" Shared Bytes: shift={}, mask={x}", .{ s_shift, s_mask }); + + var current_min = c1.min_rel; + while (current_min <= c1.max_rel) { + const u_rel: u32 = @bitCast(current_min); + const S = u_rel >> s_shift; // Extract shared bytes from top of rel1 + + // Calculate the maximum u32 value that shares this S + const max_u_rel_for_S = (S << s_shift) | ((@as(u32, 1) << s_shift) - 1); + const max_i_rel_for_S: i32 = @bitCast(max_u_rel_for_S); + const local_max1 = @min(c1.max_rel, max_i_rel_for_S); + + // Does this S conflict with J2's requirements? + if ((c2.mask & s_mask) != 0) { + if ((c2.pattern & c2.mask & s_mask) != (S & c2.mask & s_mask)) { + // Advance to the next block of S. 
+ log.debug(" Conflict at S={x} (min={x})", .{ S, current_min });
+ if (max_i_rel_for_S == std.math.maxInt(i32)) break;
+ const next_min = max_i_rel_for_S + 1;
+ if (next_min > c1.max_rel) break;
+ current_min = next_min;
+ continue;
+ }
+ }
+
+ log.debug(" Trying S={x} range [{x}, {x}]", .{ S, current_min, local_max1 });
+
+ // Apply S as a strict constraint on the lowest bytes of J2
+ const c_mask2 = c2.mask | s_mask;
+ const c_pattern2 = (c2.pattern & ~s_mask) | S;
+
+ // O(1) solver execution for this specific S value
+ const opt_rel1 = solveRelativeConstraint(.{
+ .min_rel = current_min,
+ .max_rel = local_max1,
+ .mask = c_mask1,
+ .pattern = c_pattern1,
+ });
+ const opt_rel2 = solveRelativeConstraint(.{
+ .min_rel = c2.min_rel,
+ .max_rel = c2.max_rel,
+ .mask = c_mask2,
+ .pattern = c_pattern2,
+ });
+ if (opt_rel1 != null and opt_rel2 != null) {
+ log.debug(" Success: rel1={x} rel2={x}", .{ opt_rel1.?, opt_rel2.? });
+ return .{ .rel1 = opt_rel1.?, .rel2 = opt_rel2.? };
+ }
+
+ if (max_i_rel_for_S == std.math.maxInt(i32)) break;
+ const next_min = max_i_rel_for_S + 1;
+ if (next_min > c1.max_rel) break;
+ current_min = next_min;
+ }
+
+ log.debug(" failed: no coupled solution found", .{});
+ return null;
}

-test "block touch with previous" {
- var aa = AddressAllocator{};
- defer aa.deinit(testing.allocator);
+test "solveCoupledConstraint K=4 (Independent)" {
+ // If K=4, J1 and J2 don't share rel32 bytes, but byte 3 of rel1 MUST be 0xE9 (the J2 opcode).
+ // Let's force rel1 to be in [0x12000000, 0x120000FF].
+ // Since highest byte (byte 3) must be 0xE9, no value starting with 0x12 will work.
+ try testing.expectEqual(null, solveCoupledConstraint( + 4, + .{ + .min_rel = 0x12000000, + .max_rel = 0x120000FF, + .mask = 0, + .pattern = 0, + }, + .{ + .min_rel = 0, + .max_rel = 100, + .mask = 0, + .pattern = 0, + }, + )); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 100, .end = 200 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 200 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); - - try aa.block(testing.allocator, .{ .start = 100, .end = 300 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 300 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); - - try aa.block(testing.allocator, .{ .start = 300, .end = 400 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 400 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); + const res = solveCoupledConstraint( + 4, + .{ + .min_rel = @bitCast(@as(u32, 0xE8000000)), + .max_rel = @bitCast(@as(u32, 0xEA000000)), + .mask = 0, + .pattern = 0, + }, + .{ + .min_rel = 0x1234, + .max_rel = 0x1234, + .mask = 0, + .pattern = 0, + }, + ); + try testing.expect(res != null); + try testing.expectEqual(@as(i32, @bitCast(@as(u32, 0xE9000000))), res.?.rel1); + try testing.expectEqual(0x1234, res.?.rel2); } -test "block touch with following" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); - - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); - try aa.block(testing.allocator, .{ .start = 100, .end = 200 }, 0); - try testing.expectEqual(Range{ .start = 100, .end = 300 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); - - try aa.block(testing.allocator, .{ .start = 0, .end = 200 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 300 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); - - try aa.block(testing.allocator, .{ .start = -100, .end = 0 }, 0); - try 
testing.expectEqual(Range{ .start = -100, .end = 300 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); +test "solveCoupledConstraint K=2 (2 byte overlap)" { + // K=2 means the top 2 bytes of rel1 are the bottom 2 bytes of rel2. + // J2 opcode (0xE9) sits at byte 1 of rel1. + const res = solveCoupledConstraint( + 2, + .{ + .min_rel = 0x12340000, + .max_rel = 0x1234FFFF, + .mask = 0, + .pattern = 0, + }, + .{ + .min_rel = 0x00000000, + .max_rel = 0x0000FFFF, + .mask = 0, + .pattern = 0, + }, + ); + try testing.expect(res != null); + try testing.expectEqual(0x1234E900, res.?.rel1); + try testing.expectEqual(0x00001234, res.?.rel2); } -test "block overlap with previous and following" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); - - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 200, .end = 300 }, aa.ranges.items[1]); - try testing.expectEqual(2, aa.ranges.items.len); - - try aa.block(testing.allocator, .{ .start = 50, .end = 250 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 300 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); +test "solveCoupledConstraint K=2 conflict" { + // Same as above, but J2 explicitly forbids lower bytes from being 0x1234. 
+ const res = solveCoupledConstraint(
+ 2,
+ .{
+ .min_rel = 0x12340000,
+ .max_rel = 0x1234FFFF,
+ .mask = 0,
+ .pattern = 0,
+ },
+ .{
+ .min_rel = 0x00000000,
+ .max_rel = 0x0000FFFF,
+ .mask = 0x0000FFFF,
+ .pattern = 0x00005678,
+ },
+ );
+ try testing.expectEqual(null, res);
}

-test "block contained by existing" {
- var aa = AddressAllocator{};
- defer aa.deinit(testing.allocator);
-
- try aa.block(testing.allocator, .{ .start = 100, .end = 300 }, 0);
- try aa.block(testing.allocator, .{ .start = 200, .end = 250 }, 0);
- try testing.expectEqual(Range{ .start = 100, .end = 300 }, aa.ranges.items[0]);
- try testing.expectEqual(1, aa.ranges.items.len);
+test "solveCoupledConstraint K=2 spans multiple S values" {
+ // We give J1 a wide range: [0x00000000, 0x00060000]. S can be 0 to 6.
+ // We force J2 to require lower bytes = 0x0004. This forces the solver to skip S=0 and similar
+ // and find S=4.
+ const res = solveCoupledConstraint(
+ 2,
+ .{
+ .min_rel = 0,
+ .max_rel = 0x00060000,
+ .mask = 0,
+ .pattern = 0,
+ },
+ .{
+ .min_rel = 0,
+ .max_rel = 0x0000FFFF,
+ .mask = 0x0000FFFF,
+ .pattern = 0x00000004,
+ },
+ );
+ try testing.expect(res != null);
+ try testing.expectEqual(0x0004E900, res.?.rel1);
+ try testing.expectEqual(0x00000004, res.?.rel2);
}

-test "block contains existing" {
- var aa = AddressAllocator{};
- defer aa.deinit(testing.allocator);
+/// Finds two allocations that simultaneously satisfy their individual offset constraints and the
+/// physical overlap constraints of their origin instructions.
+/// `r1` (for J1) and `r2` (for J2) separated by `k` bytes.
+///
+/// Runs in O(|H1| * |H2| + log(#R)) for
+/// - `H1` and `H2` being the set of holes in the valid ranges in `r1` and `r2`
+/// - `#R` being the number of ranges in the AddressAllocator.
+pub fn findCoupledAllocation( + self: *AddressAllocator, + k: u8, + r1: Request, + r2: Request, +) ?[2]Range { + if (r1.valid_range.size() < r1.size or r1.size == 0) return null; + if (r2.valid_range.size() < r2.size or r2.size == 0) return null; + assert(r2.source > r1.source); + assert(r2.source - r1.source == k); - try aa.block(testing.allocator, .{ .start = 50, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 0, .end = 200 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 200 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); + var it1 = HoleIterator.init(self, r1.valid_range, r1.size); + while (it1.next()) |hole1| { + log.debug("findCoupledAllocation: Hole1: {f}", .{hole1}); + const b1 = getRelativeBounds(hole1, @intCast(r1.size), r1.source) orelse continue; + + var it2 = HoleIterator.init(self, r2.valid_range, r2.size); + while (it2.next()) |hole2| { + log.debug(" Hole2: {f}", .{hole2}); + const b2 = getRelativeBounds(hole2, @intCast(r2.size), r2.source) orelse continue; + + const c1 = Constraint{ + .min_rel = b1.min, + .max_rel = b1.max, + .mask = r1.mask, + .pattern = r1.pattern, + }; + const c2 = Constraint{ + .min_rel = b2.min, + .max_rel = b2.max, + .mask = r2.mask, + .pattern = r2.pattern, + }; + + if (solveCoupledConstraint(k, c1, c2)) |result| { + const start1 = @as(i64, @intCast(r1.source)) + result.rel1; + const end1 = start1 + @as(i64, @intCast(r1.size)); + + const start2 = @as(i64, @intCast(r2.source)) + result.rel2; + const end2 = start2 + @as(i64, @intCast(r2.size)); + + assert(end1 - start1 == r1.size); + assert(end2 - start2 == r2.size); + + // If we used the same hole, we must ensure the actual allocations don't overlap. + const range1 = Range{ .start = start1, .end = end1 }; + const range2 = Range{ .start = start2, .end = end2 }; + // TODO: Support allocating both trampolines in the exact same memory hole. 
+ // This requires dynamically partitioning the hole so the trampolines don't overlap + // each other. For now, simply skip this case. + if (range1.overlaps(range2)) continue; + + return [2]Range{ + .{ .start = start1, .end = end1 }, + .{ .start = start2, .end = end2 }, + }; + } + } + } + + return null; } -test "block overlaps multiple" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +/// A generic helper to mechanically verify that a coupled allocation satisfies all bitwise and +/// physical overlap constraints. +fn verifyCoupled(k: u8, r1: Request, r2: Request, j1_range: Range, j2_range: Range) !void { + const rel1: i32 = @intCast(j1_range.start - @as(i64, @intCast(r1.source))); + const rel2: i32 = @intCast(j2_range.start - @as(i64, @intCast(r2.source))); + const u_rel1: u32 = @bitCast(rel1); + const u_rel2: u32 = @bitCast(rel2); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 150, .end = 200 }, 0); - try aa.block(testing.allocator, .{ .start = 250, .end = 300 }, 0); - try aa.block(testing.allocator, .{ .start = 350, .end = 400 }, 0); - try aa.block(testing.allocator, .{ .start = 450, .end = 500 }, 0); - try testing.expectEqual(5, aa.ranges.items.len); + // Opcode Constraint + const e9_shift = @as(u5, @intCast(k - 1)) * 8; + try testing.expectEqual(@as(u32, 0xE9), (u_rel1 >> e9_shift) & 0xFF); - try aa.block(testing.allocator, .{ .start = 50, .end = 475 }, 0); - try testing.expectEqual(Range{ .start = 0, .end = 500 }, aa.ranges.items[0]); - try testing.expectEqual(1, aa.ranges.items.len); + // Shared Bytes Constraint + if (k < 4) { + const shared_shift = @as(u5, @intCast(k)) * 8; + const shared_mask = (@as(u32, 1) << (@as(u5, @intCast(4 - k)) * 8)) - 1; + const shared1 = (u_rel1 >> shared_shift) & shared_mask; + const shared2 = u_rel2 & shared_mask; + try testing.expectEqual(shared1, shared2); + } + + // Original User Constraints + try testing.expectEqual(r1.pattern, u_rel1 & 
r1.mask); + try testing.expectEqual(r2.pattern, u_rel2 & r2.mask); } -test "allocate in empty allocator" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - const search_range = Range{ .start = 0, .end = 1000 }; - const allocated = try aa.allocate(testing.allocator, 100, search_range); - try testing.expectEqual(1, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, allocated); + // Block memory so we have distinct holes. + // We need a hole that allows `rel1` to have `0xE9` in its second byte. + // This means `rel1` needs to be around `0xE900`. + try aa.block(.{ .start = 0x2000, .end = 0xE000 }); + try aa.block(.{ .start = 0xF000, .end = 0x10000 }); + + const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x20000 } }; + const r2 = Request{ .source = 2, .size = 10, .valid_range = .{ .start = 0, .end = 0x20000 } }; + const res = aa.findCoupledAllocation(2, r1, r2); + try testing.expect(res != null); + + const j1_range = res.?[0]; + const j2_range = res.?[1]; + try testing.expect(j1_range.start >= 0xE000 and j1_range.end <= 0xF000); + try testing.expect(j2_range.start >= 0x0000 and j2_range.end <= 0x2000); + + try verifyCoupled(2, r1, r2, j1_range, j2_range); } -test "allocate with no space" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation K=1 (3 shared bytes)" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - const range = Range{ .start = 0, .end = 1000 }; - try aa.block(testing.allocator, range, 0); - const allocated = try aa.allocate(testing.allocator, 100, range); - try testing.expect(allocated == null); + try aa.block(.{ .start = 0x2000, .end = 0x01000000 }); + + const r1 = Request{ .source = 
0, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } }; + const r2 = Request{ .source = 1, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } }; + const res = aa.findCoupledAllocation(1, r1, r2); + try testing.expect(res != null); + + // For K=1, rel1's lowest byte MUST be 0xE9. + // In Hole 1, the smallest valid rel1 is 0x000000E9. + // This makes the shared bytes (top 3 bytes) 0x000000. + try testing.expectEqual(0xE9, res.?[0].start); + try testing.expectEqual(0x01, res.?[1].start); + + try verifyCoupled(1, r1, r2, res.?[0], res.?[1]); } -test "allocate in a gap" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation K=3 (1 shared byte)" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); + // K=3 means rel1 byte 2 MUST be 0xE9. rel1 looks like 0xXXE9XXXX. + // Smallest positive is ~0x00E90000. We need a hole there. 
+ try aa.block(.{ .start = 0x2000, .end = 0x00E90000 }); - const search_range = Range{ .start = 0, .end = 1000 }; - const allocated = try aa.allocate(testing.allocator, 50, search_range); - try testing.expectEqual(Range{ .start = 100, .end = 150 }, allocated); - try testing.expectEqual(2, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 150 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 200, .end = 300 }, aa.ranges.items[1]); + const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } }; + const r2 = Request{ .source = 3, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } }; + const res = aa.findCoupledAllocation(3, r1, r2); + try testing.expect(res != null); + try verifyCoupled(3, r1, r2, res.?[0], res.?[1]); } -test "allocate at the end" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation K=4 (Independent)" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); + try aa.block(.{ .start = 0x2000, .end = 0x01000000 }); - const search_range = Range{ .start = 0, .end = 1000 }; - const allocated = try aa.allocate(testing.allocator, 200, search_range); - try testing.expectEqual(Range{ .start = 100, .end = 300 }, allocated); - try testing.expectEqual(1, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 300 }, aa.ranges.items[0]); + const r1 = Request{ + .source = 0x50000000, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x60000000 }, + }; + const r2 = Request{ + .source = 0x50000004, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x60000000 }, + }; + + const res = aa.findCoupledAllocation(4, r1, r2); + try testing.expect(res != null); + try verifyCoupled(4, r1, r2, res.?[0], res.?[1]); } -test "allocate within specific search range" { - var aa = AddressAllocator{}; - defer 
aa.deinit(testing.allocator); +test "findCoupledAllocation Negative Jumps (Both Backwards)" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 400, .end = 500 }, 0); + // We block everything except two specific holes far behind the jump source. + try aa.block(.{ .start = 0, .end = 0x10000000 }); + try aa.block(.{ .start = 0x10010000, .end = 0x20000000 }); + try aa.block(.{ .start = 0x20010000, .end = 0x60000000 }); - // Search range starts after first block and has a gap - const search_range = Range{ .start = 200, .end = 400 }; - const allocated = try aa.allocate(testing.allocator, 100, search_range); - try testing.expectEqual(Range{ .start = 200, .end = 300 }, allocated); - try testing.expectEqual(3, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 100 }, aa.ranges.items[0]); - try testing.expectEqual(Range{ .start = 400, .end = 500 }, aa.ranges.items[2]); - try testing.expectEqual(Range{ .start = 200, .end = 300 }, aa.ranges.items[1]); + const r1 = Request{ + .source = 0x50000000, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x60000000 }, + }; + const r2 = Request{ + .source = 0x50000002, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x60000000 }, + }; + + // The math solver natively handles the two's complement wraparound. 
+ const res = aa.findCoupledAllocation(2, r1, r2); + try testing.expect(res != null); + try verifyCoupled(2, r1, r2, res.?[0], res.?[1]); } -test "allocate exact gap size" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation with Mask/Pattern Constraints" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); + try aa.block(.{ .start = 0, .end = 0x10000 }); + try aa.block(.{ .start = 0x20000, .end = 0x44440000 }); + try aa.block(.{ .start = 0x44450000, .end = 0x80000000 }); - const search_range = Range{ .start = 0, .end = 1000 }; - const allocated = try aa.allocate(testing.allocator, 100, search_range); - try testing.expectEqual(Range{ .start = 100, .end = 200 }, allocated); - try testing.expectEqual(1, aa.ranges.items.len); - try testing.expectEqual(Range{ .start = 0, .end = 300 }, aa.ranges.items[0]); + // K=2. We force the shared bytes to be exactly 0x4444. 
+ const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x80000000 } }; + const r2 = Request{ + .source = 2, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x80000000 }, + .mask = 0x0000FFFF, + .pattern = 0x00004444, + }; + + const res = aa.findCoupledAllocation(2, r1, r2); + try testing.expect(res != null); + try verifyCoupled(2, r1, r2, res.?[0], res.?[1]); + + // Explicitly verify the constraint was propagated to J1 + const rel1: i32 = @intCast(res.?[0].start); + const u_rel1: u32 = @bitCast(rel1); + try testing.expectEqual(@as(u32, 0x4444), (u_rel1 >> 16) & 0xFFFF); } -test "allocate fails when too large" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); +test "findCoupledAllocation Fails on Math Impossibility" { + var aa = AddressAllocator{ .child_allocator = testing.allocator }; + defer aa.deinit(); - try aa.block(testing.allocator, .{ .start = 0, .end = 100 }, 0); - try aa.block(testing.allocator, .{ .start = 200, .end = 300 }, 0); + const r1 = Request{ + .source = 0, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x80000000 }, + .mask = 0xFFFF0000, + .pattern = 0x11110000, + }; + const r2 = Request{ + .source = 2, + .size = 10, + .valid_range = .{ .start = 0, .end = 0x80000000 }, + .mask = 0x0000FFFF, + .pattern = 0x00002222, + }; - const search_range = Range{ .start = 0, .end = 400 }; - const allocated = try aa.allocate(testing.allocator, 101, search_range); - try std.testing.expect(allocated == null); -} - -test "allocate with zero size" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); - - const search_range = Range{ .start = 0, .end = 1000 }; - const allocated = try aa.allocate(testing.allocator, 0, search_range); - try std.testing.expect(allocated == null); -} - -test "allocate with size bigger than range" { - var aa = AddressAllocator{}; - defer aa.deinit(testing.allocator); - - const search_range = Range{ .start = 0, .end = 100 }; - const allocated = try 
aa.allocate(testing.allocator, 1000, search_range); - try std.testing.expect(allocated == null); + const res = aa.findCoupledAllocation(2, r1, r2); + try testing.expectEqual(null, res); } diff --git a/src/PatchLocationIterator.zig b/src/PatchLocationIterator.zig deleted file mode 100644 index 0dca4cb..0000000 --- a/src/PatchLocationIterator.zig +++ /dev/null @@ -1,447 +0,0 @@ -//! Iterates through all possible valid address ranges for a `jmp rel33` instruction based on a -//! 4-byte pattern of "free" and "used" bytes. -//! -//! This is the core utility for implementing E9Patch-style instruction punning (B2) and padded -//! jumps (T1). -const std = @import("std"); -const testing = std.testing; -const assert = std.debug.assert; - -const log = std.log.scoped(.patch_location_iterator); - -const Range = @import("Range.zig"); - -/// Represents a single byte in the 4-byte `rel32` offset pattern. -pub const PatchByte = union(enum) { - /// This byte can be any value (0x00-0xFF). - free: void, - /// This byte is constrained to a specific value. - used: u8, - - pub fn format(self: @This(), writer: *std.Io.Writer) std.Io.Writer.Error!void { - switch (self) { - .free => try writer.print("free", .{}), - .used => |val| try writer.print("used({x})", .{val}), - } - } -}; - -const patch_size = 4; -const PatchInt = std.meta.Int(.signed, patch_size * 8); -const PatchLocationIterator = @This(); -/// The base address (e.g., RIP of the *next* instruction) that the 32-bit relative offset is -/// calculated from. -offset: i64, -/// The 4-byte little-endian pattern of `used` and `free` bytes that constrain the `rel32` offset. -patch_bytes: [patch_size]PatchByte, -/// Internal state: the byte-level representation of the *start* of the current `rel32` offset being -/// iterated. -start: [patch_size]u8, -/// Internal state: the byte-level representation of the *end* of the current `rel32` offset being -/// iterated. 
-end: [patch_size]u8, -/// Internal state: flag to handle the first call to `next()` uniquely. -first: bool, -/// Internal state: optimization cache for the number of contiguous `.free` bytes at the *end* of -/// `patch_bytes`. -trailing_free_count: u8, - -/// Initializes the iterator. -/// - `patch_bytes`: The 4-byte pattern of the `rel32` offset, in little-endian order. -/// The base address (e.g., RIP of the *next* instruction) that the 32-bit relative offset is -/// calculated from. -pub fn init(patch_bytes: [patch_size]PatchByte, addr: u64) PatchLocationIterator { - log.debug("hi", .{}); - assert(patch_bytes.len == patch_size); - - // Find the number of contiguous free bytes at the end of the pattern. - var trailing_free: u8 = 0; - for (0..patch_bytes.len) |i| { - if (patch_bytes[i] == .free) { - trailing_free += 1; - } else { - break; - } - } - - var start = std.mem.zeroes([patch_size]u8); - var end = std.mem.zeroes([patch_size]u8); - for (patch_bytes, 0..) |byte, i| { - switch (byte) { - .free => { - start[i] = 0; - end[i] = if (i < trailing_free) 0xff else 0; - }, - .used => |val| { - start[i] = val; - end[i] = val; - }, - } - } - - const out = PatchLocationIterator{ - .offset = @intCast(addr), - .patch_bytes = patch_bytes, - .trailing_free_count = trailing_free, - .start = start, - .end = end, - .first = true, - }; - log.debug("init: {f}", .{out}); - return out; -} - -/// Returns the next valid `Range` of target addresses, or `null` if the iteration is complete. -pub fn next(self: *PatchLocationIterator) ?Range { - // If all bytes are free we can just return the maximum range. 
- if (self.trailing_free_count == patch_size) { - defer self.first = false; - if (self.first) { - var range = Range{ - .start = self.offset + std.math.minInt(i32), - .end = self.offset + std.math.maxInt(i32), - }; - // Clamp to valid positive address space - if (range.start < 0) range.start = 0; - if (range.end <= 0) { - log.info("next: All bytes free, but range entirely negative.", .{}); - return null; - } - - log.debug("next: All bytes free, returning full range: {f}", .{range}); - return range; - } else { - log.info("next: All bytes free, iteration finished.", .{}); - return null; - } - } - - while (true) { - var range: Range = undefined; - - if (self.first) { - self.first = false; - const start = std.mem.readInt(PatchInt, self.start[0..], .little); - const end = std.mem.readInt(PatchInt, self.end[0..], .little); - range = Range{ - .start = start + self.offset, - .end = end + self.offset, - }; - } else { - var overflow: u1 = 1; - for (self.patch_bytes, 0..) |byte, i| { - if (i < self.trailing_free_count or byte == .used) { - continue; - } - assert(byte == .free); - assert(self.start[i] == self.end[i]); - defer assert(self.start[i] == self.end[i]); - - if (overflow == 1) { - if (self.start[i] == std.math.maxInt(u8)) { - self.start[i] = 0; - self.end[i] = 0; - } else { - self.start[i] += 1; - self.end[i] += 1; - overflow = 0; - } - } - } - if (overflow == 1) { - log.info("next: Iteration finished, no more ranges.", .{}); - return null; - } - - const start = std.mem.readInt(PatchInt, self.start[0..], .little); - const end = std.mem.readInt(PatchInt, self.end[0..], .little); - assert(end >= start); - range = Range{ - .start = start + self.offset, - .end = end + self.offset, - }; - } - - // Filter out ranges that are entirely negative (invalid memory addresses). - if (range.end <= 0) continue; - // Clamp ranges that start negative but end positive. 
- if (range.start < 0) range.start = 0; - - log.debug("next: new range: {f}", .{range}); - return range; - } -} - -pub fn format(self: PatchLocationIterator, writer: *std.Io.Writer) std.Io.Writer.Error!void { - try writer.print(".{{ ", .{}); - try writer.print(".offset = {x}, ", .{self.offset}); - try writer.print( - ".patch_bytes = .{{ {f}, {f}, {f}, {f} }}, ", - .{ self.patch_bytes[0], self.patch_bytes[1], self.patch_bytes[2], self.patch_bytes[3] }, - ); - try writer.print( - ".start: 0x{x}, .end: 0x{x}, first: {}, trailing_free_count: {}", - .{ self.start, self.end, self.first, self.trailing_free_count }, - ); -} - -test "free bytes" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0, .end = std.math.maxInt(i32) }, - it.next().?, - ); - try testing.expectEqual(null, it.next()); -} - -test "predetermined negative" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - .{ .used = 0xe9 }, - }; - var it = PatchLocationIterator.init(pattern, 0); - try testing.expectEqual(null, it.next()); -} - -test "trailing free bytes" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - .{ .used = 0x79 }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0x79000000, .end = 0x79ffffff }, - it.next().?, - ); - try testing.expectEqual(null, it.next()); -} - -test "inner and trailing free bytes" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .used = 0xe8 }, - .{ .free = {} }, - .{ .used = 0x79 }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0x7900e800, .end = 0x7900e8ff }, - it.next().?, - ); - try testing.expectEqual( - Range{ .start = 0x7901e800, .end = 0x7901e8ff }, - it.next().?, - ); - - // Skip to the last range - var 
r_last: ?Range = null; - var count: u32 = 2; // We already consumed two - while (it.next()) |r| { - r_last = r; - count += 1; - } - try testing.expectEqual( - Range{ .start = 0x79ffe800, .end = 0x79ffe8ff }, - r_last, - ); - try testing.expectEqual(256, count); -} - -test "no free bytes" { - const pattern = [_]PatchByte{ - .{ .used = 0xe9 }, - .{ .used = 0x00 }, - .{ .used = 0x00 }, - .{ .used = 0x78 }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0x780000e9, .end = 0x780000e9 }, - it.next().?, - ); - try testing.expectEqual(null, it.next()); -} - -test "inner and leading free bytes" { - const pattern = [_]PatchByte{ - .{ .used = 0xe9 }, - .{ .free = {} }, - .{ .used = 0xe8 }, - .{ .free = {} }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0x00e800e9, .end = 0x00e800e9 }, - it.next().?, - ); - try testing.expectEqual( - Range{ .start = 0x00e801e9, .end = 0x00e801e9 }, - it.next().?, - ); - - // Skip to the last range - var r_last: ?Range = null; - var count: u32 = 2; // We already consumed two - while (it.next()) |r| { - r_last = r; - count += 1; - } - try testing.expectEqual( - Range{ .start = 0x7fe8ffe9, .end = 0x7fe8ffe9 }, - r_last, - ); - try testing.expectEqual(256 * 128, count); -} - -test "only inner" { - const pattern = [_]PatchByte{ - .{ .used = 0xe9 }, - .{ .free = {} }, - .{ .free = {} }, - .{ .used = 0x78 }, - }; - var it = PatchLocationIterator.init(pattern, 0); - - try testing.expectEqual( - Range{ .start = 0x780000e9, .end = 0x780000e9 }, - it.next().?, - ); - try testing.expectEqual( - Range{ .start = 0x780001e9, .end = 0x780001e9 }, - it.next().?, - ); - - // Skip to the last range - var r_last: ?Range = null; - var count: u32 = 2; // We already consumed two - while (it.next()) |r| { - r_last = r; - count += 1; - } - try testing.expectEqual( - Range{ .start = 0x78ffffe9, .end = 0x78ffffe9 }, - r_last, - ); - try 
testing.expectEqual(256 * 256, count); -} - -test "trailing free bytes offset" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - .{ .used = 0x79 }, - }; - const offset = 0x12345678; - var it = PatchLocationIterator.init(pattern, offset); - - try testing.expectEqual( - Range{ .start = offset + 0x79000000, .end = offset + 0x79ffffff }, - it.next().?, - ); - try testing.expectEqual(null, it.next()); -} - -test "trailing and leading offset" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .used = 0xe9 }, - .{ .used = 0xe8 }, - .{ .free = {} }, - }; - const offset = 0x12345678; - var it = PatchLocationIterator.init(pattern, offset); - - try testing.expectEqual( - Range{ .start = offset + 0x00e8e900, .end = offset + 0x00e8e9ff }, - it.next().?, - ); - try testing.expectEqual( - Range{ .start = offset + 0x01e8e900, .end = offset + 0x01e8e9ff }, - it.next().?, - ); - - // Skip to the last range - var r_last: ?Range = null; - var count: u32 = 2; // We already consumed two - while (it.next()) |r| { - r_last = r; - count += 1; - } - try testing.expectEqual( - Range{ - .start = offset + @as(i32, @bitCast(@as(u32, 0xffe8e900))), - .end = offset + @as(i32, @bitCast(@as(u32, 0xffe8e9ff))), - }, - r_last, - ); - try testing.expect(count > 128); -} - -test "trailing free bytes large offset" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .free = {} }, - .{ .free = {} }, - .{ .used = 0x79 }, - }; - const offset = 0x12345678; - var it = PatchLocationIterator.init(pattern, offset); - - try testing.expectEqual( - Range{ .start = offset + 0x79000000, .end = offset + 0x79ffffff }, - it.next().?, - ); - try testing.expectEqual(null, it.next()); -} - -test "trailing and leading large offset" { - const pattern = [_]PatchByte{ - .{ .free = {} }, - .{ .used = 0xe9 }, - .{ .used = 0xe8 }, - .{ .free = {} }, - }; - const offset = 0x123456789a; - var it = PatchLocationIterator.init(pattern, offset); - - try testing.expectEqual( - 
Range{ .start = offset + 0x00e8e900, .end = offset + 0x00e8e9ff }, - it.next().?, - ); - try testing.expectEqual( - Range{ .start = offset + 0x01e8e900, .end = offset + 0x01e8e9ff }, - it.next().?, - ); - - // Skip to the last range - var r_last: ?Range = null; - var count: u32 = 2; // We already consumed two - while (it.next()) |r| { - r_last = r; - count += 1; - } - try testing.expectEqual( - Range{ - .start = offset + @as(i64, @intCast(@as(i32, @bitCast(@as(u32, 0xffe8e900))))), - .end = offset + @as(i64, @intCast(@as(i32, @bitCast(@as(u32, 0xffe8e9ff))))), - }, - r_last, - ); - try testing.expectEqual(256, count); -} diff --git a/src/Patcher.zig b/src/Patcher.zig index c95736d..dd7c469 100644 --- a/src/Patcher.zig +++ b/src/Patcher.zig @@ -1,28 +1,27 @@ const std = @import("std"); -const builtin = @import("builtin"); -const testing = std.testing; const math = std.math; const mem = std.mem; const posix = std.posix; -const zydis = @import("zydis").zydis; -const dis = @import("disassembler.zig"); -const syscalls = @import("syscalls.zig"); +const testing = std.testing; + +const dis = @import("disassembler.zig"); +const reloc = @import("relocation.zig"); +const syscalls = @import("syscalls.zig"); +const zydis = @import("zydis").zydis; -const log = std.log.scoped(.patcher); const AddressAllocator = @import("AddressAllocator.zig"); -const InstructionFormatter = dis.InstructionFormatter; -const InstructionIterator = dis.InstructionIterator; -const PatchLocationIterator = @import("PatchLocationIterator.zig"); -const PatchByte = PatchLocationIterator.PatchByte; +const backend = @import("backend.zig").backend; const Range = @import("Range.zig"); +const Statistics = @import("Statistics.zig"); const assert = std.debug.assert; - const page_size = std.heap.pageSize(); -const jump_rel32: u8 = 0xe9; -const jump_rel32_size = 5; -const jump_rel8: u8 = 0xeb; -const jump_rel8_size = 2; +const log = std.log.scoped(.patcher); + +const j_rel32: u8 = 0xe9; +const j_rel32_size = 5; 
+const j_rel8: u8 = 0xeb; +const j_rel8_size = 2; // TODO: Find an invalid instruction to use. // const invalid: u8 = 0xaa; @@ -48,68 +47,40 @@ var syscall_flicken_bytes = [_]u8{ 0x41, 0xff, 0xd3, // call r11 }; -pub var gpa: mem.Allocator = undefined; -pub var flicken_templates: std.StringArrayHashMapUnmanaged(Flicken) = .empty; -pub var address_allocator: AddressAllocator = .empty; -/// Tracks the base addresses of pages we have mmap'd for Flicken. -pub var allocated_pages: std.AutoHashMapUnmanaged(u64, void) = .empty; -pub var mutex: std.Thread.Mutex = .{}; +const Patcher = @This(); -pub var target_exec_path_buf: [std.fs.max_path_bytes]u8 = @splat(0); -pub var target_exec_path: []const u8 = undefined; +mutex: std.Thread.Mutex = .{}, +address_allocator: AddressAllocator, +flicken_templates: std.StringArrayHashMapUnmanaged(Flicken) = .empty, -/// Initialize the patcher. -/// NOTE: This should only be called **once**. -pub fn init() !void { - gpa = std.heap.page_allocator; +pub fn init(allocator: mem.Allocator) !Patcher { + var patcher: Patcher = .{ + .address_allocator = .{ .child_allocator = allocator }, + }; - try flicken_templates.ensureTotalCapacity( - std.heap.page_allocator, + try patcher.flicken_templates.ensureTotalCapacity( + patcher.address_allocator.allocator(), page_size / @sizeOf(Flicken), ); - flicken_templates.putAssumeCapacity("nop", .{ .name = "nop", .bytes = &.{} }); + patcher.flicken_templates.putAssumeCapacity("nop", .{ .name = "nop", .bytes = &.{} }); mem.writeInt( u64, syscall_flicken_bytes[2..][0..8], @intFromPtr(&syscalls.syscallEntry), .little, ); - flicken_templates.putAssumeCapacity("syscall", .{ .name = "syscall", .bytes = &syscall_flicken_bytes }); + patcher.flicken_templates.putAssumeCapacity( + "syscall", + .{ .name = "syscall", .bytes = &syscall_flicken_bytes }, + ); - { - // Read mmap_min_addr to block the low memory range. This prevents us from allocating - // trampolines in the forbidden low address range. 
- var min_addr: u64 = 0x10000; // Default safe fallback (64KB) - if (std.fs.openFileAbsolute("/proc/sys/vm/mmap_min_addr", .{})) |file| { - defer file.close(); - var buf: [32]u8 = undefined; - if (file.readAll(&buf)) |len| { - const trimmed = std.mem.trim(u8, buf[0..len], " \n\r\t"); - if (std.fmt.parseInt(u64, trimmed, 10)) |val| { - min_addr = val; - } else |_| {} - } else |_| {} - } else |_| {} - try address_allocator.block(gpa, .{ .start = 0, .end = @intCast(min_addr) }, 0); - } + return patcher; } -/// Flicken name and bytes have to be valid for the lifetime it's used. If a trampoline with the -/// name is already registered it gets overwritten. -/// NOTE: The name "nop" is reserved and always has the ID 0. -pub fn addFlicken(trampoline: Flicken) !FlickenId { - assert(!mem.eql(u8, "nop", trampoline.name)); - assert(!mem.eql(u8, "syscall", trampoline.name)); - try flicken_templates.ensureUnusedCapacity(gpa, 1); - errdefer comptime unreachable; - - const gop = flicken_templates.getOrPutAssumeCapacity(trampoline.name); - if (gop.found_existing) { - log.warn("addTrampoline: Overwriting existing trampoline: {s}", .{trampoline.name}); - } - gop.key_ptr.* = trampoline.name; - gop.value_ptr.* = trampoline; - return @enumFromInt(gop.index); +pub fn deinit(patcher: *Patcher) void { + const allocator = patcher.address_allocator.allocator(); + patcher.flicken_templates.deinit(allocator); + patcher.address_allocator.deinit(); } pub const Flicken = struct { @@ -117,11 +88,11 @@ pub const Flicken = struct { bytes: []const u8, pub fn size(flicken: *const Flicken) u64 { - return flicken.bytes.len + jump_rel32_size; + return flicken.bytes.len + j_rel32_size; } }; -pub const FlickenId = enum(u64) { +pub const FlickenId = enum(u32) { /// The nop flicken is special. It just does the patched instruction and immediately jumps back /// to the normal instruction stream. It **cannot** be changed. /// The bytes are always empty, meaning that `bytes.len == 0`. 
@@ -141,8 +112,7 @@ pub const PatchRequest = struct { offset: u64, /// Number of bytes of instruction. size: u8, - /// A byte slice from the start of the offset to the end of the region. This isn't necessary to - /// have but makes things more accessible. + /// The bytes of the original code, starting at this instruction. bytes: []u8, pub fn desc(_: void, lhs: PatchRequest, rhs: PatchRequest) bool { @@ -160,100 +130,41 @@ pub const PatchRequest = struct { } }; -pub const Statistics = struct { - /// Direct jumps - jump: u64, - /// Punning - index represents number of prefixes used - punning: [4]u64, - /// Successor Eviction - successor_eviction: u64, - /// Neighbor Eviction - neighbor_eviction: u64, - /// Failed to patch - failed: u64, - - pub const empty = mem.zeroes(Statistics); - - pub fn punningSum(stats: *const Statistics) u64 { - return stats.punning[0] + stats.punning[1] + - stats.punning[2] + stats.punning[3]; - } - - pub fn successful(stats: *const Statistics) u64 { - return stats.jump + stats.punningSum() + - stats.successor_eviction + stats.neighbor_eviction; - } - - pub fn total(stats: *const Statistics) u64 { - return stats.successful() + stats.failed; - } - - pub fn percentage(stats: *const Statistics) f64 { - if (stats.total() == 0) return 1; - const s: f64 = @floatFromInt(stats.successful()); - const t: f64 = @floatFromInt(stats.total()); - return s / t; - } - - pub fn add(self: *Statistics, other: *const Statistics) void { - self.jump += other.jump; - for (0..self.punning.len) |i| { - self.punning[i] += other.punning[i]; - } - self.successor_eviction += other.successor_eviction; - self.neighbor_eviction += other.neighbor_eviction; - self.failed += other.failed; - } -}; - -/// Scans a memory region for instructions that require patching and applies the patches -/// using a hierarchy of tactics (Direct/Punning -> Successor Eviction -> Neighbor Eviction). 
+/// Scans a memory region for instructions that require patching and applies the patches using a +/// hierarchy of tactics (Direct/Punning -> Successor Eviction -> Neighbor Eviction). /// -/// NOTE: This function leaves the region as R|W and the caller is responsible for changing it to -/// the desired protection -pub fn patchRegion(region: []align(page_size) u8) !void { +/// Assert that the region is already mapped as R|W. The caller is responsible for changing it to +/// the desired protection after patching is done. +pub fn patchRegion(patcher: *Patcher, region: []align(page_size) u8) !void { log.info( - "Patching region: 0x{x} - 0x{x}", + "patchRegion: 0x{x} - 0x{x}", .{ @intFromPtr(region.ptr), @intFromPtr(®ion[region.len - 1]) }, ); - // For now just do a coarse lock. - // TODO: should we make this more fine grained? - mutex.lock(); - defer mutex.unlock(); - { - // Block the region, such that we don't try to allocate there anymore. - const start: i64 = @intCast(@intFromPtr(region.ptr)); - try address_allocator.block( - gpa, - .{ .start = start, .end = start + @as(i64, @intCast(region.len)) }, - page_size, - ); - } + patcher.mutex.lock(); + defer patcher.mutex.unlock(); - var arena_impl = std.heap.ArenaAllocator.init(gpa); + // Make the application code writable so we can inject our jumps. + try backend.mprotect(region, posix.PROT.READ | posix.PROT.WRITE); + + try patcher.address_allocator.block(.fromPtr(region.ptr, region.len)); + + var arena_impl = std.heap.ArenaAllocator.init(patcher.address_allocator.allocator()); const arena = arena_impl.allocator(); defer arena_impl.deinit(); var patch_requests: std.ArrayListUnmanaged(PatchRequest) = .empty; - // We save the bytes where instructions start to be able to disassemble them on the fly. This is - // necessary for the neighbor eviction, since we can't just iterate forwards from a target - // instruction and disassemble happily. 
This is because some bytes may already be the patched - // ones which means that we might disassemble garbage or something different that wasn't there - // before. This means that we would need to stop disassembling on the first byte that is locked, - // which kind of defeats the purpose of neighbor eviction. - var instruction_starts = try std.DynamicBitSetUnmanaged.initEmpty(arena, region.len); + var instruction_starts: std.DynamicBitSetUnmanaged = try .initEmpty(arena, region.len); { - // Get where to patch. - var instruction_iterator = InstructionIterator.init(region); - while (instruction_iterator.next()) |instruction| { + log.info("patchRegion: Collecting patch requests", .{}); + var instruction_iter = dis.InstructionIterator.init(region); + while (instruction_iter.next()) |instruction| { const offset = instruction.address - @intFromPtr(region.ptr); instruction_starts.set(offset); const is_syscall = instruction.instruction.mnemonic == zydis.ZYDIS_MNEMONIC_SYSCALL; - const should_patch = is_syscall or - instruction.instruction.attributes & zydis.ZYDIS_ATTRIB_HAS_LOCK > 0; + const should_patch = is_syscall; if (should_patch) { const request: PatchRequest = .{ .flicken = if (is_syscall) .syscall else .nop, @@ -280,814 +191,842 @@ pub fn patchRegion(region: []align(page_size) u8) !void { "patchRegion: Found duplicate patch requests for instruction: {s}", .{fmt}, ); - log.err("patchRegion: request 1: {f}", .{patch_requests.items[i - 1]}); - log.err("patchRegion: request 2: {f}", .{patch_requests.items[i]}); + log.err(" request 1: {f}", .{patch_requests.items[i - 1]}); + log.err(" request 2: {f}", .{patch_requests.items[i]}); return error.DuplicatePatchRequest; } last_offset = request.offset; - if (@as(u64, @intFromEnum(request.flicken)) >= flicken_templates.count()) { + if (@as(u64, @intFromEnum(request.flicken)) >= patcher.flicken_templates.count()) { const fmt = dis.formatBytes(request.bytes[0..request.size]); log.err( "patchRegion: Usage of undefined flicken in 
request {f} for instruction: {s}", .{ request, fmt }, ); - return error.undefinedFlicken; + return error.UndefinedFlicken; } } } - { - // Apply patches. - try posix.mprotect(region, posix.PROT.READ | posix.PROT.WRITE); - - var stats = Statistics.empty; - // Used to track which bytes have been modified or used for constraints (punning), - // to prevent future patches (from neighbor/successor eviction) from corrupting them. - var locked_bytes = try std.DynamicBitSetUnmanaged.initEmpty(arena, region.len); - // PERF: A set of the pages for the patches/flicken we made writable. This way we don't - // repeatedly change call `mprotect` on the same page to switch it from R|W to R|X and back. - // At the end we `mprotect` all pages in this set back to being R|X. - var pages_made_writable: std.AutoHashMapUnmanaged(u64, void) = .empty; - - requests: for (patch_requests.items) |request| { - for (0..request.size) |i| { - if (locked_bytes.isSet(request.offset + i)) { - log.warn("patchRegion: Skipping request at offset 0x{x} because it is locked", .{request.offset}); - stats.failed += 1; - continue :requests; - } - } - - if (try attemptDirectOrPunning( - request, - arena, - &locked_bytes, - &pages_made_writable, - &stats, - )) { - continue :requests; - } - - if (try attemptSuccessorEviction( - request, - arena, - &locked_bytes, - &pages_made_writable, - &stats, - )) { - continue :requests; - } - - if (try attemptNeighborEviction( - request, - arena, - &locked_bytes, - &pages_made_writable, - &instruction_starts, - &stats, - )) { + // Used to track which bytes have been modified or used for constraints (punning), to + // prevent future patches (neighbor/successor eviction) from corrupting them. + var locked_bytes = try std.DynamicBitSetUnmanaged.initEmpty(arena, region.len); + // A set of the pages for the patches/flicken we made writable. This way we don't repeatedly + // change call `mprotect` on the same page to switch it from R|W to R|X and back. 
At the end + // we `mprotect` all pages in this set back to being R|X. + var pages_made_writable: std.AutoHashMapUnmanaged(u64, void) = .empty; + var stats: Statistics = .empty; + requests: for (patch_requests.items) |request| { + for (0..request.size) |i| { + if (locked_bytes.isSet(request.offset + i)) { + log.warn( + "patchRegion: Skipping request at offset 0x{x} because it is locked", + .{request.offset}, + ); continue :requests; } + } + const result = patcher.patchRequest(request, region, instruction_starts, locked_bytes) catch |err| { + log.err("patchRegion: Failed to patch request at offset 0x{x}: {}", .{ request.offset, err }); stats.failed += 1; + continue; + }; + + switch (result.tactic) { + .jump => stats.jump += 1, + .punning => |n| stats.punning[n] += 1, + .successor_eviction => stats.successor_eviction += 1, + .neighbor_eviction => stats.neighbor_eviction += 1, } - // Change pages back to R|X. - var iter = pages_made_writable.keyIterator(); - const protection = posix.PROT.READ | posix.PROT.EXEC; - while (iter.next()) |page_addr| { - const ptr: [*]align(page_size) u8 = @ptrFromInt(page_addr.*); - try posix.mprotect(ptr[0..page_size], protection); - } + // Now nothing should error anymore, so we "commit" the patches + for (result.patches) |p| { + if (p.kind == .empty) continue; - assert(stats.total() == patch_requests.items.len); - log.info("{}", .{stats}); - log.info("patched: {}/{}: {:2.2}%", .{ - stats.successful(), - stats.total(), - stats.percentage() * 100, - }); - log.info("patchRegion: Finished applying patches", .{}); + if (p.trampoline_addr != 0 and p.trampoline_len > 0) { + try patcher.address_allocator.block(.{ + .start = @intCast(p.trampoline_addr), + .end = @intCast(p.trampoline_addr + p.trampoline_len), + }); + + const start_page = mem.alignBackward(u64, p.trampoline_addr, page_size); + const end_page = mem.alignForward(u64, p.trampoline_addr + p.trampoline_len, page_size); + + { + var page = start_page; + const prot = posix.PROT.READ | 
posix.PROT.WRITE; + const flags: posix.MAP = .{ + .TYPE = .PRIVATE, + .ANONYMOUS = true, + .FIXED_NOREPLACE = true, + }; + while (page < end_page) : (page += page_size) { + const gop = try pages_made_writable.getOrPut(arena, page); + if (gop.found_existing) continue; + + const ptr: [*]align(page_size) u8 = @ptrFromInt(page); + _ = backend.mmap(ptr, page_size, prot, flags, -1, 0) catch |err| switch (err) { + error.MappingAlreadyExists => { + try backend.mprotect(ptr[0..page_size], prot); + }, + else => return err, + }; + } + } + + const dest: [*]u8 = @ptrFromInt(p.trampoline_addr); + @memcpy(dest[0..p.trampoline_len], p.trampoline_bytes[0..p.trampoline_len]); + } + + if (p.source_addr != 0 and p.source_len > 0) { + const dest: [*]u8 = @ptrFromInt(p.source_addr); + @memcpy(dest[0..p.source_len], p.source_bytes[0..p.source_len]); + } + + if (p.lock_len > 0) { + locked_bytes.setRangeValue( + .{ .start = p.lock_offset, .end = p.lock_offset + p.lock_len }, + true, + ); + } + } } + + var iter = pages_made_writable.keyIterator(); + const prot = posix.PROT.READ | posix.PROT.EXEC; + while (iter.next()) |page_addr| { + const ptr: [*]align(page_size) u8 = @ptrFromInt(page_addr.*); + try backend.mprotect(ptr[0..page_size], prot); + } + + log.info("{}", .{stats}); + log.info("patched: {}/{}: {d:.2}%", .{ + stats.successful(), + stats.total(), + stats.percentage() * 100.0, + }); +} + +pub const Tactic = union(enum) { + jump, + punning: u8, + successor_eviction, + neighbor_eviction, +}; + +pub const PatchResult = struct { + patches: [2]Patch, + tactic: Tactic, +}; + +/// Informations to "commit" a patch. 
+pub const Patch = struct { + kind: enum { empty, active } = .empty, + + /// Information for the jump overwrite + source_addr: u64 = 0, + source_bytes: [15]u8 = undefined, + source_len: u8 = 0, + + /// Information for the trampoline + trampoline_addr: u64 = 0, + trampoline_bytes: [128]u8 = undefined, + trampoline_len: u8 = 0, + + /// Offset inside the region to lock so future patches don't touch them. + lock_offset: u64 = 0, + lock_len: u64 = 0, +}; + +fn patchRequest( + patcher: *Patcher, + /// What to patch. + request: PatchRequest, + /// Where to patch it. + region: []align(page_size) u8, + /// Needed to get the size of instructions for the successor and neighbor eviction. + instruction_starts: std.DynamicBitSetUnmanaged, + /// Needed to not repeatedly patch the same instructions with successor and neighbor eviction. + locked_bytes: std.DynamicBitSetUnmanaged, +) !PatchResult { + if (try attemptDirectOrPunning(patcher, request, region, locked_bytes)) |result| { + return result; + } + if (try attemptSuccessorEviction(patcher, request, region, locked_bytes)) |result| { + return result; + } + if (try attemptNeighborEviction(patcher, request, region, instruction_starts, locked_bytes)) |result| { + return result; + } + return error.PatchFailed; } fn attemptDirectOrPunning( + patcher: *Patcher, request: PatchRequest, - arena: mem.Allocator, - locked_bytes: *std.DynamicBitSetUnmanaged, - pages_made_writable: *std.AutoHashMapUnmanaged(u64, void), - stats: *Statistics, -) !bool { + region: []align(page_size) u8, + locked_bytes: std.DynamicBitSetUnmanaged, +) !?PatchResult { const flicken: Flicken = if (request.flicken == .nop) .{ .name = "nop", .bytes = request.bytes[0..request.size] } else - flicken_templates.entries.get(@intFromEnum(request.flicken)).value; + patcher.flicken_templates.values()[@intFromEnum(request.flicken)]; - var pii = PatchInstructionIterator.init( - request.bytes, - request.size, - flicken.size(), - ); - // TODO: There is a "Ghost Page" edge case 
here. If `pii.next()` returns a range that - // spans multiple pages (Pages A and B), we might successfully mmap Page A but fail to - // mmap Page B. The loop will `continue` to the next candidate range, leaving Page A - // mapped. While harmless (it becomes an unused executable page), it is technically a - // memory leak. A future fix should track "current attempt" pages separately and unmap - // them on failure. - while (pii.next(.{ .count = 256 })) |allocated_range| { - try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(allocated_range)); - ensureRangeWritable( - allocated_range, - pages_made_writable, - ) catch |err| switch (err) { - error.MappingAlreadyExists => continue, - else => return err, - }; + const flicken_size = flicken.size(); // bytes.len + 5 + const source_addr = @intFromPtr(region.ptr) + request.offset; - applyPatch( - request, - flicken, - allocated_range, - pii.num_prefixes, - ) catch |err| switch (err) { - error.RelocationOverflow => continue, - else => return err, - }; + for (0..prefixes.len + 1) |num_prefixes_usize| { + const num_prefixes: u8 = @intCast(num_prefixes_usize); - try address_allocator.block(gpa, allocated_range, 0); - const lock_size = jump_rel32_size + pii.num_prefixes; - locked_bytes.setRangeValue( - .{ .start = request.offset, .end = request.offset + lock_size }, - true, - ); + // Tactics T1 pads with prefixes. 5 is the size of `jmp rel32`. 
+ const lock_size = j_rel32_size + num_prefixes; + if (request.offset + lock_size > region.len) continue; + if (num_prefixes + 1 > request.size) continue; - if (request.size >= 5) { - // assert(pii.num_prefixes == 0); - stats.jump += 1; - } else { - stats.punning[pii.num_prefixes] += 1; + for (0..lock_size) |i| { + if (locked_bytes.isSet(request.offset + i)) { + return null; + } } - return true; + + // Construct bitwise constraint if our jump spills over the instruction bounds + var mask: u32 = 0; + var pattern: u32 = 0; + for (0..4) |i| { + const byte_offset = num_prefixes + 1 + i; + if (byte_offset >= request.size) { + const existing_byte = request.bytes[byte_offset]; + mask |= @as(u32, 0xFF) << @intCast(i * 8); + pattern |= @as(u32, existing_byte) << @intCast(i * 8); + } + } + + const jump_source = source_addr + num_prefixes + j_rel32_size; + + const alloc_request = AddressAllocator.Request{ + .source = jump_source, + .size = flicken_size, + .valid_range = .{ + // TODO: calculate from flicken size + // TODO: use relocation information if needed + .start = @max(0, @as(i64, @intCast(source_addr)) - 0x7FFF0000), // ~2GB + .end = @as(i64, @intCast(source_addr)) + 0x7FFF0000, + }, + .mask = mask, + .pattern = pattern, + }; + + const tramp_range = patcher.address_allocator.findAllocation(alloc_request) orelse continue; + var patch = Patch{ .kind = .active }; + + // Populate Trampoline + patch.trampoline_addr = @intCast(tramp_range.start); + patch.trampoline_len = @intCast(flicken_size); + @memcpy(patch.trampoline_bytes[0..flicken.bytes.len], flicken.bytes); + + // Relocate if NOP + if (request.flicken == .nop) { + const instr = dis.disassembleInstruction(request.bytes[0..request.size]).?; + const reloc_info = reloc.RelocInfo{ + .instr = instr, + .old_addr = source_addr, + }; + reloc.relocateInstruction( + reloc_info.instr, + patch.trampoline_addr, + patch.trampoline_bytes[0..flicken.bytes.len], + ) catch |err| switch (err) { + // TODO: when we use relocation 
information to restrict the range for the request + // this shouldn't happen anymore. + error.RelocationOverflow => continue, // try next prefix/hole + else => return err, + }; + } + + // Jump back from trampoline to original stream + const ret_addr = source_addr + request.size; + const tramp_jump_source = patch.trampoline_addr + flicken.bytes.len + j_rel32_size; + const tramp_disp: i32 = @intCast(@as(i64, @intCast(ret_addr)) - @as(i64, @intCast(tramp_jump_source))); + + patch.trampoline_bytes[flicken.bytes.len] = j_rel32; + mem.writeInt(i32, patch.trampoline_bytes[flicken.bytes.len + 1 ..][0..4], tramp_disp, .little); + + // Populate Source Jump + patch.source_addr = source_addr; + patch.source_len = @intCast(@max(request.size, lock_size)); + @memset(patch.source_bytes[0..patch.source_len], int3); // Clean padding + + if (num_prefixes > 0) { + @memcpy(patch.source_bytes[0..num_prefixes], prefixes[0..num_prefixes]); + } + patch.source_bytes[num_prefixes] = j_rel32; + const source_disp: i32 = @intCast(tramp_range.start - @as(i64, @intCast(jump_source))); + mem.writeInt(i32, patch.source_bytes[num_prefixes + 1 ..][0..4], source_disp, .little); + + patch.lock_offset = request.offset; + patch.lock_len = lock_size; + + const tactic: Tactic = if (num_prefixes == 0 and request.size >= 5) + .jump + else + .{ .punning = num_prefixes }; + return .{ .patches = .{ patch, .{} }, .tactic = tactic }; } - return false; + return null; +} + +test "attemptDirectOrPunning - Direct Jump (>= 5 bytes)" { + var patcher = try Patcher.init(testing.allocator); + defer patcher.deinit(); + + // Simulate code memory at a known location + var region: [1024]u8 align(page_size) = undefined; + @memset(®ion, nop); + // Put a 5-byte instruction at offset 0: mov eax, 1 (B8 01 00 00 00) + const instr = "\xB8\x01\x00\x00\x00"; + @memcpy(region[0..instr.len], instr); + + const source_addr = @intFromPtr(®ion); + + // Block everything except a hole at offset 0x2000 + try patcher.address_allocator.block(.{ 
.start = 0, .end = @intCast(source_addr + 0x2000) }); + try patcher.address_allocator.block(.{ + .start = @intCast(source_addr + 0x3000), + .end = @intCast(source_addr + 0x10000000), + }); + + const request = PatchRequest{ + .flicken = .nop, + .offset = 0, + .size = instr.len, + .bytes = region[0..], + }; + + var locked_bytes = try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len); + defer locked_bytes.deinit(testing.allocator); + + const patch_opt = try attemptDirectOrPunning(&patcher, request, ®ion, locked_bytes); + try testing.expect(patch_opt != null); + const patch = patch_opt.?.patches[0]; + + try testing.expectEqual(.active, patch.kind); + + try testing.expectEqual(source_addr, patch.source_addr); + try testing.expectEqual(5, patch.source_len); + try testing.expectEqual(0xE9, patch.source_bytes[0]); + + try testing.expectEqual(source_addr + 0x2000, patch.trampoline_addr); + + // Trampoline bytes should be [B8 01 00 00 00][E9 xx xx xx xx] + try testing.expectEqual(instr.len + 5, patch.trampoline_len); + try testing.expectEqualSlices(u8, instr, patch.trampoline_bytes[0..5]); + try testing.expectEqual(0xE9, patch.trampoline_bytes[5]); +} + +test "attemptDirectOrPunning - Punning (< 5 bytes)" { + var patcher = try Patcher.init(testing.allocator); + defer patcher.deinit(); + + var region: [1024]u8 align(page_size) = undefined; + @memset(®ion, nop); + // Put a 2-byte instruction at offset 0: xor eax, eax (31 C0) + // Followed by 3 bytes of a successor we MUST pun into: 0xAA 0xBB 0xCC + const instr = "\x31\xC0\x11\x22\x33"; + @memcpy(region[0..instr.len], instr); + const target_addr = @intFromPtr(®ion) + 5 + 0x33221100; + + try patcher.address_allocator.block(.{ .start = 0, .end = @intCast(target_addr) }); + try patcher.address_allocator.block(.{ + .start = @intCast(target_addr + 100), + .end = math.maxInt(i64), + }); + + const request = PatchRequest{ + .flicken = .nop, + .offset = 0, + .size = 2, + .bytes = region[0..], + }; + + var locked_bytes 
= try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len); + defer locked_bytes.deinit(testing.allocator); + + const patch_opt = try attemptDirectOrPunning(&patcher, request, ®ion, locked_bytes); + try testing.expect(patch_opt != null); + + const p = patch_opt.?.patches[0]; + + try testing.expectEqual(5, p.source_len); // 5 bytes overwritten + try testing.expectEqual(0xE9, p.source_bytes[0]); + + // The jump offset MUST exactly match the 3 bytes we spilled into! + try testing.expectEqual(0x11, p.source_bytes[2]); + try testing.expectEqual(0x22, p.source_bytes[3]); + try testing.expectEqual(0x33, p.source_bytes[4]); + try testing.expectEqual(target_addr, p.trampoline_addr); } fn attemptSuccessorEviction( + patcher: *Patcher, request: PatchRequest, - arena: mem.Allocator, - locked_bytes: *std.DynamicBitSetUnmanaged, - pages_made_writable: *std.AutoHashMapUnmanaged(u64, void), - stats: *Statistics, -) !bool { - // Disassemble Successor and create request and flicken for it. 
- const succ_instr = dis.disassembleInstruction(request.bytes[request.size..]) orelse return false; - const succ_request = PatchRequest{ - .flicken = .nop, - .size = succ_instr.instruction.length, - .bytes = request.bytes[request.size..], - .offset = request.offset + request.size, - }; - const succ_flicken = Flicken{ - .name = "nop", - .bytes = succ_request.bytes[0..succ_request.size], - }; + region: []align(page_size) u8, + locked_bytes: std.DynamicBitSetUnmanaged, +) !?PatchResult { + const k = request.size; + assert(k < 5); + assert(k > 0); - for (0..succ_request.size) |i| { - if (locked_bytes.isSet(succ_request.offset + i)) return false; + const source_addr = @intFromPtr(region.ptr) + request.offset; + const succ_offset = request.offset + k; + if (succ_offset >= region.len) return null; + + // Disassemble the Successor Instruction + const succ_instr_bundle = dis.disassembleInstruction(region[succ_offset..]) orelse return null; + const succ_size = succ_instr_bundle.instruction.length; + + // The total physical bytes we will overwrite. + // k + 5 covers both jumps. We may need to pad up to the end of the successor. + const lock_size = @max(k + 5, k + succ_size); + if (request.offset + lock_size > region.len) return null; + + for (0..lock_size) |i| { + if (locked_bytes.isSet(request.offset + i)) { + return null; + } } - // Save original bytes for reverting the change. - var succ_orig_bytes: [15]u8 = undefined; - @memcpy( - succ_orig_bytes[0..succ_request.size], - succ_request.bytes[0..succ_request.size], - ); + const flicken: Flicken = if (request.flicken == .nop) + .{ .name = "nop", .bytes = request.bytes[0..request.size] } + else + patcher.flicken_templates.values()[@intFromEnum(request.flicken)]; + const flicken_size = flicken.size(); - var succ_pii = PatchInstructionIterator.init( - succ_request.bytes, - succ_request.size, - succ_flicken.size(), - ); - while (succ_pii.next(.{ .count = 16 })) |succ_range| { - // Ensure bytes match original before retry. 
- assert(mem.eql( - u8, - succ_request.bytes[0..succ_request.size], - succ_orig_bytes[0..succ_request.size], - )); + const succ_flicken = Flicken{ + .name = "nop", + .bytes = region[succ_offset .. succ_offset + succ_size], + }; + const succ_flicken_size = succ_flicken.size(); - try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(succ_range)); - ensureRangeWritable( - succ_range, - pages_made_writable, + const jump_source1 = source_addr + j_rel32_size; + const jump_source2 = source_addr + k + j_rel32_size; + + // If the successor jump (5 bytes) spills over the successor instruction bounds, we must + // constrain R2 to not corrupt the instruction after the successor. + var r2_mask: u32 = 0; + var r2_pattern: u32 = 0; + for (0..4) |i| { + if (1 + i >= succ_size) { + const existing_byte = region[succ_offset + 1 + i]; + r2_mask |= @as(u32, 0xFF) << @intCast(i * 8); + r2_pattern |= @as(u32, existing_byte) << @intCast(i * 8); + } + } + + // Both requests look in the ~2GB window. 
+ // TODO: Adjust window using RIP-relative relocation information + const window: i64 = 0x7FFF0000; + const valid_range1 = Range{ + .start = @max(0, @as(i64, @intCast(jump_source1)) - window), + .end = @as(i64, @intCast(jump_source1)) + window, + }; + const valid_range2 = Range{ + .start = @max(0, @as(i64, @intCast(jump_source2)) - window), + .end = @as(i64, @intCast(jump_source2)) + window, + }; + + const r1 = AddressAllocator.Request{ + .source = jump_source1, + .size = flicken_size, + .valid_range = valid_range1, + .mask = 0, + .pattern = 0, + }; + const r2 = AddressAllocator.Request{ + .source = jump_source2, + .size = succ_flicken_size, + .valid_range = valid_range2, + .mask = r2_mask, + .pattern = r2_pattern, + }; + + const coupled_alloc = patcher.address_allocator.findCoupledAllocation(k, r1, r2) orelse return null; + const tramp1_range = coupled_alloc[0]; + const tramp2_range = coupled_alloc[1]; + + var patch1 = Patch{ .kind = .active }; + var patch2 = Patch{ .kind = .active }; + + // Populate Successor Trampoline + patch2.trampoline_addr = @intCast(tramp2_range.start); + patch2.trampoline_len = @intCast(succ_flicken_size); + @memcpy(patch2.trampoline_bytes[0..succ_size], succ_flicken.bytes); + + const reloc_info2 = reloc.RelocInfo{ + .instr = succ_instr_bundle, + .old_addr = source_addr + k, + }; + reloc.relocateInstruction( + reloc_info2.instr, + patch2.trampoline_addr, + patch2.trampoline_bytes[0..succ_size], + ) catch |err| switch (err) { + error.RelocationOverflow => return null, + else => return err, + }; + + const tramp2_jump_source = patch2.trampoline_addr + succ_size + j_rel32_size; + const tramp2_disp: i32 = @intCast(@as(i64, @intCast(source_addr + k + succ_size)) - @as(i64, @intCast(tramp2_jump_source))); + patch2.trampoline_bytes[succ_size] = j_rel32; + mem.writeInt(i32, patch2.trampoline_bytes[succ_size + 1 ..][0..4], tramp2_disp, .little); + + // Populate Original Trampoline and Source Replacements + patch1.trampoline_addr = 
@intCast(tramp1_range.start); + patch1.trampoline_len = @intCast(flicken_size); + @memcpy(patch1.trampoline_bytes[0..flicken.bytes.len], flicken.bytes); + + if (request.flicken == .nop) { + const instr_bundle = dis.disassembleInstruction(request.bytes[0..k]).?; + const reloc_info1 = reloc.RelocInfo{ + .instr = instr_bundle, + .old_addr = source_addr, + }; + reloc.relocateInstruction( + reloc_info1.instr, + patch1.trampoline_addr, + patch1.trampoline_bytes[0..flicken.bytes.len], ) catch |err| switch (err) { - error.MappingAlreadyExists => continue, + error.RelocationOverflow => return null, else => return err, }; + } - applyPatch( - succ_request, - succ_flicken, - succ_range, - succ_pii.num_prefixes, - ) catch |err| switch (err) { - error.RelocationOverflow => continue, - else => return err, - }; + // T1 returns to the Successor's jump (which is at source_addr + k) + const tramp1_jump_source: i64 = @intCast(patch1.trampoline_addr + flicken.bytes.len + j_rel32_size); + const tramp1_disp: i32 = @intCast(@as(i64, @intCast(source_addr + k)) - + @as(i64, @intCast(tramp1_jump_source))); + patch1.trampoline_bytes[flicken.bytes.len] = j_rel32; + mem.writeInt(i32, patch1.trampoline_bytes[flicken.bytes.len + 1 ..][0..4], tramp1_disp, .little); - // Now that the successor is patched, we can patch the original request. - const flicken: Flicken = if (request.flicken == .nop) - .{ .name = "nop", .bytes = request.bytes[0..request.size] } - else - flicken_templates.entries.get(@intFromEnum(request.flicken)).value; + // Populate the overlapping jumps in the original code stream + // Because they physically overlap, Patch 1 handles both J1 and J2 writing. 
+    patch1.source_addr = source_addr;
+    patch1.source_len = @intCast(lock_size);
+    @memset(patch1.source_bytes[0..lock_size], int3);
-    var orig_pii = PatchInstructionIterator.init(
-        request.bytes,
-        request.size,
-        flicken.size(),
-    );
-    while (orig_pii.next(.{ .count = 16 })) |orig_range| {
-        if (succ_range.touches(orig_range)) continue;
-        try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(orig_range));
-        ensureRangeWritable(
-            orig_range,
-            pages_made_writable,
-        ) catch |err| switch (err) {
-            error.MappingAlreadyExists => continue,
-            else => return err,
+    // Write Successor Jump First
+    patch1.source_bytes[k] = j_rel32;
+    const rel2: i32 = @intCast(tramp2_range.start - @as(i64, @intCast(jump_source2)));
+    mem.writeInt(i32, patch1.source_bytes[k + 1 ..][0..4], rel2, .little);
+
+    // Write Original Jump Over The Top
+    patch1.source_bytes[0] = j_rel32;
+    const rel1: i32 = @intCast(tramp1_range.start - @as(i64, @intCast(jump_source1)));
+    mem.writeInt(i32, patch1.source_bytes[1..][0..4], rel1, .little);
+
+    patch1.lock_offset = request.offset;
+    patch1.lock_len = lock_size;
+
+    return .{ .patches = .{ patch1, patch2 }, .tactic = .successor_eviction };
+}
+
+test "attemptSuccessorEviction - K=2" {
+    var patcher = try Patcher.init(testing.allocator);
+    defer patcher.deinit();
+
+    var region: [1024]u8 align(page_size) = undefined;
+    @memset(&region, nop);
+
+    // Instruction 1 (J1): xor eax, eax (31 C0) -> 2 bytes
+    // Instruction 2 (J2): mov eax, 1 (B8 01 00 00 00) -> 5 bytes
+    const instr = "\x31\xC0\xB8\x01\x00\x00\x00";
+    @memcpy(region[0..instr.len], instr);
+
+    const request = PatchRequest{
+        .flicken = .nop,
+        .offset = 0,
+        .size = 2,
+        .bytes = region[0..],
+    };
+
+    const source_addr = @intFromPtr(&region);
+
+    // We block the immediate area to force the solver to search for a coupled solution.
+    try patcher.address_allocator.block(.{ .start = 0, .end = @intCast(source_addr + 0x2000) });
+
+    var locked_bytes = try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len);
+    defer locked_bytes.deinit(testing.allocator);
+
+    var instruction_starts = try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len);
+    defer instruction_starts.deinit(testing.allocator);
+    instruction_starts.set(0);
+    instruction_starts.set(2);
+
+    const patches_opt = try attemptSuccessorEviction(&patcher, request, &region, locked_bytes);
+    try testing.expect(patches_opt != null);
+    const patches = patches_opt.?.patches;
+
+    try testing.expectEqual(.active, patches[0].kind);
+    try testing.expectEqual(.active, patches[1].kind);
+
+    const p1 = patches[0];
+    try testing.expectEqual(source_addr, p1.source_addr);
+
+    // k=2, succ_size=5 -> lock_size = max(2+5, 2+5) = 7
+    try testing.expectEqual(7, p1.source_len);
+
+    // Verify mathematical overlap worked
+    try testing.expectEqual(0xE9, p1.source_bytes[0]); // J1 Opcode
+    try testing.expectEqual(0xE9, p1.source_bytes[2]); // J2 Opcode is perfectly preserved!
+
+    const rel1 = mem.readInt(i32, p1.source_bytes[1..5], .little);
+    const rel2 = mem.readInt(i32, p1.source_bytes[3..7], .little);
+
+    // The top 2 bytes of rel1 MUST exactly match the bottom 2 bytes of rel2
+    const u_rel1: u32 = @bitCast(rel1);
+    const u_rel2: u32 = @bitCast(rel2);
+    try testing.expectEqual((u_rel1 >> 16) & 0xFFFF, u_rel2 & 0xFFFF);
+}
+
+fn attemptNeighborEviction(
+    patcher: *Patcher,
+    request: PatchRequest,
+    region: []align(page_size) u8,
+    instruction_starts: std.DynamicBitSetUnmanaged,
+    locked_bytes: std.DynamicBitSetUnmanaged,
+) !?PatchResult {
+    // Neighbor Eviction requires at least 2 bytes for the short jump (0xEB rel8)
+    if (request.size < 2) return null;
+
+    const source_addr = @intFromPtr(region.ptr) + request.offset;
+    const start_offset = request.offset + 2;
+    // Valid short jump displacement is [-128, 127].
We only look forward to avoid evicting + // instructions we haven't patched yet. + const end_offset = @min(start_offset + 128, region.len); + + const flicken: Flicken = if (request.flicken == .nop) + .{ .name = "nop", .bytes = request.bytes[0..request.size] } + else + patcher.flicken_templates.values()[@intFromEnum(request.flicken)]; + const flicken_size = flicken.size(); + + neighbor: for (start_offset..end_offset) |neighbor_offset| { + if (!instruction_starts.isSet(neighbor_offset)) continue; + + const victim_bytes_all = region[neighbor_offset..]; + const victim_instr_bundle = dis.disassembleInstruction(victim_bytes_all) orelse continue; + const victim_size = victim_instr_bundle.instruction.length; + + for (0..victim_size) |i| { + if (locked_bytes.isSet(neighbor_offset + i)) continue :neighbor; + } + + const neighbor_addr = source_addr + (neighbor_offset - request.offset); + + // Try to split the victim instruction at offset `k` + var k: u8 = 1; + while (k < victim_size) : (k += 1) { + const victim_lock_size = @max(victim_size, k + j_rel32_size); + if (neighbor_offset + victim_lock_size > region.len) continue; + + // Calculate short jump displacement (from end of original instruction to J_P) + const target_offset: i64 = @intCast(neighbor_offset + k); + const source_end_offset: i64 = @intCast(request.offset + 2); + const disp = target_offset - source_end_offset; + if (disp > 127 or disp < -128) continue; + + // Ensure our J_P spill doesn't corrupt already locked bytes + for (victim_size..victim_lock_size) |i| { + if (locked_bytes.isSet(neighbor_offset + i)) continue; + } + + // Build constraint for J_P (the Patch jump) + var rp_mask: u32 = 0; + var rp_pattern: u32 = 0; + for (0..4) |i| { + const byte_offset = k + 1 + i; + if (byte_offset >= victim_size) { + const existing_byte = region[neighbor_offset + byte_offset]; + rp_mask |= @as(u32, 0xFF) << @intCast(i * 8); + rp_pattern |= @as(u32, existing_byte) << @intCast(i * 8); + } + } + + const jump_source_V = 
neighbor_addr + j_rel32_size; + const jump_source_P = neighbor_addr + k + j_rel32_size; + + // Look in the ~2GB window + const window: i64 = 0x7FFF0000; + const r_V = AddressAllocator.Request{ + .source = jump_source_V, + .size = victim_size + j_rel32_size, + .valid_range = .{ + .start = @max(0, @as(i64, @intCast(jump_source_V)) - window), + .end = @as(i64, @intCast(jump_source_V)) + window, + }, + .mask = 0, + .pattern = 0, + }; + const r_P = AddressAllocator.Request{ + .source = jump_source_P, + .size = flicken_size, + .valid_range = .{ + .start = @max(0, @as(i64, @intCast(jump_source_P)) - window), + .end = @as(i64, @intCast(jump_source_P)) + window, + }, + .mask = rp_mask, + .pattern = rp_pattern, }; - applyPatch( - request, - flicken, - orig_range, - orig_pii.num_prefixes, + const coupled_alloc = patcher.address_allocator.findCoupledAllocation(k, r_V, r_P) orelse continue; + const tramp_V_range = coupled_alloc[0]; + const tramp_P_range = coupled_alloc[1]; + + var patch1 = Patch{ .kind = .active }; + var patch2 = Patch{ .kind = .active }; + + // Patch 1: Original Short Jump + Flicken Trampoline + patch1.source_addr = source_addr; + patch1.source_len = request.size; + @memset(patch1.source_bytes[0..patch1.source_len], int3); + patch1.source_bytes[0] = j_rel8; + patch1.source_bytes[1] = @intCast(disp); + + patch1.trampoline_addr = @intCast(tramp_P_range.start); + patch1.trampoline_len = @intCast(flicken_size); + @memcpy(patch1.trampoline_bytes[0..flicken.bytes.len], flicken.bytes); + + if (request.flicken == .nop) { + const reloc_info_p = reloc.RelocInfo{ + .instr = dis.disassembleInstruction(request.bytes[0..request.size]).?, + .old_addr = source_addr, + }; + reloc.relocateInstruction( + reloc_info_p.instr, + patch1.trampoline_addr, + patch1.trampoline_bytes[0..flicken.bytes.len], + ) catch |err| switch (err) { + error.RelocationOverflow => continue, + else => return err, + }; + } + + const tramp_P_jump_source = patch1.trampoline_addr + flicken.bytes.len + 
j_rel32_size; + const tramp_P_disp: i32 = @intCast(@as(i64, @intCast(source_addr + request.size)) - @as(i64, @intCast(tramp_P_jump_source))); + patch1.trampoline_bytes[flicken.bytes.len] = j_rel32; + mem.writeInt(i32, patch1.trampoline_bytes[flicken.bytes.len + 1 ..][0..4], tramp_P_disp, .little); + + patch1.lock_offset = request.offset; + patch1.lock_len = request.size; + + // Patch 2: Victim Coupled Jump + Victim Trampoline + patch2.source_addr = neighbor_addr; + patch2.source_len = @intCast(victim_lock_size); + @memset(patch2.source_bytes[0..patch2.source_len], int3); + + // Write J_P (The jump targeted by our short jump) at offset k + patch2.source_bytes[k] = j_rel32; + const rel_P: i32 = @intCast(tramp_P_range.start - @as(i64, @intCast(jump_source_P))); + mem.writeInt(i32, patch2.source_bytes[k + 1 ..][0..4], rel_P, .little); + + // Write J_V (The victim's jump) at offset 0 + patch2.source_bytes[0] = j_rel32; + const rel_V: i32 = @intCast(tramp_V_range.start - @as(i64, @intCast(jump_source_V))); + mem.writeInt(i32, patch2.source_bytes[1..][0..4], rel_V, .little); + + patch2.trampoline_addr = @intCast(tramp_V_range.start); + patch2.trampoline_len = @intCast(victim_size + j_rel32_size); + @memcpy(patch2.trampoline_bytes[0..victim_size], victim_bytes_all[0..victim_size]); + + const reloc_info_v = reloc.RelocInfo{ + .instr = victim_instr_bundle, + .old_addr = neighbor_addr, + }; + reloc.relocateInstruction( + reloc_info_v.instr, + patch2.trampoline_addr, + patch2.trampoline_bytes[0..victim_size], ) catch |err| switch (err) { error.RelocationOverflow => continue, else => return err, }; - try address_allocator.block(gpa, succ_range, 0); - try address_allocator.block(gpa, orig_range, 0); - const lock_size = request.size + jump_rel32_size + succ_pii.num_prefixes; - locked_bytes.setRangeValue( - .{ .start = request.offset, .end = request.offset + lock_size }, - true, - ); - stats.successor_eviction += 1; - return true; - } + const tramp_V_jump_source = 
patch2.trampoline_addr + victim_size + j_rel32_size; + const tramp_V_disp: i32 = @intCast(@as(i64, @intCast(neighbor_addr + victim_size)) - @as(i64, @intCast(tramp_V_jump_source))); + patch2.trampoline_bytes[victim_size] = j_rel32; + mem.writeInt(i32, patch2.trampoline_bytes[victim_size + 1 ..][0..4], tramp_V_disp, .little); - // We couldn't patch with the bytes. So revert to original ones. - @memcpy( - succ_request.bytes[0..succ_request.size], - succ_orig_bytes[0..succ_request.size], - ); - } - return false; -} + patch2.lock_offset = neighbor_offset; + patch2.lock_len = victim_lock_size; -fn attemptNeighborEviction( - request: PatchRequest, - arena: mem.Allocator, - locked_bytes: *std.DynamicBitSetUnmanaged, - pages_made_writable: *std.AutoHashMapUnmanaged(u64, void), - instruction_starts: *const std.DynamicBitSetUnmanaged, - stats: *Statistics, -) !bool { - // Valid neighbors must be within [-128, 127] range for a short jump. - // Since we patch back-to-front, we only look at neighbors *after* the current instruction - // (higher address) to avoid evicting an instruction we haven't processed/patched yet. - const start_offset = request.offset + 2; - const end_offset = @min( - start_offset + 128, - request.bytes.len + request.offset, - ); - - neighbor: for (start_offset..end_offset) |neighbor_offset| { - if (!instruction_starts.isSet(neighbor_offset)) continue; - - const victim_bytes_all = request.bytes[neighbor_offset - request.offset ..]; - - // PERF: We could also search for the next set bit in instruction_starts - const victim_instr = dis.disassembleInstruction(victim_bytes_all) orelse continue; - const victim_size = victim_instr.instruction.length; - const victim_bytes = victim_bytes_all[0..victim_size]; - - for (0..victim_size) |i| { - if (locked_bytes.isSet(neighbor_offset + i)) { - continue :neighbor; - } - } - - // Save original bytes to revert if constraints cannot be solved. 
- var victim_orig_bytes: [15]u8 = undefined; - @memcpy(victim_orig_bytes[0..victim_size], victim_bytes); - - // OUTER LOOP: J_Patch - // Iterate possible offsets 'k' inside the victim for the patch jump. - var k: u8 = 1; - while (k < victim_size) : (k += 1) { - const target: i64 = @intCast(neighbor_offset + k); - const source: i64 = @intCast(request.offset + 2); - const disp = target - source; - if (disp > 127 or disp < -128) continue; - - const patch_flicken: Flicken = if (request.flicken == .nop) - .{ .name = "nop", .bytes = request.bytes[0..request.size] } - else - flicken_templates.entries.get(@intFromEnum(request.flicken)).value; - - // Constraints for J_Patch: - // Bytes [0 .. victim_size - k] are free (inside victim). - // Bytes [victim_size - k .. ] are used (outside victim, immutable). - var patch_pii = PatchInstructionIterator.init( - victim_bytes_all[k..], - @intCast(victim_size - k), - patch_flicken.size(), - ); - - while (patch_pii.next(.{ .count = 16 })) |patch_range| { - // J_Patch MUST NOT use prefixes, because it's punned inside J_Victim. - // Adding prefixes would shift J_Patch relative to J_Victim, making constraints harder. - if (patch_pii.num_prefixes > 0) break; - - try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(patch_range)); - ensureRangeWritable(patch_range, pages_made_writable) catch |err| switch (err) { - error.MappingAlreadyExists => continue, - else => return err, - }; - - // Tentatively write J_Patch to memory to set constraints for J_Victim. - // We only need to write the bytes of J_Patch that land inside the victim. - { - const jmp_target = patch_range.start; - const jmp_source: i64 = @intCast(@intFromPtr(&victim_bytes_all[k]) + 5); - const rel32: i32 = @intCast(jmp_target - jmp_source); - victim_bytes_all[k] = jump_rel32; - mem.writeInt(i32, victim_bytes_all[k + 1 ..][0..4], rel32, .little); - } - - // INNER LOOP: J_Victim - // Constraints: - // Bytes [0 .. k] are free (before J_Patch). - // Bytes [k .. 
] are used (overlap J_Patch). - const victim_flicken = Flicken{ - .name = "nop", - .bytes = victim_orig_bytes[0..victim_size], - }; - - var victim_pii = PatchInstructionIterator.init( - victim_bytes_all, - k, - victim_flicken.size(), - ); - - while (victim_pii.next(.{ .count = 16 })) |victim_range| { - if (patch_range.touches(victim_range)) continue; - - try pages_made_writable.ensureUnusedCapacity(arena, touchedPageCount(victim_range)); - ensureRangeWritable(victim_range, pages_made_writable) catch |err| switch (err) { - error.MappingAlreadyExists => continue, - else => return err, - }; - - // SUCCESS! Commit everything. - - // 1. Write Patch Trampoline (J_Patch target) - { - const trampoline: [*]u8 = @ptrFromInt(patch_range.getStart(u64)); - var reloc_info: ?RelocInfo = null; - if (request.flicken == .nop) { - reloc_info = .{ - .instr = dis.disassembleInstruction(patch_flicken.bytes).?, - .old_addr = @intFromPtr(request.bytes.ptr), - }; - } - commitTrampoline( - trampoline, - patch_flicken.bytes, - reloc_info, - @intFromPtr(request.bytes.ptr) + request.size, - ) catch |err| switch (err) { - error.RelocationOverflow => continue, - else => return err, - }; - } - - // 2. Write Victim Trampoline (J_Victim target) - { - const trampoline: [*]u8 = @ptrFromInt(victim_range.getStart(u64)); - commitTrampoline( - trampoline, - victim_orig_bytes[0..victim_size], - .{ - .instr = dis.disassembleInstruction(victim_orig_bytes[0..victim_size]).?, - .old_addr = @intFromPtr(victim_bytes_all.ptr), - }, - @intFromPtr(victim_bytes_all.ptr) + victim_size, - ) catch |err| switch (err) { - error.RelocationOverflow => continue, - else => return err, - }; - } - - // 3. Write J_Victim (overwrites head of J_Patch which is fine) - commitJump( - victim_bytes_all.ptr, - @intCast(victim_range.start), - victim_pii.num_prefixes, - k, // Total size for padding is limited to k to preserve J_Patch tail - ); - - // 4. 
Write J_Short at request - request.bytes[0] = jump_rel8; - request.bytes[1] = @intCast(disp); - if (request.size > 2) { - @memset(request.bytes[2..request.size], int3); - } - - // 5. Locking - try address_allocator.block(gpa, patch_range, 0); - try address_allocator.block(gpa, victim_range, 0); - - locked_bytes.setRangeValue( - .{ .start = request.offset, .end = request.offset + request.size }, - true, - ); - // Lock victim range + any extension of J_Patch - const j_patch_end = neighbor_offset + k + 5; - const lock_end = @max(neighbor_offset + victim_size, j_patch_end); - locked_bytes.setRangeValue( - .{ .start = neighbor_offset, .end = lock_end }, - true, - ); - - stats.neighbor_eviction += 1; - return true; - } - - // Revert J_Patch write for next iteration - @memcpy(victim_bytes, victim_orig_bytes[0..victim_size]); - } + return PatchResult{ .patches = .{ patch1, patch2 }, .tactic = .neighbor_eviction }; } } - - return false; + return null; } -/// Applies a standard patch (T1/B1/B2) where the instruction is replaced by a jump to a trampoline. -/// -/// This handles the logic of writing the trampoline content (including relocation) and -/// overwriting the original instruction with a `JMP` (plus prefixes/padding). 
-fn applyPatch(
-    request: PatchRequest,
-    flicken: Flicken,
-    allocated_range: Range,
-    num_prefixes: u8,
-) !void {
-    const flicken_addr: [*]u8 = @ptrFromInt(allocated_range.getStart(u64));
+test "attemptNeighborEviction - Valid Neighbor Found" {
+    var patcher = try Patcher.init(testing.allocator);
+    defer patcher.deinit();
-    // Commit Trampoline
-    var reloc_info: ?RelocInfo = null;
-    if (request.flicken == .nop) {
-        reloc_info = .{
-            .instr = dis.disassembleInstruction(request.bytes[0..request.size]).?,
-            .old_addr = @intFromPtr(request.bytes.ptr),
-        };
-    }
+    var region: [1024]u8 align(page_size) = undefined;
+    @memset(&region, 0);
-    const ret_addr = @intFromPtr(request.bytes.ptr) + request.size;
-    try commitTrampoline(flicken_addr, flicken.bytes, reloc_info, ret_addr);
+    // Target (I): xor eax, eax (31 C0) -> 2 bytes [Offset 0]
+    // Padding: NOP NOP (90 90) -> 2 bytes [Offset 2]
+    // Neighbor (N): mov eax, 1 (B8 01 00 00 00) -> 5 bytes [Offset 4]
+    const instr = "\x31\xC0\x90\x90\xB8\x01\x00\x00\x00";
+    @memcpy(region[0..instr.len], instr);
-    // Commit Jump (Patch)
-    commitJump(request.bytes.ptr, @intCast(allocated_range.start), num_prefixes, request.size);
-}
+    const source_addr = @intFromPtr(&region);
-const RelocInfo = struct {
-    instr: dis.BundledInstruction,
-    old_addr: u64,
-};
-
-/// Helper to write code into a trampoline.
-///
-/// It copies the original bytes (or flicken content), relocates any RIP-relative instructions
-/// to be valid at the new address, and appends a jump back to the instruction stream.
-fn commitTrampoline( - trampoline_ptr: [*]u8, - content: []const u8, - reloc_info: ?RelocInfo, - return_addr: u64, -) !void { - @memcpy(trampoline_ptr[0..content.len], content); - - if (reloc_info) |info| { - try relocateInstruction( - info.instr, - @intFromPtr(trampoline_ptr), - trampoline_ptr[0..content.len], - ); - } - - // Write jump back - trampoline_ptr[content.len] = jump_rel32; - const jump_src = @intFromPtr(trampoline_ptr) + content.len + jump_rel32_size; - const jump_disp: i32 = @intCast(@as(i64, @intCast(return_addr)) - @as(i64, @intCast(jump_src))); - mem.writeInt(i32, trampoline_ptr[content.len + 1 ..][0..4], jump_disp, .little); -} - -/// Helper to overwrite an instruction with a jump to a trampoline. -/// -/// It handles writing optional prefixes (padding), the `0xE9` opcode, the relative offset, -/// and fills any remaining bytes of the original instruction with `INT3` to prevent -/// execution of garbage bytes. -fn commitJump( - from_ptr: [*]u8, - to_addr: u64, - num_prefixes: u8, - total_size: usize, -) void { - const prefixes_slice = from_ptr[0..num_prefixes]; - @memcpy(prefixes_slice, prefixes[0..num_prefixes]); - - from_ptr[num_prefixes] = jump_rel32; - - const jump_src = @intFromPtr(from_ptr) + num_prefixes + jump_rel32_size; - const jump_disp: i32 = @intCast(@as(i64, @intCast(to_addr)) - @as(i64, @intCast(jump_src))); - mem.writeInt(i32, from_ptr[num_prefixes + 1 ..][0..4], jump_disp, .little); - - const patch_end_index = num_prefixes + jump_rel32_size; - if (patch_end_index < total_size) { - @memset(from_ptr[patch_end_index..total_size], int3); - } -} - -/// Only used for debugging. -fn printMaps() !void { - const path = "/proc/self/maps"; - var reader = try std.fs.cwd().openFile(path, .{}); - var buffer: [1024 * 1024]u8 = undefined; - const size = try reader.readAll(&buffer); - std.debug.print("\n{s}\n", .{buffer[0..size]}); -} - -/// Returns the number of pages that the given range touches. 
-fn touchedPageCount(range: Range) u32 { - const start_page = mem.alignBackward(u64, range.getStart(u64), page_size); - // alignBackward on (end - 1) handles the exclusive upper bound correctly - const end_page = mem.alignBackward(u64, range.getEnd(u64) - 1, page_size); - return @intCast((end_page - start_page) / page_size + 1); -} - -/// Ensure `range` is mapped R|W. Assumes `pages_made_writable` has enough free capacity. -fn ensureRangeWritable( - range: Range, - pages_made_writable: *std.AutoHashMapUnmanaged(u64, void), -) !void { - const start_page = mem.alignBackward(u64, range.getStart(u64), page_size); - const end_page = mem.alignBackward(u64, range.getEnd(u64) - 1, page_size); - const protection = posix.PROT.READ | posix.PROT.WRITE; - var page_addr = start_page; - while (page_addr <= end_page) : (page_addr += page_size) { - // If the page is already writable, skip it. - if (pages_made_writable.get(page_addr)) |_| continue; - // If we mapped it already we have to do mprotect, else mmap. - const gop = try allocated_pages.getOrPut(gpa, page_addr); - if (gop.found_existing) { - const ptr: [*]align(page_size) u8 = @ptrFromInt(page_addr); - try posix.mprotect(ptr[0..page_size], protection); - } else { - const addr = posix.mmap( - @ptrFromInt(page_addr), - page_size, - protection, - .{ .TYPE = .PRIVATE, .ANONYMOUS = true, .FIXED_NOREPLACE = true }, - -1, - 0, - ) catch |err| switch (err) { - error.MappingAlreadyExists => { - // If the mapping exists this means that the someone else - // (executable, OS, dynamic loader,...) allocated something there. - // We block this so we don't try this page again in the future, - // saving a bunch of syscalls. - try address_allocator.block( - gpa, - .{ .start = @intCast(page_addr), .end = @intCast(page_addr + page_size) }, - page_size, - ); - return err; - }, - else => return err, - }; - assert(@as(u64, @intFromPtr(addr.ptr)) == page_addr); - // `gop.value_ptr.* = {};` not needed because it's void. 
- } - pages_made_writable.putAssumeCapacityNoClobber(page_addr, {}); - } -} - -const PatchInstructionIterator = struct { - bytes: []const u8, // first byte is first byte of instruction to patch. - instruction_size: u8, - flicken_size: u64, - - // Internal state - num_prefixes: u8, - pli: PatchLocationIterator, - valid_range: Range, - allocated_count: u64, - - fn init( - bytes: []const u8, - instruction_size: u8, - flicken_size: u64, - ) PatchInstructionIterator { - const patch_bytes = getPatchBytes(bytes, instruction_size, 0); - var pli = PatchLocationIterator.init(patch_bytes, @intFromPtr(&bytes[5])); - const valid_range = pli.next() orelse Range{ .start = 0, .end = 0 }; - return .{ - .bytes = bytes, - .instruction_size = instruction_size, - .flicken_size = flicken_size, - .num_prefixes = 0, - .pli = pli, - .valid_range = valid_range, - .allocated_count = 0, - }; - } - - pub const Strategy = union(enum) { - /// Iterates through all possible ranges. - /// Useful for finding the optimal allocation (fewest prefixes). - exhaustive: void, - /// Limits the search to `count` allocation attempts per valid constraint range found by the - /// PatchLocationIterator. - /// - /// This acts as a heuristic to prevent worst-case performance (scanning every byte of a 2GB - /// gap) while still offering better density than a purely greedy approach. A count of 1 is - /// equivalent to a greedy strategy. 
- count: u64, + const request = PatchRequest{ + .flicken = .nop, + .offset = 0, + .size = 2, + .bytes = region[0..], }; - fn next( - pii: *PatchInstructionIterator, - strategy: Strategy, - ) ?Range { - const State = enum { - allocation, - range, - prefix, - }; - blk: switch (State.allocation) { - .allocation => { - if (address_allocator.findAllocation( - pii.flicken_size, - pii.valid_range, - )) |allocated_range| { - assert(allocated_range.size() == pii.flicken_size); - pii.allocated_count += 1; - // Advancing the valid range, such that the next call to `findAllocation` won't - // find the same range again. - switch (strategy) { - .exhaustive => pii.valid_range.start = allocated_range.start + 1, - .count => |c| { - if (pii.allocated_count >= c) { - pii.valid_range.start = pii.valid_range.end; - pii.allocated_count = 0; - } else { - pii.valid_range.start = allocated_range.start + 1; - } - }, - } - return allocated_range; - } else { - pii.allocated_count = 0; - continue :blk .range; - } - }, - .range => { - // Valid range is used up, so get a new one from the pli. - if (pii.pli.next()) |valid_range| { - pii.valid_range = valid_range; - continue :blk .allocation; - } else { - continue :blk .prefix; - } - }, - .prefix => { - if (pii.num_prefixes < @min(pii.instruction_size, prefixes.len)) { - pii.num_prefixes += 1; - const patch_bytes = getPatchBytes(pii.bytes, pii.instruction_size, pii.num_prefixes); - pii.pli = PatchLocationIterator.init( - patch_bytes, - @intFromPtr(&pii.bytes[pii.num_prefixes + 5]), - ); - continue :blk .range; - } else { - return null; - } - }, - } - comptime unreachable; - } + // Block immediate area to trigger the complex coupled solver logic. 
+ try patcher.address_allocator.block(.{ .start = 0, .end = @intCast(source_addr + 0x2000) }); - fn getPatchBytes(instruction_bytes: []const u8, instruction_size: u8, num_prefixes: u8) [4]PatchByte { - const offset_location = instruction_bytes[num_prefixes + 1 ..][0..4]; // +1 for e9 - var patch_bytes: [4]PatchByte = undefined; - for (&patch_bytes, offset_location, num_prefixes + 1..) |*patch_byte, offset_byte, i| { - if (i < instruction_size) { - patch_byte.* = .free; - } else { - patch_byte.* = .{ .used = offset_byte }; - } - } - return patch_bytes; - } -}; + var locked_bytes = try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len); + defer locked_bytes.deinit(testing.allocator); -/// Fixes RIP-relative operands in an instruction that has been moved to a new address. -fn relocateInstruction( - instruction: dis.BundledInstruction, - address: u64, - buffer: []u8, -) !void { - const instr = instruction.instruction; - // Iterate all operands - for (0..instr.operand_count) |i| { - const operand = &instruction.operands[i]; + var instruction_starts = try std.DynamicBitSetUnmanaged.initEmpty(testing.allocator, region.len); + defer instruction_starts.deinit(testing.allocator); + instruction_starts.set(0); + instruction_starts.set(2); + instruction_starts.set(3); + instruction_starts.set(4); // Neighbor starts here - // Check for RIP-relative memory operand - const is_rip_rel = operand.type == zydis.ZYDIS_OPERAND_TYPE_MEMORY and - operand.unnamed_0.mem.base == zydis.ZYDIS_REGISTER_RIP; - // Check for relative immediate (e.g. 
JMP rel32)
-        const is_rel_imm = operand.type == zydis.ZYDIS_OPERAND_TYPE_IMMEDIATE and
-            operand.unnamed_0.imm.is_relative == zydis.ZYAN_TRUE;
-        if (!is_rip_rel and !is_rel_imm) continue;
+    const patches_opt = try attemptNeighborEviction(&patcher, request, &region, instruction_starts, locked_bytes);
+    try testing.expect(patches_opt != null);
+    const patches = patches_opt.?.patches;
-        // We have to apply a relocation
-        var result_address: u64 = 0;
-        const status = zydis.ZydisCalcAbsoluteAddress(
-            instr,
-            operand,
-            instruction.address,
-            &result_address,
-        );
-        assert(zydis.ZYAN_SUCCESS(status)); // TODO: maybe return an error instead
+    try testing.expectEqual(.active, patches[0].kind);
+    try testing.expectEqual(.active, patches[1].kind);
-        // Calculate new displacement relative to the new address
-        // The instruction length remains the same.
-        const next_rip: i64 = @intCast(address + instr.length);
-        const new_disp = @as(i64, @intCast(result_address)) - next_rip;
+    const p1 = patches[0];
+    const p2 = patches[1];
-        var offset: u16 = 0;
-        var size_bits: u8 = 0;
+    // Verify Patch 1 (The short jump)
+    try testing.expectEqual(source_addr, p1.source_addr);
+    try testing.expectEqual(2, p1.source_len);
+    try testing.expectEqual(0xEB, p1.source_bytes[0]);
-        if (is_rip_rel) {
-            offset = instr.raw.disp.offset;
-            size_bits = instr.raw.disp.size;
-        } else {
-            assert(is_rel_imm);
-            // For relative immediate, find the matching raw immediate.
-            var found = false;
-            for (&instr.raw.imm) |*imm| {
-                if (imm.is_relative == zydis.ZYAN_TRUE) {
-                    offset = imm.offset;
-                    size_bits = imm.size;
-                    found = true;
-                    break;
-                }
-            }
-            assert(found);
-        }
+    // Displacement should jump to the hole created at offset 4.
+    // Short jump origin is end of instruction (offset 2).
+    // Target is `neighbor_offset + k`. Assume it chose k=2 for the overlap: 4 + 2 = 6.
+    // disp = 6 - 2 = 4.
+ const expected_disp = p1.source_bytes[1]; + const target_offset = 2 + @as(i8, @bitCast(expected_disp)); + try testing.expect(target_offset > 4 and target_offset < 9); - assert(offset != 0); - assert(size_bits != 0); - const size_bytes = size_bits / 8; + // Verify Patch 2 (The overlapping jumps in the neighbor's location) + try testing.expectEqual(source_addr + 4, p2.source_addr); + try testing.expectEqual(0xE9, p2.source_bytes[0]); // J_V starts with 0xE9 - if (offset + size_bytes > buffer.len) { - return error.RelocationFail; - } - - const fits = switch (size_bits) { - 8 => new_disp >= math.minInt(i8) and new_disp <= math.maxInt(i8), - 16 => new_disp >= math.minInt(i16) and new_disp <= math.maxInt(i16), - 32 => new_disp >= math.minInt(i32) and new_disp <= math.maxInt(i32), - 64 => true, - else => unreachable, - }; - - if (!fits) { - return error.RelocationOverflow; - } - - const ptr = buffer[offset..]; - switch (size_bits) { - 8 => ptr[0] = @as(u8, @bitCast(@as(i8, @intCast(new_disp)))), - 16 => mem.writeInt(u16, ptr[0..2], @bitCast(@as(i16, @intCast(new_disp))), .little), - 32 => mem.writeInt(u32, ptr[0..4], @bitCast(@as(i32, @intCast(new_disp))), .little), - 64 => mem.writeInt(u64, ptr[0..8], @bitCast(@as(i64, @intCast(new_disp))), .little), - else => unreachable, - } - } + const k = target_offset - 4; + try testing.expectEqual(0xE9, p2.source_bytes[@intCast(k)]); // J_P starts with 0xE9 exactly where the short jump points! 
} diff --git a/src/Range.zig b/src/Range.zig index b04060f..5804fc0 100644 --- a/src/Range.zig +++ b/src/Range.zig @@ -17,16 +17,6 @@ pub fn size(range: Range) u64 { return @intCast(range.end - range.start); } -pub fn alignTo(range: Range, alignment: u64) Range { - assert(range.end >= range.start); - assert(std.math.isPowerOfTwo(alignment)); - assert(alignment <= std.math.maxInt(i64)); - const lower = std.mem.alignBackward(i64, range.start, @intCast(alignment)); - const upper = std.mem.alignForward(i64, range.end, @intCast(alignment)); - assert(upper >= lower); - return .{ .start = lower, .end = upper }; -} - pub fn overlaps(range: Range, other: Range) bool { assert(range.end >= range.start); assert(other.end >= other.start); @@ -52,18 +42,17 @@ pub fn touches(range: Range, other: Range) bool { } /// Ranges are considered equal if they touch. -pub fn compare(lhs: Range, rhs: Range) std.math.Order { +pub fn compareTouching(lhs: Range, rhs: Range) std.math.Order { assert(lhs.end >= lhs.start); assert(rhs.end >= rhs.start); return if (lhs.start > rhs.end) .gt else if (lhs.end < rhs.start) .lt else .eq; } -pub fn getStart(range: Range, T: type) T { - return @intCast(range.start); -} - -pub fn getEnd(range: Range, T: type) T { - return @intCast(range.end); +/// Ranges are considered equal if they overlap. 
+pub fn compareOverlapping(lhs: Range, rhs: Range) std.math.Order { + assert(lhs.end >= lhs.start); + assert(rhs.end >= rhs.start); + return if (lhs.start >= rhs.end) .gt else if (lhs.end <= rhs.start) .lt else .eq; } pub fn format( @@ -73,25 +62,23 @@ pub fn format( try writer.print(".{{ .start = 0x{x}, .end = 0x{x} }}", .{ self.start, self.end }); } +pub fn fromSlice(T: type, slice: []T) Range { + const start = @intFromPtr(slice.ptr); + return .{ + .start = @intCast(start), + .end = @intCast(start + slice.len * @sizeOf(T)), + }; +} + +pub fn fromPtr(ptr: [*]u8, len: usize) Range { + return .fromSlice(u8, ptr[0..len]); +} + test "AddressRange size" { const range = Range{ .start = 100, .end = 250 }; try std.testing.expectEqual(@as(u64, 150), range.size()); } -test "AddressRange alignTo unaligned" { - const range = Range{ .start = 101, .end = 199 }; - const aligned = range.alignTo(16); - try std.testing.expectEqual(@as(i64, 96), aligned.start); - try std.testing.expectEqual(@as(i64, 208), aligned.end); -} - -test "AddressRange alignTo already aligned" { - const range = Range{ .start = 64, .end = 128 }; - const aligned = range.alignTo(64); - try std.testing.expectEqual(@as(i64, 64), aligned.start); - try std.testing.expectEqual(@as(i64, 128), aligned.end); -} - test "AddressRange no overlap before" { const base = Range{ .start = 100, .end = 200 }; const other = Range{ .start = 0, .end = 100 }; diff --git a/src/Statistics.zig b/src/Statistics.zig new file mode 100644 index 0000000..9a73bc0 --- /dev/null +++ b/src/Statistics.zig @@ -0,0 +1,46 @@ +const std = @import("std"); +const mem = std.mem; + +const Statistics = @This(); + +/// Direct jumps +jump: u64, +/// Punning - index represents number of prefixes used +punning: [4]u64, +/// Successor Eviction +successor_eviction: u64, +/// Neighbor Eviction +neighbor_eviction: u64, +/// Failed to patch +failed: u64, + +pub const empty = mem.zeroes(Statistics); + +pub fn punningSum(stats: *const Statistics) u64 { + return 
stats.punning[0] + stats.punning[1] + stats.punning[2] + stats.punning[3]; +} + +pub fn successful(stats: *const Statistics) u64 { + return stats.jump + stats.punningSum() + stats.successor_eviction + stats.neighbor_eviction; +} + +pub fn total(stats: *const Statistics) u64 { + return stats.successful() + stats.failed; +} + +pub fn percentage(stats: *const Statistics) f64 { + if (stats.total() == 0) return 1; + const s: f64 = @floatFromInt(stats.successful()); + const t: f64 = @floatFromInt(stats.total()); + return s / t; +} + +pub fn add(self: *Statistics, other: *const Statistics) void { + self.jump += other.jump; + for (0..self.punning.len) |i| { + self.punning[i] += other.punning[i]; + } + self.successor_eviction += other.successor_eviction; + self.neighbor_eviction += other.neighbor_eviction; + self.failed += other.failed; +} diff --git a/src/backend.zig b/src/backend.zig new file mode 100644 index 0000000..3d2d5c1 --- /dev/null +++ b/src/backend.zig @@ -0,0 +1,49 @@ +const std = @import("std"); +const p = std.posix; + +const page_size_min = std.heap.page_size_min; + +pub const backend = switch (@import("builtin").is_test) { + true => testing, + false => posix, +}; + +// TODO: Maybe log? 
+pub const testing = struct { + pub fn mmap( + ptr: [*]align(page_size_min) u8, + length: usize, + prot: u32, + flags: p.MAP, + fd: p.fd_t, + offset: u64, + ) p.MMapError![]align(page_size_min) u8 { + _ = .{ ptr, length, prot, flags, fd, offset }; + return ptr[0..length]; + } + pub fn mprotect(memory: []align(page_size_min) u8, protection: u32) p.MProtectError!void { + _ = .{ memory, protection }; + } + pub fn munmap(memory: []align(page_size_min) const u8) void { + _ = memory; + } +}; + +pub const posix = struct { + pub fn mmap( + ptr: ?[*]align(page_size_min) u8, + length: usize, + prot: u32, + flags: p.MAP, + fd: p.fd_t, + offset: u64, + ) p.MMapError![]align(page_size_min) u8 { + return p.mmap(ptr, length, prot, flags, fd, offset); + } + pub fn mprotect(memory: []align(page_size_min) u8, protection: u32) p.MProtectError!void { + return p.mprotect(memory, protection); + } + pub fn munmap(memory: []align(page_size_min) const u8) void { + p.munmap(memory); + } +}; diff --git a/src/loader.zig b/src/loader.zig new file mode 100644 index 0000000..c3132a3 --- /dev/null +++ b/src/loader.zig @@ -0,0 +1,94 @@ +const std = @import("std"); +const elf = std.elf; +const mem = std.mem; +const posix = std.posix; + +const log = std.log.scoped(.loader); +const page_size = std.heap.pageSize(); + +pub const UnfinishedReadError = error{UnfinishedRead}; + +pub const LoadResult = struct { + base: usize, + size: usize, +}; + +/// Loads all `PT_LOAD` segments of an ELF file into memory. +/// +/// For `ET_EXEC` (non-PIE), segments are mapped at their fixed virtual addresses (`p_vaddr`). +/// For `ET_DYN` (PIE), segments are mapped at a random base address chosen by the kernel. +/// +/// It handles zero-initialized(e.g., .bss) sections by mapping anonymous memory and only reading +/// `p_filesz` bytes from the file, ensuring `p_memsz` bytes are allocated. 
+pub fn loadStaticElf(ehdr: elf.Header, file_reader: *std.fs.File.Reader) !LoadResult {
+    // NOTE: In theory we could also just look at the first and last loadable segment because the
+    // ELF spec mandates these to be in ascending order of `p_vaddr`, but better be safe than sorry.
+    // https://gabi.xinuos.com/elf/08-pheader.html#:~:text=ascending%20order
+    const minva, const maxva = bounds: {
+        var minva: u64 = std.math.maxInt(u64);
+        var maxva: u64 = 0;
+        var phdrs = ehdr.iterateProgramHeaders(file_reader);
+        while (try phdrs.next()) |phdr| {
+            if (phdr.p_type != elf.PT_LOAD) continue;
+            minva = @min(minva, phdr.p_vaddr);
+            maxva = @max(maxva, phdr.p_vaddr + phdr.p_memsz);
+        }
+        minva = mem.alignBackward(usize, minva, page_size);
+        maxva = mem.alignForward(usize, maxva, page_size);
+        log.debug("Calculated bounds: minva=0x{x}, maxva=0x{x}", .{ minva, maxva });
+        break :bounds .{ minva, maxva };
+    };
+
+    // Check that the needed memory region can be allocated as a whole.
+    const dynamic = ehdr.type == elf.ET.DYN;
+    log.debug("ELF type is {s}", .{if (dynamic) "DYN" else "EXEC (static)"});
+    const hint = if (dynamic) null else @as(?[*]align(page_size) u8, @ptrFromInt(minva));
+    log.debug("mmap pre-flight hint: {*}", .{hint});
+    const base = try posix.mmap(
+        hint,
+        maxva - minva,
+        posix.PROT.WRITE,
+        .{ .TYPE = .PRIVATE, .ANONYMOUS = true, .FIXED_NOREPLACE = !dynamic },
+        -1,
+        0,
+    );
+    log.debug("Pre-flight reservation at: {*}, size: 0x{x}", .{ base.ptr, base.len });
+
+    var phdrs = ehdr.iterateProgramHeaders(file_reader);
+    var phdr_idx: u32 = 0;
+    errdefer posix.munmap(base);
+    while (try phdrs.next()) |phdr| : (phdr_idx += 1) {
+        if (phdr.p_type != elf.PT_LOAD) continue;
+        if (phdr.p_memsz == 0) continue;
+
+        const offset = phdr.p_vaddr & (page_size - 1);
+        const size = mem.alignForward(usize, phdr.p_memsz + offset, page_size);
+        var start = mem.alignBackward(usize, phdr.p_vaddr, page_size);
+        const base_for_dyn = if (dynamic) @intFromPtr(base.ptr)
else 0; + start += base_for_dyn; + log.debug( + " - phdr[{}]: mapping 0x{x} - 0x{x} (vaddr=0x{x}, dyn_base=0x{x})", + .{ phdr_idx, start, start + size, phdr.p_vaddr, base_for_dyn }, + ); + const ptr: []align(page_size) u8 = @as([*]align(page_size) u8, @ptrFromInt(start))[0..size]; + // TODO: we should likely just use mmap instead because then not touched memory isn't loaded + // unnecessarily + try file_reader.seekTo(phdr.p_offset); + if (try file_reader.read(ptr[offset..][0..phdr.p_filesz]) != phdr.p_filesz) + return UnfinishedReadError.UnfinishedRead; + + const protections = elfToMmapProt(phdr.p_flags); + try posix.mprotect(ptr, protections); + } + log.debug("loadElf returning base: 0x{x}, size: 0x{x}", .{ @intFromPtr(base.ptr), base.len }); + return .{ .base = @intFromPtr(base.ptr), .size = base.len }; +} + +/// Converts ELF program header protection flags to mmap protection flags. +pub fn elfToMmapProt(elf_prot: u64) u32 { + var result: u32 = posix.PROT.NONE; + if ((elf_prot & elf.PF_R) != 0) result |= posix.PROT.READ; + if ((elf_prot & elf.PF_W) != 0) result |= posix.PROT.WRITE; + if ((elf_prot & elf.PF_X) != 0) result |= posix.PROT.EXEC; + return result; +} diff --git a/src/main.zig b/src/main.zig index 19fa67b..651dec7 100644 --- a/src/main.zig +++ b/src/main.zig @@ -8,6 +8,7 @@ const testing = std.testing; const log = std.log.scoped(.flicker); const Patcher = @import("Patcher.zig"); +const loader = @import("loader.zig"); const assert = std.debug.assert; @@ -16,8 +17,8 @@ pub const std_options: std.Options = .{ .log_scope_levels = &.{ .{ .scope = .disassembler, .level = .info }, .{ .scope = .patcher, .level = .debug }, - .{ .scope = .patch_location_iterator, .level = .warn }, .{ .scope = .flicker, .level = .info }, + .{ .scope = .loader, .level = .info }, }, }; const page_size = std.heap.pageSize(); @@ -32,6 +33,12 @@ const help = const UnfinishedReadError = error{UnfinishedRead}; +/// This needs to be a public global, such that it has a static memory 
location. This is needed +/// for the syscall interception, in particular for patching new maps of the `mmap` call. +pub var patcher: Patcher = undefined; +pub var target_exec_path_buf: [std.fs.max_path_bytes]u8 = @splat(0); +pub var target_exec_path: []const u8 = undefined; + pub fn main() !void { // Parse arguments var arg_index: u64 = 1; // Skip own name @@ -51,27 +58,29 @@ pub fn main() !void { const file = try lookupFile(mem.sliceTo(std.os.argv[arg_index], 0)); - { - // Initialize patcher - try Patcher.init(); - // Resolve the absolute path of the target executable. This is needed for the - // readlink("/proc/self/exe") interception. We use the file descriptor to get the - // authoritative path. - var self_buf: [128]u8 = undefined; - const fd_path = try std.fmt.bufPrint(&self_buf, "/proc/self/fd/{d}", .{file.handle}); - Patcher.target_exec_path = try std.fs.readLinkAbsolute(fd_path, &Patcher.target_exec_path_buf); - log.debug("Resolved target executable path: {s}", .{Patcher.target_exec_path}); - } + patcher = try .init(std.heap.page_allocator); + + // Resolve the absolute path of the target executable for /proc/self/exe spoofing + const fd_path = try std.fmt.bufPrint(&target_exec_path_buf, "/proc/self/fd/{d}", .{file.handle}); + target_exec_path = try std.fs.readLinkAbsolute(fd_path, &target_exec_path_buf); + log.debug("Resolved target executable path: {s}", .{target_exec_path}); + + try bootstrapMemoryMap(&patcher); + // TODO: + // block until `mmap_min_addr` + // block all entries in `proc/self/maps` // Map file into memory var file_buffer: [128]u8 = undefined; var file_reader = file.reader(&file_buffer); log.info("--- Loading executable: {s} ---", .{std.os.argv[arg_index]}); const ehdr = try elf.Header.read(&file_reader.interface); - const base = try loadStaticElf(ehdr, &file_reader); + const load_result = try loader.loadStaticElf(ehdr, &file_reader); + const base = load_result.base; const entry = ehdr.entry + if (ehdr.type == .DYN) base else 0; 
log.info("Executable loaded: base=0x{x}, entry=0x{x}", .{ base, entry }); - try patchLoadedElf(base); + try patcher.address_allocator.block(.fromPtr(@ptrFromInt(base), load_result.size)); + try patchLoadedElf(load_result.base); // Check for dynamic linker var maybe_interp_base: ?usize = null; @@ -96,13 +105,15 @@ pub fn main() !void { var interp_reader = interp.reader(&interp_buffer); const interp_ehdr = try elf.Header.read(&interp_reader.interface); assert(interp_ehdr.type == elf.ET.DYN); - const interp_base = try loadStaticElf(interp_ehdr, &interp_reader); + const interp_result = try loader.loadStaticElf(interp_ehdr, &interp_reader); + const interp_base = interp_result.base; maybe_interp_base = interp_base; maybe_interp_entry = interp_ehdr.entry + if (interp_ehdr.type == .DYN) interp_base else 0; log.info( "Interpreter loaded: base=0x{x}, entry=0x{x}", .{ interp_base, maybe_interp_entry.? }, ); + try patcher.address_allocator.block(.fromPtr(@ptrFromInt(interp_base), interp_result.size)); try patchLoadedElf(interp_base); interp.close(); } @@ -118,9 +129,12 @@ pub fn main() !void { elf.AT_ENTRY => entry, elf.AT_EXECFN => @intFromPtr(std.os.argv[arg_index]), elf.AT_SYSINFO_EHDR => blk: { - log.info("Found vDSO at 0x{x}", .{auxv[i].a_un.a_val}); - try patchLoadedElf(auxv[i].a_un.a_val); - break :blk auxv[i].a_un.a_val; + const vdso_base = auxv[i].a_un.a_val; + log.info("Found vDSO at 0x{x}", .{vdso_base}); + try patchLoadedElf(vdso_base); + break :blk vdso_base; + // NOTE: We do not need to block this, because it's already done by the initial + // `/proc/self/maps` pass. }, elf.AT_EXECFD => { @panic("Got AT_EXECFD auxv value"); @@ -163,77 +177,6 @@ pub fn main() !void { trampoline(final_entry, argc); } -/// Loads all `PT_LOAD` segments of an ELF file into memory. -/// -/// For `ET_EXEC` (non-PIE), segments are mapped at their fixed virtual addresses (`p_vaddr`). -/// For `ET_DYN` (PIE), segments are mapped at a random base address chosen by the kernel. 
-/// -/// It handles zero-initialized(e.g., .bss) sections by mapping anonymous memory and only reading -/// `p_filesz` bytes from the file, ensuring `p_memsz` bytes are allocated. -fn loadStaticElf(ehdr: elf.Header, file_reader: *std.fs.File.Reader) !usize { - // NOTE: In theory we could also just look at the first and last loadable segment because the - // ELF spec mandates these to be in ascending order of `p_vaddr`, but better be safe than sorry. - // https://gabi.xinuos.com/elf/08-pheader.html#:~:text=ascending%20order - const minva, const maxva = bounds: { - var minva: u64 = std.math.maxInt(u64); - var maxva: u64 = 0; - var phdrs = ehdr.iterateProgramHeaders(file_reader); - while (try phdrs.next()) |phdr| { - if (phdr.p_type != elf.PT_LOAD) continue; - minva = @min(minva, phdr.p_vaddr); - maxva = @max(maxva, phdr.p_vaddr + phdr.p_memsz); - } - minva = mem.alignBackward(usize, minva, page_size); - maxva = mem.alignForward(usize, maxva, page_size); - log.debug("Calculated bounds: minva=0x{x}, maxva=0x{x}", .{ minva, maxva }); - break :bounds .{ minva, maxva }; - }; - - // Check, that the needed memory region can be allocated as a whole. 
We do this - const dynamic = ehdr.type == elf.ET.DYN; - log.debug("ELF type is {s}", .{if (dynamic) "DYN" else "EXEC (static)"}); - const hint = if (dynamic) null else @as(?[*]align(page_size) u8, @ptrFromInt(minva)); - log.debug("mmap pre-flight hint: {*}", .{hint}); - const base = try posix.mmap( - hint, - maxva - minva, - posix.PROT.WRITE, - .{ .TYPE = .PRIVATE, .ANONYMOUS = true, .FIXED_NOREPLACE = !dynamic }, - -1, - 0, - ); - log.debug("Pre-flight reservation at: {*}, size: 0x{x}", .{ base.ptr, base.len }); - - var phdrs = ehdr.iterateProgramHeaders(file_reader); - var phdr_idx: u32 = 0; - errdefer posix.munmap(base); - while (try phdrs.next()) |phdr| : (phdr_idx += 1) { - if (phdr.p_type != elf.PT_LOAD) continue; - if (phdr.p_memsz == 0) continue; - - const offset = phdr.p_vaddr & (page_size - 1); - const size = mem.alignForward(usize, phdr.p_memsz + offset, page_size); - var start = mem.alignBackward(usize, phdr.p_vaddr, page_size); - const base_for_dyn = if (dynamic) @intFromPtr(base.ptr) else 0; - start += base_for_dyn; - log.debug( - " - phdr[{}]: mapping 0x{x} - 0x{x} (vaddr=0x{x}, dyn_base=0x{x})", - .{ phdr_idx, start, start + size, phdr.p_vaddr, base_for_dyn }, - ); - const ptr: []align(page_size) u8 = @as([*]align(page_size) u8, @ptrFromInt(start))[0..size]; - // TODO: we should likely just use mmap instead because then not touched memory isn't loaded - // unnecessarily - try file_reader.seekTo(phdr.p_offset); - if (try file_reader.read(ptr[offset..][0..phdr.p_filesz]) != phdr.p_filesz) - return UnfinishedReadError.UnfinishedRead; - - const protections = elfToMmapProt(phdr.p_flags); - try posix.mprotect(ptr, protections); - } - log.debug("loadElf returning base: 0x{x}", .{@intFromPtr(base.ptr)}); - return @intFromPtr(base.ptr); -} - fn patchLoadedElf(base: usize) !void { const ehdr = @as(*const elf.Ehdr, @ptrFromInt(base)); if (!mem.eql(u8, ehdr.e_ident[0..4], elf.MAGIC)) return error.InvalidElfMagic; @@ -263,20 +206,11 @@ fn patchLoadedElf(base: 
usize) !void { const region = @as([*]align(page_size) u8, @ptrFromInt(page_start))[0..size]; - try Patcher.patchRegion(region); - try posix.mprotect(region, elfToMmapProt(phdr.p_flags)); + try patcher.patchRegion(region); + try posix.mprotect(region, loader.elfToMmapProt(phdr.p_flags)); } } -/// Converts ELF program header protection flags to mmap protection flags. -fn elfToMmapProt(elf_prot: u64) u32 { - var result: u32 = posix.PROT.NONE; - if ((elf_prot & elf.PF_R) != 0) result |= posix.PROT.READ; - if ((elf_prot & elf.PF_W) != 0) result |= posix.PROT.WRITE; - if ((elf_prot & elf.PF_X) != 0) result |= posix.PROT.EXEC; - return result; -} - /// Opens the file by either opening via a (absolute or relative) path or searching through `PATH` /// for a file with the name. // TODO: support paths starting with ~ @@ -317,10 +251,50 @@ fn trampoline(entry: usize, sp: [*]usize) noreturn { unreachable; } +fn bootstrapMemoryMap(p: *Patcher) !void { + { + var min_addr: u64 = 0x10000; + if (std.fs.openFileAbsolute("/proc/sys/vm/mmap_min_addr", .{})) |file| { + defer file.close(); + var buf: [32]u8 = undefined; + if (file.readAll(&buf)) |len| { + const trimmed = std.mem.trim(u8, buf[0..len], " \n\r\t"); + if (std.fmt.parseInt(u64, trimmed, 10)) |val| { + min_addr = val; + } else |_| {} + } else |_| {} + } else |_| {} + try p.address_allocator.block(.{ .start = 0, .end = @intCast(min_addr) }); + } + + { + var maps_file = try std.fs.openFileAbsolute("/proc/self/maps", .{}); + defer maps_file.close(); + var buf: [512]u8 = undefined; + var reader = maps_file.reader(&buf); + while (true) { + const line = reader.interface.takeDelimiterInclusive('\n') catch |err| switch (err) { + error.EndOfStream => break, + error.ReadFailed => |e| return reader.err orelse e, + else => |e| return e, + }; + std.debug.print("{s}", .{line}); + const dash = mem.indexOfScalar(u8, line, '-') orelse continue; + const space = mem.indexOfScalar(u8, line, ' ') orelse continue; + assert(space > dash); + const 
start = std.fmt.parseInt(u64, line[0..dash], 16) catch unreachable; + const end = std.fmt.parseInt(u64, line[dash + 1 .. space], 16) catch unreachable; + // TODO: remove when Range is `u64` + try p.address_allocator.block(.{ + .start = @as(u63, @truncate(start)), + .end = @as(u63, @truncate(end)), + }); + } + } +} + test { - _ = @import("AddressAllocator.zig"); - _ = @import("Range.zig"); - _ = @import("PatchLocationIterator.zig"); + _ = @import("Patcher.zig"); } // TODO: make this be passed in from the build system diff --git a/src/relocation.zig b/src/relocation.zig new file mode 100644 index 0000000..d0f6555 --- /dev/null +++ b/src/relocation.zig @@ -0,0 +1,98 @@ +const dis = @import("disassembler.zig"); +const std = @import("std"); +const math = std.math; +const mem = std.mem; +const zydis = @import("zydis").zydis; + +const assert = std.debug.assert; + +pub const RelocInfo = struct { + instr: dis.BundledInstruction, + old_addr: u64, +}; + +/// Fixes RIP-relative operands in an instruction that has been moved to a new address. +pub fn relocateInstruction( + instruction: dis.BundledInstruction, + address: u64, + buffer: []u8, +) !void { + const instr = instruction.instruction; + // Iterate all operands + for (0..instr.operand_count) |i| { + const operand = &instruction.operands[i]; + + // Check for RIP-relative memory operand + const is_rip_rel = operand.type == zydis.ZYDIS_OPERAND_TYPE_MEMORY and + operand.unnamed_0.mem.base == zydis.ZYDIS_REGISTER_RIP; + // Check for relative immediate (e.g. 
JMP rel32) + const is_rel_imm = operand.type == zydis.ZYDIS_OPERAND_TYPE_IMMEDIATE and + operand.unnamed_0.imm.is_relative == zydis.ZYAN_TRUE; + if (!is_rip_rel and !is_rel_imm) continue; + + // We have to apply a relocation + var result_address: u64 = 0; + const status = zydis.ZydisCalcAbsoluteAddress( + instr, + operand, + instruction.address, + &result_address, + ); + assert(zydis.ZYAN_SUCCESS(status)); // TODO: maybe return an error instead + + // Calculate new displacement relative to the new address + // The instruction length remains the same. + const next_rip: i64 = @intCast(address + instr.length); + const new_disp = @as(i64, @intCast(result_address)) - next_rip; + + var offset: u16 = 0; + var size_bits: u8 = 0; + + if (is_rip_rel) { + offset = instr.raw.disp.offset; + size_bits = instr.raw.disp.size; + } else { + assert(is_rel_imm); + // For relative immediate, find the matching raw immediate. + var found = false; + for (&instr.raw.imm) |*imm| { + if (imm.is_relative == zydis.ZYAN_TRUE) { + offset = imm.offset; + size_bits = imm.size; + found = true; + break; + } + } + assert(found); + } + + assert(offset != 0); + assert(size_bits != 0); + const size_bytes = size_bits / 8; + + if (offset + size_bytes > buffer.len) { + return error.RelocationFail; + } + + const fits = switch (size_bits) { + 8 => new_disp >= math.minInt(i8) and new_disp <= math.maxInt(i8), + 16 => new_disp >= math.minInt(i16) and new_disp <= math.maxInt(i16), + 32 => new_disp >= math.minInt(i32) and new_disp <= math.maxInt(i32), + 64 => true, + else => unreachable, + }; + + if (!fits) { + return error.RelocationOverflow; + } + + const ptr = buffer[offset..]; + switch (size_bits) { + 8 => ptr[0] = @as(u8, @bitCast(@as(i8, @intCast(new_disp)))), + 16 => mem.writeInt(u16, ptr[0..2], @bitCast(@as(i16, @intCast(new_disp))), .little), + 32 => mem.writeInt(u32, ptr[0..4], @bitCast(@as(i32, @intCast(new_disp))), .little), + 64 => mem.writeInt(u64, ptr[0..8], @bitCast(@as(i64, @intCast(new_disp))), 
.little), + else => unreachable, + } + } +} diff --git a/src/syscalls.zig b/src/syscalls.zig index 7e05951..40dacd7 100644 --- a/src/syscalls.zig +++ b/src/syscalls.zig @@ -1,11 +1,12 @@ const std = @import("std"); const linux = std.os.linux; const posix = std.posix; -const Patcher = @import("Patcher.zig"); -const assert = std.debug.assert; +const assert = std.debug.assert; const page_size = std.heap.pageSize(); +const main = @import("main.zig"); + const log = std.log.scoped(.syscalls); /// Represents the stack layout pushed by `syscallEntry` before calling the handler. @@ -114,7 +115,7 @@ export fn syscall_handler(ctx: *SavedContext) callconv(.c) void { // mmap addresses are always page aligned const ptr = @as([*]align(page_size) u8, @ptrFromInt(addr)); // Check if we can patch it - Patcher.patchRegion(ptr[0..len]) catch |err| { + main.patcher.patchRegion(ptr[0..len]) catch |err| { std.log.warn("JIT Patching failed: {}", .{err}); }; @@ -132,7 +133,7 @@ export fn syscall_handler(ctx: *SavedContext) callconv(.c) void { // mprotect requires addr to be page aligned. if (len > 0 and std.mem.isAligned(addr, page_size)) { const ptr = @as([*]align(page_size) u8, @ptrFromInt(addr)); - Patcher.patchRegion(ptr[0..len]) catch |err| { + main.patcher.patchRegion(ptr[0..len]) catch |err| { std.log.warn("mprotect Patching failed: {}", .{err}); }; // patchRegion leaves it R|W. @@ -250,7 +251,7 @@ fn isProcSelfExe(path: [*:0]const u8) bool { } fn handleReadlink(buf_addr: u64, buf_size: u64, ctx: *SavedContext) void { - const target = Patcher.target_exec_path; + const target = main.target_exec_path; const len = @min(target.len, buf_size); const dest = @as([*]u8, @ptrFromInt(buf_addr)); @memcpy(dest[0..len], target[0..len]);