flicker/src/AddressAllocator.zig

const std = @import("std");
const math = std.math;
const mem = std.mem;
const sort = std.sort;
const testing = std.testing;

const assert = std.debug.assert;

const Range = @import("Range.zig");
const log = std.log.scoped(.address_allocator);

const AddressAllocator = @This();

/// The **sorted** list of `Range`s that are blocked.
ranges: std.ArrayListUnmanaged(Range) = .empty,
child_allocator: mem.Allocator,

// TODO: we should likely create an init function that blocks the entire negative address space
pub fn init(child_allocator: mem.Allocator) !AddressAllocator {
    var aa: AddressAllocator = .{ .child_allocator = child_allocator };

    const ranges = try child_allocator.alloc(Range, std.heap.pageSize() / @sizeOf(Range));
    aa.ranges = .initBuffer(ranges);

    aa.block(.fromSlice(Range, ranges)) catch unreachable;

    return aa;
}

pub fn deinit(self: *AddressAllocator) void {
    self.ranges.deinit(self.child_allocator);
}

pub fn allocator(self: *AddressAllocator) mem.Allocator {
    return .{
        .ptr = self,
        .vtable = &.{
            .alloc = alloc,
            .resize = resize,
            .remap = remap,
            .free = free,
        },
    };
}

fn alloc(ctx: *anyopaque, n: usize, alignment: std.mem.Alignment, ra: usize) ?[*]u8 {
    const self: *AddressAllocator = @ptrCast(@alignCast(ctx));

    const ptr = self.child_allocator.rawAlloc(n, alignment, ra) orelse return null;
    self.block(.fromPtr(ptr, n)) catch @panic("OOM");
    return ptr;
}

fn resize(
    ctx: *anyopaque,
    buf: []u8,
    alignment: std.mem.Alignment,
    new_len: usize,
    ret_addr: usize,
) bool {
    const self: *AddressAllocator = @ptrCast(@alignCast(ctx));

    const success = self.child_allocator.rawResize(buf, alignment, new_len, ret_addr);
    if (success) {
        self.block(.fromPtr(buf.ptr, new_len)) catch @panic("OOM");
    }
    return success;
}

fn remap(
    context: *anyopaque,
    memory: []u8,
    alignment: std.mem.Alignment,
    new_len: usize,
    return_address: usize,
) ?[*]u8 {
    const self: *AddressAllocator = @ptrCast(@alignCast(context));

    const ptr = self.child_allocator.rawRemap(memory, alignment, new_len, return_address) orelse
        return null;

    if (ptr != memory.ptr) { // new memory location
        self.unblock(.fromSlice(u8, memory)) catch @panic("OOM");
    }
    self.block(.fromPtr(ptr, new_len)) catch @panic("OOM");
    return ptr;
}

fn free(
    ctx: *anyopaque,
    buf: []u8,
    alignment: std.mem.Alignment,
    ret_addr: usize,
) void {
    const self: *AddressAllocator = @ptrCast(@alignCast(ctx));

    self.unblock(.fromSlice(u8, buf)) catch @panic("OOM");
    return self.child_allocator.rawFree(buf, alignment, ret_addr);
}

/// Block a range to not be used by the `allocate` function. This function will always succeed, if
/// there is enough memory available.
pub fn block(self: *AddressAllocator, range: Range) !void {
    if (range.size() == 0) return;

    // Find the correct sorted position to insert the new range.
    const insert_idx = sort.lowerBound(
        Range,
        self.ranges.items,
        range,
        Range.compareTouching,
    );
    log.debug(
        "block: range: {f}, insert_idx: {}",
        .{ range, insert_idx },
    );
    // If we don't overlap any existing one, we just insert.
    if (insert_idx == self.ranges.items.len or
        self.ranges.items[insert_idx].compareTouching(range) == .gt)
    {
        return self.ranges.insert(self.child_allocator, insert_idx, range);
    }
    errdefer comptime unreachable;
    assert(self.ranges.items.len > 0);

    // Now `insert_idx` points to the first entry, that touches `range`.
    const first = &self.ranges.items[insert_idx];
    assert(first.touches(range));
    if (insert_idx > 0 and self.ranges.items.len > 0) {
        assert(!self.ranges.items[insert_idx - 1].touches(range));
    }
    log.debug("block: `range` touches at least one existing range.", .{});

    first.start = @min(first.start, range.start);
    first.end = @max(first.end, range.end);

    // Merge any following overlapping ranges into this one.
    // NOTE: We "iterate" through the slice by removing unneeded items and moving all following ones
    // back by one. That's why we always look at `insert_idx + 1`.
    while (insert_idx + 1 < self.ranges.items.len and
        self.ranges.items[insert_idx + 1].touches(range))
    {
        const neighbor = self.ranges.items[insert_idx + 1];
        assert(range.end >= neighbor.start);
        assert(range.start <= neighbor.start);
        first.end = @max(first.end, neighbor.end);
        _ = self.ranges.orderedRemove(insert_idx + 1);
    }
}

pub fn unblock(
    self: *AddressAllocator,
    range: Range,
) !void {

    // Find the correct sorted position to remove the range.
    var remove_idx = sort.lowerBound(
        Range,
        self.ranges.items,
        range,
        Range.compareOverlapping,
    );
    log.debug(
        "unblock: range: {f}, remove_idx: {}",
        .{ range, remove_idx },
    );
    // If we don't overlap any existing one, we just return.
    if (remove_idx == self.ranges.items.len or
        self.ranges.items[remove_idx].compareOverlapping(range) == .gt)
    {
        log.debug("unblock: Range to unblock overlaps nothing", .{});
        for (self.ranges.items) |r| {
            assert(!r.overlaps(range));
        }
        return;
    }
    assert(self.ranges.items.len > 0);

    // Now `remove_idx` points to the first entry, that touches `range`.
    const first = &self.ranges.items[remove_idx];
    assert(first.touches(range));
    if (remove_idx > 0 and self.ranges.items.len > 0) {
        assert(!self.ranges.items[remove_idx - 1].overlaps(range));
    }
    log.debug("unblock: `range` touches at least one existing range.", .{});

    // We have multiple cases for the first touching range:
    //
    //          [ range to unblock ]
    // 0   [           first            ]   -> split
    //
    //          [ range to unblock ]
    // 1              [     first       ]
    // 1        [         first         ]   -> change start
    //
    //          [ range to unblock ]
    // 2        [ first ]
    // 2             [ first ]
    // 2                   [ first ]        -> remove
    //
    //          [ range to unblock ]
    // 3   [     first       ]
    // 3   [         first         ]        -> change end
    //
    // If it's cases 0 or 1 the operation is finished because we can't overlap another one. For cases 2
    // and 3 we will have to remove the following ranges until we arrive at one of the following cases:
    // 1.
    //          [ range to unblock ]
    //                         [ last ]
    // 2.
    //          [ range to unblock ]
    //                               [ last ]
    //
    if (first.start < range.start and first.end > range.end) {
        const old_end = first.end;
        first.end = range.start;
        try self.ranges.insert(self.child_allocator, remove_idx + 1, .{
            .start = range.end,
            .end = old_end,
        });
        return;
    } else if (first.start >= range.start and first.start < range.end and first.end > range.end) {
        first.start = range.end;
        return;
    } else if (first.start >= range.start and first.end <= range.end) {
        _ = self.ranges.orderedRemove(remove_idx);
    } else if (first.start < range.start and first.end > range.start and first.end <= range.end) {
        first.end = range.start;
        remove_idx += 1;
    } else {
        unreachable;
    }

    // NOTE: We "iterate" through the slice by removing unneeded items and moving all following ones
    // back by one. That's why we always look at `insert_idx + 1`.
    while (remove_idx < self.ranges.items.len) {
        const next_range = &self.ranges.items[remove_idx];
        if (next_range.start >= range.end) break;

        if (next_range.end <= range.end) {
            _ = self.ranges.orderedRemove(remove_idx);
        } else {
            next_range.start = range.end;
            break;
        }
    }
}

test "fuzz against bitset" {
    const iterations = 64 * 1024;
    const size = 1024;

    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    var bitset_ref = try std.bit_set.DynamicBitSetUnmanaged.initEmpty(testing.allocator, size);
    defer bitset_ref.deinit(testing.allocator);

    var prng = std.Random.DefaultPrng.init(testing.random_seed);
    const random = prng.random();

    var expected_ranges = try std.ArrayListUnmanaged(Range).initCapacity(testing.allocator, size / 2);
    defer expected_ranges.deinit(testing.allocator);

    var bitset_temp = try std.bit_set.DynamicBitSetUnmanaged.initEmpty(testing.allocator, size);
    defer bitset_temp.deinit(testing.allocator);

    for (0..iterations) |_| {
        const is_block = random.boolean();
        const start = random.intRangeLessThan(usize, 0, size);
        const len = random.intRangeAtMost(usize, 1, size - start);
        const end = start + len;

        const range = Range{ .start = @intCast(start), .end = @intCast(end) };

        if (is_block) {
            try aa.block(range);
            bitset_ref.setRangeValue(.{ .start = start, .end = end }, true);
        } else {
            try aa.unblock(range);
            bitset_ref.setRangeValue(.{ .start = start, .end = end }, false);
        }

        bitset_temp.unsetAll();
        for (aa.ranges.items) |r| {
            bitset_temp.setRangeValue(.{ .start = @intCast(r.start), .end = @intCast(r.end) }, true);
        }
        try testing.expect(bitset_ref.eql(bitset_temp));
    }
}

/// An internal iterator that cleanly yields unblocked memory holes.
const HoleIterator = struct {
    ranges: []const Range,
    valid_range: Range,
    size: i64,
    candidate_start: i64,
    idx: usize,

    fn init(aa: *const AddressAllocator, valid_range: Range, size: u64) HoleIterator {
        const start_idx = sort.lowerBound(
            Range,
            aa.ranges.items,
            valid_range,
            Range.compareOverlapping,
        );
        return .{
            .ranges = aa.ranges.items,
            .valid_range = valid_range,
            .size = @intCast(size),
            .candidate_start = valid_range.start,
            .idx = start_idx,
        };
    }

    fn next(self: *HoleIterator) ?Range {
        while (self.idx < self.ranges.len) {
            const reserved = self.ranges[self.idx];
            if (self.candidate_start >= self.valid_range.end) return null;

            if (self.candidate_start < reserved.start) {
                const hole_end = @min(reserved.start, self.valid_range.end);
                const hole_start = self.candidate_start;
                self.candidate_start = reserved.end;

                if (hole_end >= hole_start + self.size) {
                    return Range{ .start = hole_start, .end = hole_end };
                }
            } else {
                self.candidate_start = @max(self.candidate_start, reserved.end);
            }
            self.idx += 1;
        }

        if (self.candidate_start < self.valid_range.end) {
            const hole_start = self.candidate_start;
            const hole_end = self.valid_range.end;
            self.candidate_start = self.valid_range.end; // Mark done to prevent infinite loops
            if (hole_end >= hole_start + self.size) {
                return Range{ .start = hole_start, .end = hole_end };
            }
        }

        return null;
    }

    test {
        var aa = AddressAllocator{ .child_allocator = testing.allocator };
        defer aa.deinit();

        try aa.block(.{ .start = 100, .end = 200 });
        try aa.block(.{ .start = 300, .end = 400 });

        var it = HoleIterator.init(&aa, .{ .start = 0, .end = 500 }, 10);

        try testing.expectEqual(Range{ .start = 0, .end = 100 }, it.next().?);
        try testing.expectEqual(Range{ .start = 200, .end = 300 }, it.next().?);
        try testing.expectEqual(Range{ .start = 400, .end = 500 }, it.next().?);
        try testing.expectEqual(null, it.next());
    }
};

const Constraint = struct {
    min_rel: i32,
    max_rel: i32,
    mask: u32,
    pattern: u32,
};

/// Solves a single 32-bit relative jump constraint in O(1) time.
///
/// Returns the smallest `rel32` such that
/// - `min_rel <= rel32 <= max_rel` and
/// - `(rel32 & mask) == pattern`
///
/// Context:
/// During "Instruction Punning", we overwrite an instruction with a 5-byte jump (`E9 xx xx xx xx`).
/// If the original instruction is smaller than 5 bytes, our jump offset (`xx xx xx xx`) will spill
/// into the next instruction. To prevent crashing, the spilled bytes must form the successor
/// instruction. This restricts certain bits/bytes of our `rel32` offset to fixed values.
///
/// The algorithm uses a bit-twiddling hack to isolate the "free" (unmasked) bits, increment them as
/// a single continuous integer, and map them back around the fixed "pattern" bits, completely
/// avoiding loops over the search space.
///
/// Visualization of the bit-twiddling constraint logic:
/// -------------------------------------------------------------------------
/// Mask:    1111 1111 0000 0000 1111 1111 0000 0000 (1 = Locked bits)
/// Pattern: 0000 0000 0000 0000 1110 1001 0000 0000 (The forced values)
/// Free:    0000 0000 1111 1111 0000 0000 1111 1111 (~Mask)
///
/// Current Candidate: [ Fixed A ] [ Free 1 ] [ Fixed B ] [ Free 0 ]
///
/// If `Current Candidate < min_rel`, we add 1 to the "Free" bits.
/// The hack `(((candidate & free) | mask) + 1) & free` allows the arithmetic carry to jump over the
/// fixed bits without corrupting them:
///
/// Next Valid Val:    [ Fixed A ][ Free 1 + carry ] [ Fixed B ] [ Free 0 + 1 ]
/// -------------------------------------------------------------------------
fn solveRelativeConstraint(c: Constraint) ?i32 {
    log.debug(
        "solveRelative: min: {x}, max: {x}, mask: {x}, pattern: {x}",
        .{ c.min_rel, c.max_rel, c.mask, c.pattern },
    );
    assert((c.pattern & ~c.mask) == 0);
    if (c.min_rel > c.max_rel) return null;

    // Force the pattern onto the current minimum value
    var candidate: u32 = (@as(u32, @bitCast(c.min_rel)) & ~c.mask) | c.pattern;
    log.debug("  candidate (init): {x}", .{candidate});

    // If forcing the pattern made the value smaller than min_rel, we must increment the "free" bits
    // to find the next valid higher number.
    if (@as(i32, @bitCast(candidate)) < c.min_rel) {
        if (~c.mask == 0) {
            log.debug("  failed: fully constrained", .{});
            return null;
        }

        const incremented_free = (((candidate & ~c.mask) | c.mask) +% 1) & ~c.mask;
        assert(incremented_free & c.mask == 0); // All constrained bits are 0
        candidate = incremented_free | c.pattern;
        log.debug("  candidate (incr): {x}", .{candidate});
    }

    const result: i32 = @bitCast(candidate);
    if (result >= c.min_rel and result <= c.max_rel) {
        log.debug("  success: {x}", .{result});
        return result;
    }
    log.debug("  failed: result {x} out of bounds", .{result});
    return null;
}

test "solveRelativeConstraint basic" {
    try testing.expectEqual(100, solveRelativeConstraint(.{
        .min_rel = 100,
        .max_rel = 200,
        .mask = 0,
        .pattern = 0,
    }));
}

test "solveRelativeConstraint aligned" {
    try testing.expectEqual(0x10E8, solveRelativeConstraint(.{
        .min_rel = 0x1000,
        .max_rel = 0x2000,
        .mask = 0xFF,
        .pattern = 0xE8,
    }));
    try testing.expectEqual(0x10E8, solveRelativeConstraint(.{
        .min_rel = 0x10E8,
        .max_rel = 0x2000,
        .mask = 0xFF,
        .pattern = 0xE8,
    }));
    try testing.expectEqual(0x11E8, solveRelativeConstraint(.{
        .min_rel = 0x10E9,
        .max_rel = 0x2000,
        .mask = 0xFF,
        .pattern = 0xE8,
    }));
}

test "solveRelativeConstraint negative" {
    try testing.expectEqual(@as(i32, @bitCast(@as(u32, 0xFFFFF0E8))), solveRelativeConstraint(.{
        .min_rel = -0x1000,
        .max_rel = 0,
        .mask = 0xFF,
        .pattern = 0xE8,
    }));
}

test "solveRelativeConstraint impossible" {
    try testing.expectEqual(null, solveRelativeConstraint(.{
        .min_rel = 0x1000,
        .max_rel = 0x10E7,
        .mask = 0xFF,
        .pattern = 0xE8,
    }));
    try testing.expectEqual(null, solveRelativeConstraint(.{
        .min_rel = 0x10000000,
        .max_rel = 0x11000000,
        .mask = 0xFFFFFFFF,
        .pattern = 0x12345678,
    }));
}

test "solveRelativeConstraint overflow" {
    try testing.expectEqual(0x12345678, solveRelativeConstraint(.{
        .min_rel = 0x10000000,
        .max_rel = 0x20000000,
        .mask = 0xFFFFFFFF,
        .pattern = 0x12345678,
    }));

    try testing.expectEqual(null, solveRelativeConstraint(.{
        .min_rel = 2147483640,
        .max_rel = 2147483647,
        .mask = 0xFF,
        .pattern = 0x00,
    }));
}

pub const Request = struct {
    source: u64,
    size: u64,
    valid_range: Range,
    mask: u32 = 0,
    pattern: u32 = 0,
};

/// Finds the first free range of `size` bytes within `valid_range` that also satisfies the relative
/// 32-bit jump constraints `mask` and `pattern` from `jump_source`.
/// Runs in `O(|H| + log(#R))` for
/// - `H` being the set of holes in the valid range and
/// - `#R` being the number of ranges in the AddressAllocator.
pub fn findAllocation(
    self: *AddressAllocator,
    r: Request,
) ?Range {
    if (r.valid_range.size() < r.size) return null;
    if (r.size == 0) return null;

    var it = HoleIterator.init(self, r.valid_range, r.size);
    while (it.next()) |hole| {
        log.debug("findAllocation: Hole: {f}", .{hole});
        const bounds = getRelativeBounds(hole, @intCast(r.size), r.source) orelse continue;
        const rel32 = solveRelativeConstraint(.{
            .min_rel = bounds.min,
            .max_rel = bounds.max,
            .mask = r.mask,
            .pattern = r.pattern,
        }) orelse continue;

        const start = @as(i64, @intCast(r.source)) + rel32;
        const end = start + @as(i64, @intCast(r.size));

        assert(end - start == r.size);
        assert(start >= r.valid_range.start);
        assert(end <= r.valid_range.end);
        return .{ .start = start, .end = end };
    }

    return null;
}

fn getRelativeBounds(hole: Range, size: i64, source: u64) ?struct { min: i32, max: i32 } {
    if (hole.end - hole.start < size) return null;

    const offset_to_min = hole.start - @as(i64, @intCast(source));
    const offset_to_max = (hole.end - size) - @as(i64, @intCast(source));

    const min_rel = @max(offset_to_min, math.minInt(i32));
    const max_rel = @min(offset_to_max, math.maxInt(i32));
    if (min_rel > max_rel) return null;

    return .{
        .min = @intCast(min_rel),
        .max = @intCast(max_rel),
    };
}

test "findConstrainedAllocation" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    try aa.block(.{ .start = 0x1000, .end = 0x2000 });
    try aa.block(.{ .start = 0x3000, .end = 0x4000 });

    try testing.expectEqual(
        Range{ .start = 0x00AA, .end = 0x00BA },
        aa.findAllocation(.{
            .size = 0x10,
            .valid_range = .{ .start = 0x0000, .end = 0x4000 },
            .source = 0,
            .mask = 0xFF,
            .pattern = 0xAA,
        }),
    );

    try testing.expectEqual(
        Range{ .start = 0x20AA, .end = 0x20BA },
        aa.findAllocation(.{
            .size = 0x10,
            .valid_range = .{ .start = 0x1000, .end = 0x4000 },
            .source = 0,
            .mask = 0xFF,
            .pattern = 0xAA,
        }),
    );

    try testing.expectEqual(
        null,
        aa.findAllocation(.{
            .size = 0x10,
            .valid_range = .{ .start = 0x2000, .end = 0x8000 },
            .source = 0,
            .mask = 0xFFFF,
            .pattern = 0xAAAA,
        }),
    );

    try testing.expectEqual(
        Range{ .start = 0x40AA, .end = 0x50AA },
        aa.findAllocation(.{
            .size = 0x1000,
            .valid_range = .{ .start = 0x2000, .end = 0x8000 },
            .source = 0,
            .mask = 0xFF,
            .pattern = 0xAA,
        }),
    );
}

pub const CoupledResult = struct {
    rel1: i32,
    rel2: i32,
};

/// Attempts to find a joint bit-pattern that satisfies two overlapping jump constraints.
///
/// Context:
/// In tactics like Successor Eviction, we overwrite two adjacent instructions with 5-byte jumps (J1
/// and J2). If the distance between them is less than 5 bytes, their physical bytes overlap in
/// memory.
///
/// `k` represents the physical distance (in bytes) between the start of J1 and J2 (1 <= k <= 4).
/// Because x86_64 uses Little-Endian representation, the Most Significant Bytes (MSB) of J1's
/// relative offset (`rel1`) physically overlap with the Least Significant Bytes (LSB) of J2's
/// relative offset (`rel2`).
///
/// Furthermore, J2's opcode (`0xE9`) falls squarely inside the bytes of `rel1`.
///
/// Memory Layout & Endianness Overlap (Example where K = 2):
/// -----------------------------------------------------------------------------------
/// Memory Offset:   +0       +1       +2       +3       +4       +5       +6
/// J1 Bytes:       [0xE9]   [ X0 ]   [ X1 ]   [ X2 ]   [ X3 ]
/// J2 Bytes:                         [0xE9]   [ Y0 ]   [ Y1 ]   [ Y2 ]   [ Y3 ]
///
/// Consequences for `rel1` (X) and `rel2` (Y):
/// 1. Opcode Constraint:  `X1` MUST exactly equal `0xE9`.
/// 2. Shared Bytes (S):   `X2` MUST exactly equal `Y0`.
///                        `X3` MUST exactly equal `Y1`.
/// -----------------------------------------------------------------------------------
///
/// Algorithm ("The Squeeze"):
/// Iterating possibly billions of combinations of X and Y is too slow. Instead, we use the
/// constraints of the memory layout:
///
/// `rel1` is constrained to a physical memory hole `[min1, max1]`. Because memory holes are usually
/// small (e.g., 4KB), the Most Significant Bytes of `rel1` (which are exactly our Shared Bytes 'S')
/// are heavily restricted.
///
/// There are usually only a few possible values for S:
/// 1. We extract the possible values for S from `min1..max1`.
/// 2. We apply S as a strict constraint on the lower bytes of `rel2`.
/// 3. We delegate the remaining independent bits (X0, Y2 and Y3) to the `solveRelativeConstraint`.
///
/// Parameters:
/// `k`: The physical byte offset of J2 relative to J1 (1 <= k <= 4).
/// `min1`, `max1`: The valid rel32 hardware bounds for J1.
/// `min2`, `max2`: The valid rel32 hardware bounds for J2.
/// `mask1`, `pattern1`: The original byte constraints on J1.
/// `mask2`, `pattern2`: The original byte constraints on J2.
pub fn solveCoupledConstraint(
    k: u8,
    c1: Constraint,
    c2: Constraint,
) ?CoupledResult {
    log.debug("solveCoupled: k={}", .{k});
    log.debug("  C1: min={x} max={x} mask={x} pat={x}", .{ c1.min_rel, c1.max_rel, c1.mask, c1.pattern });
    log.debug("  C2: min={x} max={x} mask={x} pat={x}", .{ c2.min_rel, c2.max_rel, c2.mask, c2.pattern });
    assert(k >= 1);
    assert(k <= 4);

    // The opcode for J2 (0xE9) physically falls inside rel32 of J1 at byte index `k - 1` of rel1.
    const e9_shift = @as(u5, @intCast(k - 1)) * 8;
    const e9_mask = @as(u32, 0xFF) << e9_shift;

    if ((c1.mask & e9_mask) != 0 and (c1.pattern & e9_mask) != (@as(u32, 0xE9) << e9_shift)) {
        log.debug("  failed: opcode 0xE9 conflict in C1", .{});
        return null; // Caller's pattern conflicts with the mandatory J2 opcode
    }
    const c_mask1 = c1.mask | e9_mask;
    const c_pattern1 = (c1.pattern & ~e9_mask) | (@as(u32, 0xE9) << e9_shift);

    if (k == 4) {
        // J1 is completely resolved just with the 0xE9 constraint applied above.
        log.debug("  Fast path K=4", .{});
        const rel1 = solveRelativeConstraint(.{
            .min_rel = c1.min_rel,
            .max_rel = c1.max_rel,
            .mask = c_mask1,
            .pattern = c_pattern1,
        }) orelse return null;
        const rel2 = solveRelativeConstraint(.{
            .min_rel = c2.min_rel,
            .max_rel = c2.max_rel,
            .mask = c2.mask,
            .pattern = c2.pattern,
        }) orelse return null;
        return .{ .rel1 = rel1, .rel2 = rel2 };
    }

    // Determine the bitwise shift and mask for the Shared Bytes (S)
    const s_shift = @as(u5, @intCast(k)) * 8;
    const num_shared = @as(u5, @intCast(4 - k));
    const s_mask = (@as(u32, 1) << (num_shared * 8)) - 1;

    log.debug("  Shared Bytes: shift={}, mask={x}", .{ s_shift, s_mask });

    var current_min = c1.min_rel;
    while (current_min <= c1.max_rel) {
        const u_rel: u32 = @bitCast(current_min);
        const S = u_rel >> s_shift; // Extract shared bytes from top of rel1

        // Calculate the maximum u32 value that shares this S
        const max_u_rel_for_S = (S << s_shift) | ((@as(u32, 1) << s_shift) - 1);
        const max_i_rel_for_S: i32 = @bitCast(max_u_rel_for_S);
        const local_max1 = @min(c1.max_rel, max_i_rel_for_S);

        // Does this S conflict with J2's requirements?
        if ((c2.mask & s_mask) != 0) {
            if ((c2.pattern & c2.mask & s_mask) != (S & c2.mask & s_mask)) {
                // Advance to the next block of S.
                log.debug("  Conflict at S={x} (min={x})", .{ S, current_min });
                if (max_i_rel_for_S == std.math.maxInt(i32)) break;
                const next_min = max_i_rel_for_S + 1;
                if (next_min > c1.max_rel) break;
                current_min = next_min;
                continue;
            }
        }

        log.debug("  Trying S={x} range [{x}, {x}]", .{ S, current_min, local_max1 });

        // Apply S as a strict constraint on the lowest bytes of J2
        const c_mask2 = c2.mask | s_mask;
        const c_pattern2 = (c2.pattern & ~s_mask) | S;

        // O(1) solver execution for this specific S value
        const opt_rel1 = solveRelativeConstraint(.{
            .min_rel = current_min,
            .max_rel = local_max1,
            .mask = c_mask1,
            .pattern = c_pattern1,
        });
        const opt_rel2 = solveRelativeConstraint(.{
            .min_rel = c2.min_rel,
            .max_rel = c2.max_rel,
            .mask = c_mask2,
            .pattern = c_pattern2,
        });
        if (opt_rel1 != null and opt_rel2 != null) {
            log.debug("  Success: rel1={x} rel2={x}", .{ opt_rel1.?, opt_rel2.? });
            return .{ .rel1 = opt_rel1.?, .rel2 = opt_rel2.? };
        }

        if (max_i_rel_for_S == std.math.maxInt(i32)) break;
        const next_min = max_i_rel_for_S + 1;
        if (next_min > c1.max_rel) break;
        current_min = next_min;
    }

    log.debug("  failed: no coupled solution found", .{});
    return null;
}

test "solveCoupledConstraint K=4 (Independent)" {
    // If K=4, J1 and J2 don't share rel32 bytes, but byte 3 of rel1 MUST be 0xE9 (the J2 opcode).
    // Let's force rel1 to be in[0x12000000, 0x120000FF].
    // Since highest byte (byte 3) must be 0xE9, no value starting with 0x12 will work.
    try testing.expectEqual(null, solveCoupledConstraint(
        4,
        .{
            .min_rel = 0x12000000,
            .max_rel = 0x120000FF,
            .mask = 0,
            .pattern = 0,
        },
        .{
            .min_rel = 0,
            .max_rel = 100,
            .mask = 0,
            .pattern = 0,
        },
    ));

    const res = solveCoupledConstraint(
        4,
        .{
            .min_rel = @bitCast(@as(u32, 0xE8000000)),
            .max_rel = @bitCast(@as(u32, 0xEA000000)),
            .mask = 0,
            .pattern = 0,
        },
        .{
            .min_rel = 0x1234,
            .max_rel = 0x1234,
            .mask = 0,
            .pattern = 0,
        },
    );
    try testing.expect(res != null);
    try testing.expectEqual(@as(i32, @bitCast(@as(u32, 0xE9000000))), res.?.rel1);
    try testing.expectEqual(0x1234, res.?.rel2);
}

test "solveCoupledConstraint K=2 (2 byte overlap)" {
    // K=2 means the top 2 bytes of rel1 are the bottom 2 bytes of rel2.
    // J2 opcode (0xE9) sits at byte 1 of rel1.
    const res = solveCoupledConstraint(
        2,
        .{
            .min_rel = 0x12340000,
            .max_rel = 0x1234FFFF,
            .mask = 0,
            .pattern = 0,
        },
        .{
            .min_rel = 0x00000000,
            .max_rel = 0x0000FFFF,
            .mask = 0,
            .pattern = 0,
        },
    );
    try testing.expect(res != null);
    try testing.expectEqual(0x1234E900, res.?.rel1);
    try testing.expectEqual(0x00001234, res.?.rel2);
}

test "solveCoupledConstraint K=2 conflict" {
    // Same as above, but J2 explicitly forbids lower bytes from being 0x1234.
    const res = solveCoupledConstraint(
        2,
        .{
            .min_rel = 0x12340000,
            .max_rel = 0x1234FFFF,
            .mask = 0,
            .pattern = 0,
        },
        .{
            .min_rel = 0x00000000,
            .max_rel = 0x0000FFFF,
            .mask = 0x0000FFFF,
            .pattern = 0x00005678,
        },
    );
    try testing.expectEqual(null, res);
}

test "solveCoupledConstraint K=2 spans multiple S values" {
    // We give J1 a wide range:[0x00000000, 0x00060000]. S can be 0 to 6.
    // We force J2 to require lower bytes = 0x0004. This forces the solver to skip S=0 and similar
    // and find S=4.
    const res = solveCoupledConstraint(
        2,
        .{
            .min_rel = 0,
            .max_rel = 0x00060000,
            .mask = 0,
            .pattern = 0,
        },
        .{
            .min_rel = 0,
            .max_rel = 0x0000FFFF,
            .mask = 0x0000FFFF,
            .pattern = 0x00000004,
        },
    );
    try testing.expect(res != null);
    try testing.expectEqual(0x0004E900, res.?.rel1);
    try testing.expectEqual(0x00000004, res.?.rel2);
}

/// Finds two allocations that simultaneously satisfy their individual offset constraints and the
/// physical overlap constraints of their origin instructions.
/// `r1` (for J1) and `r2` (for J2) separated by `k` bytes.
///
/// Runs in O(|H1| * |H2| + log(#R)) for
/// - `H1` and `H2` being the set of holes in the valid ranges in `r1` and `r2`
/// - `#R` being the number of ranges in the AddressAllocator.
pub fn findCoupledAllocation(
    self: *AddressAllocator,
    k: u8,
    r1: Request,
    r2: Request,
) ?[2]Range {
    if (r1.valid_range.size() < r1.size or r1.size == 0) return null;
    if (r2.valid_range.size() < r2.size or r2.size == 0) return null;
    assert(r2.source > r1.source);
    assert(r2.source - r1.source == k);

    var it1 = HoleIterator.init(self, r1.valid_range, r1.size);
    while (it1.next()) |hole1| {
        log.debug("findCoupledAllocation: Hole1: {f}", .{hole1});
        const b1 = getRelativeBounds(hole1, @intCast(r1.size), r1.source) orelse continue;

        var it2 = HoleIterator.init(self, r2.valid_range, r2.size);
        while (it2.next()) |hole2| {
            log.debug("  Hole2: {f}", .{hole2});
            const b2 = getRelativeBounds(hole2, @intCast(r2.size), r2.source) orelse continue;

            const c1 = Constraint{
                .min_rel = b1.min,
                .max_rel = b1.max,
                .mask = r1.mask,
                .pattern = r1.pattern,
            };
            const c2 = Constraint{
                .min_rel = b2.min,
                .max_rel = b2.max,
                .mask = r2.mask,
                .pattern = r2.pattern,
            };

            if (solveCoupledConstraint(k, c1, c2)) |result| {
                const start1 = @as(i64, @intCast(r1.source)) + result.rel1;
                const end1 = start1 + @as(i64, @intCast(r1.size));

                const start2 = @as(i64, @intCast(r2.source)) + result.rel2;
                const end2 = start2 + @as(i64, @intCast(r2.size));

                assert(end1 - start1 == r1.size);
                assert(end2 - start2 == r2.size);

                // If we used the same hole, we must ensure the actual allocations don't overlap.
                const range1 = Range{ .start = start1, .end = end1 };
                const range2 = Range{ .start = start2, .end = end2 };
                // TODO: Support allocating both trampolines in the exact same memory hole.
                // This requires dynamically partitioning the hole so the trampolines don't overlap
                // each other. For now, simply skip this case.
                if (range1.overlaps(range2)) continue;

                return [2]Range{
                    .{ .start = start1, .end = end1 },
                    .{ .start = start2, .end = end2 },
                };
            }
        }
    }

    return null;
}

/// A generic helper to mechanically verify that a coupled allocation satisfies all bitwise and
/// physical overlap constraints.
fn verifyCoupled(k: u8, r1: Request, r2: Request, j1_range: Range, j2_range: Range) !void {
    const rel1: i32 = @intCast(j1_range.start - @as(i64, @intCast(r1.source)));
    const rel2: i32 = @intCast(j2_range.start - @as(i64, @intCast(r2.source)));
    const u_rel1: u32 = @bitCast(rel1);
    const u_rel2: u32 = @bitCast(rel2);

    // Opcode Constraint
    const e9_shift = @as(u5, @intCast(k - 1)) * 8;
    try testing.expectEqual(@as(u32, 0xE9), (u_rel1 >> e9_shift) & 0xFF);

    // Shared Bytes Constraint
    if (k < 4) {
        const shared_shift = @as(u5, @intCast(k)) * 8;
        const shared_mask = (@as(u32, 1) << (@as(u5, @intCast(4 - k)) * 8)) - 1;
        const shared1 = (u_rel1 >> shared_shift) & shared_mask;
        const shared2 = u_rel2 & shared_mask;
        try testing.expectEqual(shared1, shared2);
    }

    // Original User Constraints
    try testing.expectEqual(r1.pattern, u_rel1 & r1.mask);
    try testing.expectEqual(r2.pattern, u_rel2 & r2.mask);
}

test "findCoupledAllocation" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    // Block memory so we have distinct holes.
    // We need a hole that allows `rel1` to have `0xE9` in its second byte.
    // This means `rel1` needs to be around `0xE900`.
    try aa.block(.{ .start = 0x2000, .end = 0xE000 });
    try aa.block(.{ .start = 0xF000, .end = 0x10000 });

    const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x20000 } };
    const r2 = Request{ .source = 2, .size = 10, .valid_range = .{ .start = 0, .end = 0x20000 } };
    const res = aa.findCoupledAllocation(2, r1, r2);
    try testing.expect(res != null);

    const j1_range = res.?[0];
    const j2_range = res.?[1];
    try testing.expect(j1_range.start >= 0xE000 and j1_range.end <= 0xF000);
    try testing.expect(j2_range.start >= 0x0000 and j2_range.end <= 0x2000);

    try verifyCoupled(2, r1, r2, j1_range, j2_range);
}

test "findCoupledAllocation K=1 (3 shared bytes)" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    try aa.block(.{ .start = 0x2000, .end = 0x01000000 });

    const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } };
    const r2 = Request{ .source = 1, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } };
    const res = aa.findCoupledAllocation(1, r1, r2);
    try testing.expect(res != null);

    // For K=1, rel1's lowest byte MUST be 0xE9.
    // In Hole 1, the smallest valid rel1 is 0x000000E9.
    // This makes the shared bytes (top 3 bytes) 0x000000.
    try testing.expectEqual(0xE9, res.?[0].start);
    try testing.expectEqual(0x01, res.?[1].start);

    try verifyCoupled(1, r1, r2, res.?[0], res.?[1]);
}

test "findCoupledAllocation K=3 (1 shared byte)" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    // K=3 means rel1 byte 2 MUST be 0xE9. rel1 looks like 0xXXE9XXXX.
    // Smallest positive is ~0x00E90000. We need a hole there.
    try aa.block(.{ .start = 0x2000, .end = 0x00E90000 });

    const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } };
    const r2 = Request{ .source = 3, .size = 10, .valid_range = .{ .start = 0, .end = 0x10000000 } };
    const res = aa.findCoupledAllocation(3, r1, r2);
    try testing.expect(res != null);
    try verifyCoupled(3, r1, r2, res.?[0], res.?[1]);
}

test "findCoupledAllocation K=4 (Independent)" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    try aa.block(.{ .start = 0x2000, .end = 0x01000000 });

    const r1 = Request{
        .source = 0x50000000,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x60000000 },
    };
    const r2 = Request{
        .source = 0x50000004,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x60000000 },
    };

    const res = aa.findCoupledAllocation(4, r1, r2);
    try testing.expect(res != null);
    try verifyCoupled(4, r1, r2, res.?[0], res.?[1]);
}

test "findCoupledAllocation Negative Jumps (Both Backwards)" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    // We block everything except two specific holes far behind the jump source.
    try aa.block(.{ .start = 0, .end = 0x10000000 });
    try aa.block(.{ .start = 0x10010000, .end = 0x20000000 });
    try aa.block(.{ .start = 0x20010000, .end = 0x60000000 });

    const r1 = Request{
        .source = 0x50000000,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x60000000 },
    };
    const r2 = Request{
        .source = 0x50000002,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x60000000 },
    };

    // The math solver natively handles the two's complement wraparound.
    const res = aa.findCoupledAllocation(2, r1, r2);
    try testing.expect(res != null);
    try verifyCoupled(2, r1, r2, res.?[0], res.?[1]);
}

test "findCoupledAllocation with Mask/Pattern Constraints" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    try aa.block(.{ .start = 0, .end = 0x10000 });
    try aa.block(.{ .start = 0x20000, .end = 0x44440000 });
    try aa.block(.{ .start = 0x44450000, .end = 0x80000000 });

    // K=2. We force the shared bytes to be exactly 0x4444.
    const r1 = Request{ .source = 0, .size = 10, .valid_range = .{ .start = 0, .end = 0x80000000 } };
    const r2 = Request{
        .source = 2,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x80000000 },
        .mask = 0x0000FFFF,
        .pattern = 0x00004444,
    };

    const res = aa.findCoupledAllocation(2, r1, r2);
    try testing.expect(res != null);
    try verifyCoupled(2, r1, r2, res.?[0], res.?[1]);

    // Explicitly verify the constraint was propagated to J1
    const rel1: i32 = @intCast(res.?[0].start);
    const u_rel1: u32 = @bitCast(rel1);
    try testing.expectEqual(@as(u32, 0x4444), (u_rel1 >> 16) & 0xFFFF);
}

test "findCoupledAllocation Fails on Math Impossibility" {
    var aa = AddressAllocator{ .child_allocator = testing.allocator };
    defer aa.deinit();

    const r1 = Request{
        .source = 0,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x80000000 },
        .mask = 0xFFFF0000,
        .pattern = 0x11110000,
    };
    const r2 = Request{
        .source = 2,
        .size = 10,
        .valid_range = .{ .start = 0, .end = 0x80000000 },
        .mask = 0x0000FFFF,
        .pattern = 0x00002222,
    };

    const res = aa.findCoupledAllocation(2, r1, r2);
    try testing.expectEqual(null, res);
}