From 9ac107b3984b6aa1cb342ad8cf94213e379a2d02 Mon Sep 17 00:00:00 2001 From: Pascal Zittlau Date: Thu, 11 Dec 2025 11:56:01 +0100 Subject: [PATCH] respect /proc/sys/vm/mmap_min_addr --- docs/use_cases.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++ src/Patcher.zig | 30 +++++++++--- src/main.zig | 5 +- 3 files changed, 144 insertions(+), 11 deletions(-) create mode 100644 docs/use_cases.md diff --git a/docs/use_cases.md b/docs/use_cases.md new file mode 100644 index 0000000..c965f98 --- /dev/null +++ b/docs/use_cases.md @@ -0,0 +1,120 @@ +# Use Cases for Flicker + +Flicker's architecture, load-time binary rewriting without control-flow recovery, uniquely positions +it to handle scenarios where source code is unavailable (legacy/commercial software) and performance +is critical. Unlike Dynamic Binary Translation (DBT) tools like Valgrind or QEMU, which incur high +overhead due to JIT compilation/emulation, Flicker patches code to run natively. + +Below are possible use cases categorized by domain. + +## High Performance Computing (HPC) & Optimization + +### Approximate Computing and Mixed-Precision Analysis + +Scientific simulations often default to double precision (64-bit) for safety, even when single +(32-bit) or half (16-bit) precision would yield accurate results with significantly higher +performance. But rewriting massive legacy Fortran/C++ codebases to test precision sensitivity is +impractical. + +Flicker could instrument floating-point instructions to perform "Shadow Execution," running +operations in both double and single precision to log divergence. Alternatively, it could mask lower +bits of registers to simulate low-precision hardware. + +Unlike compiler-based approaches that change the whole binary, Flicker can apply these patches +selectively to specific "hot" functions at load time, preserving accuracy in sensitive setup/solver +phases while optimizing the bulk computation. 
+ +### Profiling Memory Access Patterns (False Sharing) + +In multi-threaded HPC applications, performance often degrades due to "False Sharing", where multiple +threads modify independent variables that happen to reside on the same CPU cache line, causing cache +thrashing. + +Sampling profilers (like `perf`) provide statistical approximations but often miss precise +interaction timings. Source-level instrumentation disrupts compiler optimizations. + +Flicker could instrument memory store instructions (`MOV` etc.) to record effective addresses. By +aggregating this data, it can generate heatmaps of cache line access density, precisely identifying +false sharing or inefficient strided access patterns in optimized binaries. + +### Low-Overhead I/O Tracing + +Parallel MPI jobs often inadvertently stress parallel filesystems (Lustre, GPFS) by performing +excessive small writes or metadata operations. + +Tools like `strace` force a context switch for every syscall, slowing down the application so much +that the race conditions or I/O storms disappear (Heisenbugs). + +By intercepting I/O syscalls (`write`, `read`, `open`, ...) inside the process memory, Flicker could +aggregate I/O statistics (e.g., "Rank 7 performed 50,000 writes of 4 bytes") with negligible +overhead, providing a lightweight alternative to `strace` for high-throughput jobs. + +### MPI Communication Profiling + +HPC performance is often bound by network latency between nodes. Profiling tools like Vampir are +heavy and costly. Flicker can patch shared library exports (like `MPI_Send` or `MPI_Recv`) at load time. +This allows lightweight logging of message sizes and latencies without recompiling the application +or linking against special profiling libraries. + +## Security and Hardening + +### Coverage-Guided Fuzzing (Closed Source) + +Fuzzing requires feedback on which code paths are executed to be effective. But for closed-source +software, researchers typically use QEMU-mode in AFL. 
QEMU translates instructions dynamically, +resulting in slow execution speeds (often 2-10x slower than native). + +Flicker could inject coverage instrumentation (updating a shared memory bitmap on branch targets) +directly into the binary at load time. This would allow closed-source binaries to be fuzzed at +near-native speeds, significantly increasing the number of test cases run per second. + +### Software Shadow Stacks + +Return-Oriented Programming (ROP) attacks exploit buffer overflows to overwrite return addresses on +the stack. + +Hardware enforcement (Intel CET/AMD Shadow Stack) requires modern CPUs (Intel 11th Gen+, Zen 3+) and +recent kernels (Linux 6.6+). Older systems remain vulnerable. + +Flicker could instrument `CALL` and `RET` instructions to implement a Software Shadow Stack. On +`CALL`, the return address is pushed to a secure, isolated stack region. On `RET`, the address on +the stack is compared against the shadow stack. If they mismatch, the program terminates, preventing +ROP chains. + +### Binary-Only Address Sanitizer (ASan) + +Memory safety errors (buffer overflows, use-after-free) in C/C++ are often found with ASan or +Valgrind. ASan requires recompilation. Valgrind works on binaries but slows execution by 20x-50x, +making it unusable for large datasets. + +Flicker could intercept allocator calls (`malloc`/`free`) to poison "red zones" around memory and +instrument memory access instructions to check these zones. This provides ASan-like capabilities for +legacy binaries with significantly lower overhead than Valgrind. + +## Systems and Maintenance + +### Hardware Feature Emulation (Forward Compatibility) + +HPC clusters are often heterogeneous, with older nodes lacking newer instruction sets (e.g., +AVX-512, AMX). A binary compiled for a newer architecture will crash with `SIGILL` on an older node. + +Flicker could detect these instructions and patch them to jump to a software emulation routine or a +scalar fallback implementation. 
This allows binaries optimized for the latest hardware to run +(albeit slower) on legacy nodes for testing or resource-filling purposes. + +### Fault Injection + +To certify software for mission-critical environments, developers must verify how it handles +hardware errors. + +Flicker could instrument instructions to probabilistically flip bits in registers or memory +("Bit-flip injection"), or intercept syscalls to return error codes (e.g., returning `ENOSPC` on +`write`). It can also simulate malfunctioning or intermittent devices by corrupting buffers returned +by `read`. This allows testing error recovery paths without physical hardware damage. + +### Record/Replay Engine + +Debugging non-deterministic bugs (race conditions) is difficult because they are hard to reproduce. +By intercepting all sources of non-determinism (syscalls, `rdtsc`, atomic instructions, signals), +Flicker could record a trace of an execution. This trace can be replayed later to force the exact +same execution path, allowing developers to debug the error state interactively. diff --git a/src/Patcher.zig b/src/Patcher.zig index b6ab63b..e967680 100644 --- a/src/Patcher.zig +++ b/src/Patcher.zig @@ -52,19 +52,35 @@ pub var address_allocator: AddressAllocator = .empty; pub var allocated_pages: std.AutoHashMapUnmanaged(u64, void) = .empty; pub var mutex: std.Thread.Mutex = .{}; -var init_once = std.once(initInner); -pub fn init() void { - init_once.call(); -} -fn initInner() void { +/// Initialize the patcher. +/// NOTE: This should only be called **once**. 
+pub fn init() !void { gpa = std.heap.page_allocator; - flicken_templates.ensureTotalCapacity( + + try flicken_templates.ensureTotalCapacity( std.heap.page_allocator, page_size / @sizeOf(Flicken), - ) catch @panic("failed initializing patcher"); + ); flicken_templates.putAssumeCapacity("nop", .{ .name = "nop", .bytes = &.{} }); mem.writeInt(u64, syscall_flicken_bytes[2..][0..8], @intFromPtr(&syscalls.syscall_entry), .little); flicken_templates.putAssumeCapacity("syscall", .{ .name = "syscall", .bytes = &syscall_flicken_bytes }); + + { + // Read mmap_min_addr to block the low memory range. This prevents us from allocating + // trampolines in the forbidden low address range. + var min_addr: u64 = 0x10000; // Default safe fallback (64KB) + if (std.fs.openFileAbsolute("/proc/sys/vm/mmap_min_addr", .{})) |file| { + defer file.close(); + var buf: [32]u8 = undefined; + if (file.readAll(&buf)) |len| { + const trimmed = std.mem.trim(u8, buf[0..len], " \n\r\t"); + if (std.fmt.parseInt(u64, trimmed, 10)) |val| { + min_addr = val; + } else |_| {} + } else |_| {} + } else |_| {} + try address_allocator.block(gpa, .{ .start = 0, .end = @intCast(min_addr) }, 0); + } } /// Flicken name and bytes have to be valid for the lifetime it's used. If a trampoline with the diff --git a/src/main.zig b/src/main.zig index 5becf05..5f82fa8 100644 --- a/src/main.zig +++ b/src/main.zig @@ -50,10 +50,7 @@ pub fn main() !void { } // Initialize patcher - Patcher.init(); - // Block the first 64k to avoid mmap_min_addr (EPERM) issues on Linux. - // TODO: read it from `/proc/sys/vm/mmap_min_addr` instead. - try Patcher.address_allocator.block(Patcher.gpa, .{ .start = 0, .end = 0x10000 }, 0); + try Patcher.init(); // Map file into memory const file = try lookupFile(mem.sliceTo(std.os.argv[arg_index], 0));