Compare commits

..

4 Commits

Author SHA1 Message Date
8322ddba3b refactoring 2025-12-15 16:19:43 +01:00
85a07116af fork test 2025-12-15 15:59:34 +01:00
0a282259e3 fork-like clone test 2025-12-15 15:52:10 +01:00
33ce01d56d first working real clone call 2025-12-15 15:51:50 +01:00
6 changed files with 260 additions and 50 deletions

View File

@@ -71,7 +71,7 @@ pub fn init() !void {
mem.writeInt(
u64,
syscall_flicken_bytes[2..][0..8],
@intFromPtr(&syscalls.syscall_entry),
@intFromPtr(&syscalls.syscallEntry),
.little,
);
flicken_templates.putAssumeCapacity("syscall", .{ .name = "syscall", .bytes = &syscall_flicken_bytes });

View File

@@ -356,6 +356,26 @@ test "echo" {
try testHelper(&.{ "echo", "Hello", "There" }, "Hello There\n");
}
// Runs the `nolibc_nopie_fork` test executable through the loader and hands the expected
// child/parent output to `testHelper`.
test "nolibc_nopie_fork" {
    const expected_output = "Child: I'm alive!\nParent: Child died.\n";
    const exe_path = getTestExePath("nolibc_nopie_fork");
    try testHelper(&.{ flicker_path, exe_path }, expected_output);
}
// Runs the `nolibc_pie_fork` test executable through the loader and hands the expected
// child/parent output to `testHelper`.
test "nolibc_pie_fork" {
    const expected_output = "Child: I'm alive!\nParent: Child died.\n";
    const exe_path = getTestExePath("nolibc_pie_fork");
    try testHelper(&.{ flicker_path, exe_path }, expected_output);
}
// BUG: This one is flaky
// test "libc_pie_fork" {
// try testHelper(
// &.{ flicker_path, getTestExePath("libc_pie_fork") },
// "Child: I'm alive!\nParent: Child died.\n",
// );
// }
fn testPrintArgs(comptime name: []const u8) !void {
const exe_path = getTestExePath(name);
const loader_argv: []const []const u8 = &.{ flicker_path, exe_path, "foo", "bar", "baz hi" };

View File

@@ -3,8 +3,8 @@ const linux = std.os.linux;
const Patcher = @import("Patcher.zig");
const assert = std.debug.assert;
/// Represents the stack layout pushed by `syscall_entry` before calling the handler.
pub const UserRegs = extern struct {
/// Represents the stack layout pushed by `syscallEntry` before calling the handler.
pub const SavedContext = extern struct {
padding: u64, // Result of `sub $8, %rsp` for alignment
rflags: u64,
rax: u64,
@@ -22,27 +22,28 @@ pub const UserRegs = extern struct {
r13: u64,
r14: u64,
r15: u64,
/// This one isn't pushed on the stack by `syscall_entry`. It's pushed by the `call r11` to get
/// to the `syscall_entry`
/// Pushed automatically by the `call r11` instruction when entering `syscallEntry`.
/// Crucially we copy this onto the child stack (if needed) because then we can just return at
/// the end of the child handler inside `handleClone`.
return_address: u64,
};
/// The main entry point for intercepted syscalls.
///
/// This function is called from `syscall_entry` with a pointer to the saved registers.
/// It effectively emulates the syscall instruction while allowing for interception.
export fn syscall_handler(regs: *UserRegs) callconv(.c) void {
/// This function is called from `syscallEntry` with a pointer to the saved context.
/// It dispatches specific syscalls to handlers or executes them directly.
export fn syscall_handler(ctx: *SavedContext) callconv(.c) void {
// TODO: Handle signals (masking) to prevent re-entrancy issues if we touch global state.
const sys: linux.SYS = @enumFromInt(regs.rax);
const sys: linux.SYS = @enumFromInt(ctx.rax);
switch (sys) {
.readlink => {
// readlink(const char *path, char *buf, size_t bufsiz)
const path_ptr = @as([*:0]const u8, @ptrFromInt(regs.rdi));
const path_ptr = @as([*:0]const u8, @ptrFromInt(ctx.rdi));
// TODO: handle relative paths with cwd
if (isProcSelfExe(path_ptr)) {
handleReadlink(regs.rsi, regs.rdx, regs);
handleReadlink(ctx.rsi, ctx.rdx, ctx);
return;
}
},
@@ -52,20 +53,16 @@ export fn syscall_handler(regs: *UserRegs) callconv(.c) void {
// TODO: handle relative paths with dirfd pointing to /proc/self
// TODO: handle relative paths with dirfd == AT_FDCWD (like readlink)
// TODO: handle empty pathname
const path_ptr = @as([*:0]const u8, @ptrFromInt(regs.rsi));
const path_ptr = @as([*:0]const u8, @ptrFromInt(ctx.rsi));
if (isProcSelfExe(path_ptr)) {
handleReadlink(regs.rdx, regs.r10, regs);
handleReadlink(ctx.rdx, ctx.r10, ctx);
return;
}
},
.clone, .clone3 => {
handleClone(regs);
handleClone(ctx);
return;
},
.fork, .vfork => {
// fork/vfork duplicate the stack (or share it until exec), so the return path via
// syscall_entry works fine.
},
.rt_sigreturn => {
@panic("sigreturn is not supported yet");
},
@@ -92,23 +89,24 @@ export fn syscall_handler(regs: *UserRegs) callconv(.c) void {
}
// Write result back to the saved RAX so it is restored to the application.
regs.rax = executeSyscall(regs);
ctx.rax = executeSyscall(ctx);
}
/// Issues the syscall described by the saved context and returns the raw kernel result.
///
/// The syscall number is taken from `ctx.rax` and the six arguments from
/// `rdi`, `rsi`, `rdx`, `r10`, `r8` and `r9`. The context itself is not modified; callers
/// decide where the result goes.
inline fn executeSyscall(ctx: *SavedContext) u64 {
    const number: linux.SYS = @enumFromInt(ctx.rax);
    return linux.syscall6(number, ctx.rdi, ctx.rsi, ctx.rdx, ctx.r10, ctx.r8, ctx.r9);
}
/// Assembly trampoline that saves state and calls the Zig handler.
pub fn syscall_entry() callconv(.naked) void {
/// This is the target of the `call r11` instruction in the syscall flicken.
pub fn syscallEntry() callconv(.naked) void {
asm volatile (
\\ # Save all GPRs that must be preserved or are arguments
\\ push %r15
@@ -135,7 +133,7 @@ pub fn syscall_entry() callconv(.naked) void {
\\ # Total misalign: 8 bytes. We need 16-byte alignment for 'call'.
\\ sub $8, %rsp
\\
\\ # Pass pointer to regs (current rsp) as 1st argument (rdi) and call handler.
\\ # Pass pointer to ctx (current rsp) as 1st argument (rdi) and call handler.
\\ mov %rsp, %rdi
\\ call syscall_handler
\\
@@ -175,14 +173,14 @@ fn isProcSelfExe(path: [*:0]const u8) bool {
return path[i] == 0;
}
/// Emulates `readlink`/`readlinkat` for `/proc/self/exe` by answering with
/// `Patcher.target_exec_path` instead of asking the kernel.
///
/// - `buf_addr`: address of the caller's output buffer.
/// - `buf_size`: capacity of that buffer in bytes.
/// - `ctx`: saved context; the result (byte count or `-EINVAL`) is written to `ctx.rax`.
fn handleReadlink(buf_addr: u64, buf_size: u64, ctx: *SavedContext) void {
    // readlink(2) fails with EINVAL when the buffer size is not positive. Bail out before
    // turning `buf_addr` (which may be null in that case) into a pointer.
    if (buf_size == 0) {
        ctx.rax = @bitCast(-@as(i64, @intFromEnum(linux.E.INVAL)));
        return;
    }

    const target = Patcher.target_exec_path;
    // Truncate silently if the buffer is too small, like the real syscall does.
    const len = @min(target.len, buf_size);
    const dest = @as([*]u8, @ptrFromInt(buf_addr));
    @memcpy(dest[0..len], target[0..len]);

    // readlink does not null-terminate if the buffer is full, it just returns length.
    ctx.rax = len;
}
const CloneArgs = extern struct {
@@ -199,44 +197,149 @@ const CloneArgs = extern struct {
cgroup: u64,
};
/// Handles `clone` and `clone3` syscalls, which are used for thread and process creation.
///
/// **The Stack Switching Problem:**
/// When a thread is created, the caller provides a pointer to a new stack (`child_stack`).
/// 1. The parent enters the handler via `syscallEntry` (the trampoline).
/// 2. `syscallEntry` saves all registers and the return address onto the **parent's stack**.
/// 3. The kernel creates the child and sets its stack pointer (`RSP`) to `child_stack`.
/// 4. The child wakes up. If we simply let it return to `syscallEntry`, it would try to `pop`
///    registers from `child_stack`, which does not hold them. It would pop garbage and crash.
///
/// **The Solution:**
/// We manually replicate the parent's saved state onto the child's new stack *before* the syscall.
///
/// For that the following steps occur:
/// 1. We decode the arguments to determine if this is `clone` or `clone3` and locate the target
///    `child_stack` (for `clone3` that is `stack + stack_size`).
/// 2. If `child_stack` is 0 (fork-like), no stack switching occurs. The function simply executes
///    the syscall and handles the return value normally.
/// 3. Else we need to stack switch:
///    a. We copy the current `ctx` so that it ends *exactly* at `child_stack`. The requested
///       stack pointer is deliberately not rounded: callers such as the musl and Zig `clone`
///       wrappers store data at `child_stack` and `pop` it right after the syscall, so the child
///       must resume in user code with `RSP == child_stack`.
///    b. We set `rax = 0` in the *copied* context, so the child sees itself as the child.
///    c. We modify the syscall argument so the kernel starts the child with `RSP` pointing at the
///       start of the copied context. For `clone3` the base `stack` is kept and `stack_size` is
///       shrunk instead, because the kernel rejects a non-zero `stack` with a zero `stack_size`.
///    d. We execute the raw syscall inline.
///       - **Parent:** Returns from the syscall, updates `ctx.rax` with the result, and returns
///         to the trampoline normally.
///       - **Child:** Wakes up on the new stack. It aligns `RSP` to 16 bytes just for the call to
///         `postCloneChild`, restores all registers from the copied context, and finally
///         executes `ret`. This `ret` pops the copied `return_address`, jumping directly back
///         to the user code, effectively bypassing the `syscallEntry` epilogue.
fn handleClone(ctx: *SavedContext) void {
    const sys: linux.syscalls.X64 = @enumFromInt(ctx.rax);

    // Value `RSP` must have when the child resumes in user code; 0 means "no new stack".
    var child_stack: u64 = 0;
    if (sys == .clone) {
        // clone(flags, stack, ...)
        child_stack = ctx.rsi;
    } else {
        // clone3(struct clone_args *args, size_t size)
        const args = @as(*const CloneArgs, @ptrFromInt(ctx.rdi));
        if (args.stack != 0) {
            child_stack = args.stack + args.stack_size;
        }
    }

    // If no new stack, just execute (like fork)
    if (child_stack == 0) {
        ctx.rax = executeSyscall(ctx);
        if (ctx.rax == 0) {
            postCloneChild(ctx);
        } else {
            assert(ctx.rax > 0); // TODO:: error handling
            postCloneParent(ctx);
        }
        return;
    }

    // The copied context occupies the topmost bytes of the child's stack.
    const child_ctx_addr = child_stack - @sizeOf(SavedContext);

    // Prepare arguments for syscall
    var new_rsi = ctx.rsi;
    var new_rdi = ctx.rdi;
    var clone3_args_copy: CloneArgs = undefined;
    if (sys == .clone) {
        new_rsi = child_ctx_addr;
    } else {
        const args = @as(*const CloneArgs, @ptrFromInt(ctx.rdi));
        // The context must fit and leave a non-zero size, otherwise the kernel would reject the
        // arguments anyway. Report EINVAL without touching the caller's stack memory.
        if (args.stack_size <= @sizeOf(SavedContext)) {
            ctx.rax = @bitCast(-@as(i64, @intFromEnum(linux.E.INVAL)));
            return;
        }
        // Work on a private copy so the caller's struct stays untouched. The kernel computes the
        // child's RSP as `stack + stack_size`, which now equals `child_ctx_addr`.
        clone3_args_copy = args.*;
        clone3_args_copy.stack_size = args.stack_size - @sizeOf(SavedContext);
        new_rdi = @intFromPtr(&clone3_args_copy);
    }

    // Replicate the saved context onto the child stack. `align(1)` because the address is derived
    // from a caller-supplied value that we must not round.
    const child_ctx = @as(*align(1) SavedContext, @ptrFromInt(child_ctx_addr));
    child_ctx.* = ctx.*;
    child_ctx.rax = 0;

    // Execute clone/clone3 via inline assembly
    // We handle the child path entirely in assembly to avoid stack frame issues.
    const ret = asm volatile (
        \\ syscall
        \\ test %rax, %rax
        \\ jnz 1f
        \\
        \\ # --- CHILD PATH ---
        \\ # We are now on the new stack and %rsp points to child_ctx_addr
        \\
        \\ # Run Child Hook
        \\ # Argument 1 (rdi): Pointer to SavedContext (which is current rsp)
        \\ mov %rsp, %rdi
        \\ # Keep the exact rsp in callee-saved rbx and align only for the call.
        \\ # rbx is restored from the saved context below.
        \\ mov %rsp, %rbx
        \\ and $-16, %rsp
        \\ call postCloneChild
        \\ mov %rbx, %rsp
        \\
        \\ # Restore Context
        \\ add $8, %rsp # Skip padding
        \\ popfq
        \\ pop %rax
        \\ pop %rbx
        \\ pop %rcx
        \\ pop %rdx
        \\ pop %rsi
        \\ pop %rdi
        \\ pop %rbp
        \\ pop %r8
        \\ pop %r9
        \\ pop %r10
        \\ pop %r11
        \\ pop %r12
        \\ pop %r13
        \\ pop %r14
        \\ pop %r15
        \\
        \\ # %rsp now points to `return_address` so we can just return.
        \\ ret
        \\
        \\ 1:
        \\ # --- PARENT PATH ---
        : [ret] "={rax}" (-> usize),
        : [number] "{rax}" (ctx.rax),
          [arg1] "{rdi}" (new_rdi),
          [arg2] "{rsi}" (new_rsi),
          [arg3] "{rdx}" (ctx.rdx),
          [arg4] "{r10}" (ctx.r10),
          [arg5] "{r8}" (ctx.r8),
          [arg6] "{r9}" (ctx.r9),
          [child_hook] "i" (postCloneChild),
        : .{ .rcx = true, .r11 = true, .memory = true });

    // Parent continues here (also reached when the syscall failed and `ret` holds `-errno`).
    ctx.rax = ret;
    postCloneParent(ctx);
}

/// Hook that runs in the child right after a successful `clone`/`clone3`, before its registers
/// are restored. Exported with the C calling convention because the assembly in `handleClone`
/// calls it by symbol name. Currently a no-op.
export fn postCloneChild(ctx: *SavedContext) callconv(.c) void {
    _ = ctx;
}

/// Hook that runs in the parent after `clone`/`clone3` returned; `ctx.rax` already holds the
/// syscall result. Currently a no-op.
fn postCloneParent(ctx: *SavedContext) void {
    _ = ctx;
}

View File

@@ -0,0 +1,61 @@
const std = @import("std");
const linux = std.os.linux;
const clone = linux.CLONE;
/// Test program: performs a raw, fork-like `clone` (no new stack), lets the child print a
/// message and exit, and has the parent wait for it before printing its own message.
pub fn main() !void {
    // SIGCHLD: Send signal to parent on exit (required for waitpid)
    const flags = clone.FILES | clone.FS | linux.SIG.CHLD;
    const msg = "Child: Hello\n";
    const msg_len = msg.len;

    // We use inline assembly to perform the clone syscall and handle the child path completely,
    // so the child never runs compiler-generated code that depends on the parent's stack frame.
    const ret = asm volatile (
        \\ syscall
        \\ test %%rax, %%rax
        \\ jnz 1f
        \\
        \\ # Child Path
        \\ # Write to stdout
        \\ mov $1, %%rdi # fd = 1 (stdout)
        \\ mov %[msg], %%rsi # buffer
        \\ mov %[len], %%rdx # length
        \\ mov $1, %%rax # SYS_write
        \\ syscall
        \\
        \\ # Exit
        \\ mov $0, %%rdi # code = 0
        \\ mov $60, %%rax # SYS_exit
        \\ syscall
        \\
        \\ # Should not be reached
        \\ ud2
        \\
        \\ 1:
        \\ # Parent Path continues
        : [ret] "={rax}" (-> usize),
        : [number] "{rax}" (@intFromEnum(linux.syscalls.X64.clone)),
          [arg1] "{rdi}" (flags),
          [arg2] "{rsi}" (0),
          [arg3] "{rdx}" (0),
          [arg4] "{r10}" (0),
          [arg5] "{r8}" (0),
          [msg] "r" (msg.ptr),
          [len] "r" (msg_len),
        : .{ .rcx = true, .r11 = true, .memory = true });

    // Parent Process
    // The raw syscall reports failure as `-errno` in the returned word. Reinterpret the bits as
    // signed; an `@intCast` to a narrower signed type would trap on exactly those error values
    // and the failure branch below could never run.
    const child_pid: i64 = @bitCast(ret);
    if (child_pid < 0) {
        _ = linux.syscall3(.write, 1, @intFromPtr("Parent: Clone failed\n"), 21);
        return;
    }

    var status: u32 = 0;
    // wait4 for the child to exit
    _ = linux.syscall4(.wait4, @as(usize, @intCast(child_pid)), @intFromPtr(&status), 0, 0);
    _ = linux.syscall3(.write, 1, @intFromPtr("Parent: Goodbye\n"), 16);
}

View File

@@ -51,9 +51,12 @@ pub fn main() !void {
: .{ .rcx = true, .r11 = true, .memory = true });
// Parent Process
const child_pid: i32 = @intCast(ret);
const child_pid: i64 = @bitCast(ret);
if (child_pid < 0) {
_ = linux.syscall3(.write, 1, @intFromPtr("Parent: Clone failed\n"), 21);
std.debug.print(
"Parent: Clone failed with: {}\n",
.{@as(linux.E, @enumFromInt(-child_pid))},
);
return;
}

23
src/test/fork.zig Normal file
View File

@@ -0,0 +1,23 @@
const std = @import("std");
const linux = std.os.linux;
/// Test program: forks via the raw `fork` syscall. The child prints a message and exits; the
/// parent waits for the child and then prints its own message.
pub fn main() !void {
    const ret = linux.syscall0(.fork);
    // The raw syscall reports failure as `-errno` in the returned word. Reinterpret the bits as
    // signed; an `@intCast` to `i32` would trap on exactly those error values, making the
    // "Fork failed!" branch unreachable.
    const pid: isize = @bitCast(ret);

    if (pid == 0) {
        // --- Child ---
        const msg = "Child: I'm alive!\n";
        _ = linux.syscall3(.write, 1, @intFromPtr(msg.ptr), msg.len);
        linux.exit(0);
    } else if (pid > 0) {
        // --- Parent ---
        var status: u32 = 0;
        // Block until the child has exited so the output order is deterministic.
        _ = linux.syscall4(.wait4, @intCast(pid), @intFromPtr(&status), 0, 0);
        const msg = "Parent: Child died.\n";
        _ = linux.syscall3(.write, 1, @intFromPtr(msg.ptr), msg.len);
    } else {
        const msg = "Fork failed!\n";
        _ = linux.syscall3(.write, 1, @intFromPtr(msg.ptr), msg.len);
    }
}