From a93a0a4f92edae5ec8a8638bb2511e16daf51aa9 Mon Sep 17 00:00:00 2001 From: Jose Colon Rodriguez Date: Fri, 26 Jan 2024 19:44:34 -0400 Subject: [PATCH] Initial TigerBeetle specifics strip. All 7 tests passing. --- .gitignore | 13 + build.zig | 18 + src/bounded_array.zig | 76 ++ src/fifo.zig | 152 ++++ src/io.zig | 30 + src/io/darwin.zig | 823 ++++++++++++++++++++++ src/io/linux.zig | 1126 +++++++++++++++++++++++++++++ src/io/windows.zig | 1209 ++++++++++++++++++++++++++++++++ src/low_level_hash_vectors.zig | 142 ++++ src/stdx.zig | 728 +++++++++++++++++++ src/test.zig | 654 +++++++++++++++++ src/time.zig | 112 +++ 12 files changed, 5083 insertions(+) create mode 100644 .gitignore create mode 100644 build.zig create mode 100644 src/bounded_array.zig create mode 100644 src/fifo.zig create mode 100644 src/io.zig create mode 100644 src/io/darwin.zig create mode 100644 src/io/linux.zig create mode 100644 src/io/windows.zig create mode 100644 src/low_level_hash_vectors.zig create mode 100644 src/stdx.zig create mode 100644 src/test.zig create mode 100644 src/time.zig diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c04a0ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +# This file is for zig-specific build artifacts. +# If you have OS-specific or editor-specific files to ignore, +# such as *.swp or .DS_Store, put those in your global +# ~/.gitignore and put this in your ~/.gitconfig: +# +# [core] +# excludesfile = ~/.gitignore +# +# Cheers! +# -andrewrk + +zig-cache/ +zig-out/ diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..9ab5f74 --- /dev/null +++ b/build.zig @@ -0,0 +1,18 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + _ = b.addModule("io", .{ .source_file = .{ .path = "src/io.zig" } }); + + const main_tests = b.addTest(.{ + .root_source_file = .{ .path = "src/test.zig" }, + .target = target, + .optimize = optimize, + }); + + const run_main_tests = b.addRunArtifact(main_tests); + const test_step = b.step("test", "Run library tests"); + test_step.dependOn(&run_main_tests.step); +} diff --git a/src/bounded_array.zig b/src/bounded_array.zig new file mode 100644 index 0000000..c288093 --- /dev/null +++ b/src/bounded_array.zig @@ -0,0 +1,76 @@ +const std = @import("std"); +const assert = std.debug.assert; + +/// A version of standard `BoundedArray` with TigerBeetle-idiomatic APIs. +/// +/// See for the original reason for +/// wrapping --- we need an `fn count` which returns an `usize`, instead of potentially much smaller +/// type which stores the length internally. +pub fn BoundedArray(comptime T: type, comptime capacity: usize) type { + const Inner = @import("std").BoundedArray(T, capacity); // smuggle the std version past tidy + + return struct { + inner: Inner = Inner{}, + + const Self = @This(); + + pub inline fn from_slice(items: []const T) error{Overflow}!Self { + return .{ .inner = try Inner.fromSlice(items) }; + } + + pub inline fn count(array: *const Self) usize { + return array.inner.len; + } + + /// Returns count of elements in this BoundedArray in the specified integer types, + /// checking at compile time that it indeed can represent the length. 
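+        /// (The check is implicit: the inner array stores its length as a potentially smaller
+        /// integer type, so `return array.inner.len` coerces to `Int` only when `Int` is at
+        /// least as wide; a narrower `Int` is a compile error.)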
+ pub inline fn count_as(array: *const Self, comptime Int: type) Int { + return array.inner.len; + } + + pub inline fn full(self: Self) bool { + return self.count() == capacity; + } + + pub inline fn empty(self: Self) bool { + return self.count() == 0; + } + + pub inline fn get(array: *const Self, index: usize) T { + return array.inner.get(index); + } + + pub inline fn slice(array: *Self) []T { + return array.inner.slice(); + } + + pub inline fn const_slice(array: *const Self) []const T { + return array.inner.constSlice(); + } + + pub inline fn add_one_assume_capacity(array: *Self) *T { + return array.inner.addOneAssumeCapacity(); + } + + pub inline fn append_assume_capacity(array: *Self, item: T) void { + array.inner.appendAssumeCapacity(item); + } + + pub inline fn writer(self: *Self) Inner.Writer { + return self.inner.writer(); + } + + pub inline fn swap_remove(array: *Self, index: usize) T { + return array.inner.swapRemove(index); + } + + pub inline fn truncate(array: *Self, count_new: usize) void { + assert(count_new <= array.count()); + array.inner.len = @intCast(count_new); // can't overflow due to check above. + } + + pub inline fn clear(array: *Self) void { + array.inner.len = 0; + } + }; +} diff --git a/src/fifo.zig b/src/fifo.zig new file mode 100644 index 0000000..22e13b2 --- /dev/null +++ b/src/fifo.zig @@ -0,0 +1,152 @@ +const std = @import("std"); +const assert = std.debug.assert; + +/// An intrusive first in/first out linked list. +/// The element type T must have a field called "next" of type ?*T +pub fn FIFO(comptime T: type) type { + return struct { + const Self = @This(); + + in: ?*T = null, + out: ?*T = null, + count: u64 = 0, + // This should only be null if you're sure we'll never want to monitor `count`. + name: ?[]const u8, + + pub fn push(self: *Self, elem: *T) void { + assert(elem.next == null); + if (self.in) |in| { + in.next = elem; + self.in = elem; + } else { + assert(self.out == null); + self.in = elem; + self.out = elem; + } + self.count += 1; + } + + pub fn pop(self: *Self) ?*T { + const ret = self.out orelse return null; + self.out = ret.next; + ret.next = null; + if (self.in == ret) self.in = null; + self.count -= 1; + return ret; + } + + pub fn peek_last(self: Self) ?*T { + return self.in; + } + + pub fn peek(self: Self) ?*T { + return self.out; + } + + pub fn empty(self: Self) bool { + return self.peek() == null; + } + + /// Returns whether the linked list contains the given *exact element* (pointer comparison). + pub fn contains(self: *const Self, elem_needle: *const T) bool { + var iterator = self.peek(); + while (iterator) |elem| : (iterator = elem.next) { + if (elem == elem_needle) return true; + } + return false; + } + + /// Remove an element from the FIFO. Asserts that the element is + /// in the FIFO. This operation is O(N), if this is done often you + /// probably want a different data structure. 
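+        /// Example (with a hypothetical `Node` type containing `next: ?*Node = null`):
+        ///   var fifo: FIFO(Node) = .{ .name = null };
+        ///   fifo.push(&a); fifo.push(&b); fifo.push(&c);
+        ///   fifo.remove(&a); // O(1): removing the head takes the pop() fast path.
+        ///   fifo.remove(&c); // O(N): walks from `out` to find the predecessor.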
+ pub fn remove(self: *Self, to_remove: *T) void { + if (to_remove == self.out) { + _ = self.pop(); + return; + } + var it = self.out; + while (it) |elem| : (it = elem.next) { + if (to_remove == elem.next) { + if (to_remove == self.in) self.in = elem; + elem.next = to_remove.next; + to_remove.next = null; + self.count -= 1; + break; + } + } else unreachable; + } + + pub fn reset(self: *Self) void { + self.* = .{ .name = self.name }; + } + }; +} + +test "FIFO: push/pop/peek/remove/empty" { + const testing = @import("std").testing; + + const Foo = struct { next: ?*@This() = null }; + + var one: Foo = .{}; + var two: Foo = .{}; + var three: Foo = .{}; + + var fifo: FIFO(Foo) = .{ .name = null }; + try testing.expect(fifo.empty()); + + fifo.push(&one); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &one), fifo.peek()); + try testing.expect(fifo.contains(&one)); + try testing.expect(!fifo.contains(&two)); + try testing.expect(!fifo.contains(&three)); + + fifo.push(&two); + fifo.push(&three); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &one), fifo.peek()); + try testing.expect(fifo.contains(&one)); + try testing.expect(fifo.contains(&two)); + try testing.expect(fifo.contains(&three)); + + fifo.remove(&one); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &two), fifo.pop()); + try testing.expectEqual(@as(?*Foo, &three), fifo.pop()); + try testing.expectEqual(@as(?*Foo, null), fifo.pop()); + try testing.expect(fifo.empty()); + try testing.expect(!fifo.contains(&one)); + try testing.expect(!fifo.contains(&two)); + try testing.expect(!fifo.contains(&three)); + + fifo.push(&one); + fifo.push(&two); + fifo.push(&three); + fifo.remove(&two); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &one), fifo.pop()); + try testing.expectEqual(@as(?*Foo, &three), fifo.pop()); + try testing.expectEqual(@as(?*Foo, null), fifo.pop()); + try testing.expect(fifo.empty()); + + fifo.push(&one); + fifo.push(&two); + fifo.push(&three); + fifo.remove(&three); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &one), fifo.pop()); + try testing.expect(!fifo.empty()); + try testing.expectEqual(@as(?*Foo, &two), fifo.pop()); + try testing.expect(fifo.empty()); + try testing.expectEqual(@as(?*Foo, null), fifo.pop()); + try testing.expect(fifo.empty()); + + fifo.push(&one); + fifo.push(&two); + fifo.remove(&two); + fifo.push(&three); + try testing.expectEqual(@as(?*Foo, &one), fifo.pop()); + try testing.expectEqual(@as(?*Foo, &three), fifo.pop()); + try testing.expectEqual(@as(?*Foo, null), fifo.pop()); + try testing.expect(fifo.empty()); +} diff --git a/src/io.zig b/src/io.zig new file mode 100644 index 0000000..f4bf877 --- /dev/null +++ b/src/io.zig @@ -0,0 +1,30 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const os = std.os; + +const FIFO = @import("fifo.zig").FIFO; +const IO_Linux = @import("io/linux.zig").IO; +const IO_Darwin = @import("io/darwin.zig").IO; +const IO_Windows = @import("io/windows.zig").IO; + +pub const IO = switch (builtin.target.os.tag) { + .linux => IO_Linux, + .windows => IO_Windows, + .macos, .tvos, .watchos, .ios => IO_Darwin, + else => @compileError("IO is not supported for platform"), +}; + +pub fn bufferLimit(buffer_len: usize) usize { + // Linux limits how much may be written in a `pwrite()/pread()` call, which is `0x7ffff000` on + // both 64-bit and 32-bit systems, due to using a signed C int as the return 
value, as well as + // stuffing the errno codes into the last `4096` values. + // Darwin limits writes to `0x7fffffff` bytes, more than that returns `EINVAL`. + // The corresponding POSIX limit is `std.math.maxInt(isize)`. + const limit = switch (builtin.target.os.tag) { + .linux => 0x7ffff000, + .macos, .ios, .watchos, .tvos => std.math.maxInt(i32), + else => std.math.maxInt(isize), + }; + return @min(limit, buffer_len); +} diff --git a/src/io/darwin.zig b/src/io/darwin.zig new file mode 100644 index 0000000..8e4058d --- /dev/null +++ b/src/io/darwin.zig @@ -0,0 +1,823 @@ +const std = @import("std"); +const os = std.os; +const mem = std.mem; +const assert = std.debug.assert; +const log = std.log.scoped(.io); + +const FIFO = @import("../fifo.zig").FIFO; +const Time = @import("../time.zig").Time; +const bufferLimit = @import("../io.zig").bufferLimit; + +const sector_size = 4096; +const direct_io = true; + +pub const IO = struct { + kq: os.fd_t, + time: Time = .{}, + io_inflight: usize = 0, + timeouts: FIFO(Completion) = .{ .name = "io_timeouts" }, + completed: FIFO(Completion) = .{ .name = "io_completed" }, + io_pending: FIFO(Completion) = .{ .name = "io_pending" }, + + pub fn init(entries: u12, flags: u32) !IO { + _ = entries; + _ = flags; + + const kq = try os.kqueue(); + assert(kq > -1); + return IO{ .kq = kq }; + } + + pub fn deinit(self: *IO) void { + assert(self.kq > -1); + os.close(self.kq); + self.kq = -1; + } + + /// Pass all queued submissions to the kernel and peek for completions. + pub fn tick(self: *IO) !void { + return self.flush(false); + } + + /// Pass all queued submissions to the kernel and run for `nanoseconds`. + /// The `nanoseconds` argument is a u63 to allow coercion to the i64 used + /// in the __kernel_timespec struct. + pub fn run_for_ns(self: *IO, nanoseconds: u63) !void { + var timed_out = false; + var completion: Completion = undefined; + const on_timeout = struct { + fn callback( + timed_out_ptr: *bool, + _completion: *Completion, + result: TimeoutError!void, + ) void { + _ = _completion; + _ = result catch unreachable; + + timed_out_ptr.* = true; + } + }.callback; + + // Submit a timeout which sets the timed_out value to true to terminate the loop below. + self.timeout( + *bool, + &timed_out, + on_timeout, + &completion, + nanoseconds, + ); + + // Loop until our timeout completion is processed above, which sets timed_out to true. + // LLVM shouldn't be able to cache timed_out's value here since its address escapes above. + while (!timed_out) { + try self.flush(true); + } + } + + fn flush(self: *IO, wait_for_completions: bool) !void { + var io_pending = self.io_pending.peek(); + var events: [256]os.Kevent = undefined; + + // Check timeouts and fill events with completions in io_pending + // (they will be submitted through kevent). + // Timeouts are expired here and possibly pushed to the completed queue. + const next_timeout = self.flush_timeouts(); + const change_events = self.flush_io(&events, &io_pending); + + // Only call kevent() if we need to submit io events or if we need to wait for completions. + if (change_events > 0 or self.completed.empty()) { + // Zero timeouts for kevent() implies a non-blocking poll + var ts = std.mem.zeroes(os.timespec); + + // We need to wait (not poll) on kevent if there's nothing to submit or complete. 
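+            // (A zeroed timespec makes kevent() poll and return immediately; `ts` is only
+            // overwritten below when we actually need to block.)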
+ // We should never wait indefinitely (timeout_ptr = null for kevent) given: + // - tick() is non-blocking (wait_for_completions = false) + // - run_for_ns() always submits a timeout + if (change_events == 0 and self.completed.empty()) { + if (wait_for_completions) { + const timeout_ns = next_timeout orelse @panic("kevent() blocking forever"); + ts.tv_nsec = @as(@TypeOf(ts.tv_nsec), @intCast(timeout_ns % std.time.ns_per_s)); + ts.tv_sec = @as(@TypeOf(ts.tv_sec), @intCast(timeout_ns / std.time.ns_per_s)); + } else if (self.io_inflight == 0) { + return; + } + } + + const new_events = try os.kevent( + self.kq, + events[0..change_events], + events[0..events.len], + &ts, + ); + + // Mark the io events submitted only after kevent() successfully processed them + self.io_pending.out = io_pending; + if (io_pending == null) { + self.io_pending.in = null; + } + + self.io_inflight += change_events; + self.io_inflight -= new_events; + + for (events[0..new_events]) |event| { + const completion = @as(*Completion, @ptrFromInt(event.udata)); + completion.next = null; + self.completed.push(completion); + } + } + + var completed = self.completed; + self.completed.reset(); + while (completed.pop()) |completion| { + (completion.callback)(self, completion); + } + } + + fn flush_io(_: *IO, events: []os.Kevent, io_pending_top: *?*Completion) usize { + for (events, 0..) |*event, flushed| { + const completion = io_pending_top.* orelse return flushed; + io_pending_top.* = completion.next; + + const event_info = switch (completion.operation) { + .accept => |op| [2]c_int{ op.socket, os.system.EVFILT_READ }, + .connect => |op| [2]c_int{ op.socket, os.system.EVFILT_WRITE }, + .read => |op| [2]c_int{ op.fd, os.system.EVFILT_READ }, + .write => |op| [2]c_int{ op.fd, os.system.EVFILT_WRITE }, + .recv => |op| [2]c_int{ op.socket, os.system.EVFILT_READ }, + .send => |op| [2]c_int{ op.socket, os.system.EVFILT_WRITE }, + else => @panic("invalid completion operation queued for io"), + }; + + event.* = .{ + .ident = @as(u32, @intCast(event_info[0])), + .filter = @as(i16, @intCast(event_info[1])), + .flags = os.system.EV_ADD | os.system.EV_ENABLE | os.system.EV_ONESHOT, + .fflags = 0, + .data = 0, + .udata = @intFromPtr(completion), + }; + } + return events.len; + } + + fn flush_timeouts(self: *IO) ?u64 { + var min_timeout: ?u64 = null; + var timeouts: ?*Completion = self.timeouts.peek(); + while (timeouts) |completion| { + timeouts = completion.next; + + // NOTE: We could cache `now` above the loop but monotonic() should be cheap to call. + const now = self.time.monotonic(); + const expires = completion.operation.timeout.expires; + + // NOTE: remove() could be O(1) here with a doubly-linked-list + // since we know the previous Completion. 
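+            // Removing `completion` mid-iteration is safe: the iterator was already advanced
+            // to `completion.next` at the top of the loop.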
+ if (now >= expires) { + self.timeouts.remove(completion); + self.completed.push(completion); + continue; + } + + const timeout_ns = expires - now; + if (min_timeout) |min_ns| { + min_timeout = @min(min_ns, timeout_ns); + } else { + min_timeout = timeout_ns; + } + } + return min_timeout; + } + + /// This struct holds the data needed for a single IO operation + pub const Completion = struct { + next: ?*Completion, + context: ?*anyopaque, + callback: *const fn (*IO, *Completion) void, + operation: Operation, + }; + + const Operation = union(enum) { + accept: struct { + socket: os.socket_t, + }, + close: struct { + fd: os.fd_t, + }, + connect: struct { + socket: os.socket_t, + address: std.net.Address, + initiated: bool, + }, + read: struct { + fd: os.fd_t, + buf: [*]u8, + len: u32, + offset: u64, + }, + recv: struct { + socket: os.socket_t, + buf: [*]u8, + len: u32, + }, + send: struct { + socket: os.socket_t, + buf: [*]const u8, + len: u32, + }, + timeout: struct { + expires: u64, + }, + write: struct { + fd: os.fd_t, + buf: [*]const u8, + len: u32, + offset: u64, + }, + }; + + fn submit( + self: *IO, + context: anytype, + comptime callback: anytype, + completion: *Completion, + comptime operation_tag: std.meta.Tag(Operation), + operation_data: anytype, + comptime OperationImpl: type, + ) void { + const onCompleteFn = struct { + fn onComplete(io: *IO, _completion: *Completion) void { + // Perform the actual operation + const op_data = &@field(_completion.operation, @tagName(operation_tag)); + const result = OperationImpl.do_operation(op_data); + + // Requeue onto io_pending if error.WouldBlock + switch (operation_tag) { + .accept, .connect, .read, .write, .send, .recv => { + _ = result catch |err| switch (err) { + error.WouldBlock => { + _completion.next = null; + io.io_pending.push(_completion); + return; + }, + else => {}, + }; + }, + else => {}, + } + + // Complete the Completion by invoking the user callback. + return callback( + @ptrCast(@alignCast(_completion.context)), + _completion, + result, + ); + } + }.onComplete; + + completion.* = .{ + .next = null, + .context = context, + .callback = onCompleteFn, + .operation = @unionInit(Operation, @tagName(operation_tag), operation_data), + }; + + switch (operation_tag) { + .timeout => self.timeouts.push(completion), + else => self.completed.push(completion), + } + } + + pub const AcceptError = os.AcceptError || os.SetSockOptError; + + pub fn accept( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: AcceptError!os.socket_t, + ) void, + completion: *Completion, + socket: os.socket_t, + ) void { + self.submit( + context, + callback, + completion, + .accept, + .{ + .socket = socket, + }, + struct { + fn do_operation(op: anytype) AcceptError!os.socket_t { + const fd = try os.accept( + op.socket, + null, + null, + os.SOCK.NONBLOCK | os.SOCK.CLOEXEC, + ); + errdefer os.close(fd); + + // Darwin doesn't support os.MSG_NOSIGNAL to avoid getting SIGPIPE on socket send(). + // Instead, it uses the SO_NOSIGPIPE socket option which does the same for all send()s.
+ os.setsockopt( + fd, + os.SOL.SOCKET, + os.SO.NOSIGPIPE, + &mem.toBytes(@as(c_int, 1)), + ) catch |err| return switch (err) { + error.TimeoutTooBig => unreachable, + error.PermissionDenied => error.NetworkSubsystemFailed, + error.AlreadyConnected => error.NetworkSubsystemFailed, + error.InvalidProtocolOption => error.ProtocolFailure, + else => |e| e, + }; + + return fd; + } + }, + ); + } + + pub const CloseError = error{ + FileDescriptorInvalid, + DiskQuota, + InputOutput, + NoSpaceLeft, + } || os.UnexpectedError; + + pub fn close( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: CloseError!void, + ) void, + completion: *Completion, + fd: os.fd_t, + ) void { + self.submit( + context, + callback, + completion, + .close, + .{ + .fd = fd, + }, + struct { + fn do_operation(op: anytype) CloseError!void { + return switch (os.errno(os.system.close(op.fd))) { + .SUCCESS => {}, + .BADF => error.FileDescriptorInvalid, + .INTR => {}, // A success, see https://github.com/ziglang/zig/issues/2425 + .IO => error.InputOutput, + else => |errno| os.unexpectedErrno(errno), + }; + } + }, + ); + } + + pub const ConnectError = os.ConnectError; + + pub fn connect( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ConnectError!void, + ) void, + completion: *Completion, + socket: os.socket_t, + address: std.net.Address, + ) void { + self.submit( + context, + callback, + completion, + .connect, + .{ + .socket = socket, + .address = address, + .initiated = false, + }, + struct { + fn do_operation(op: anytype) ConnectError!void { + // Don't call connect after being rescheduled by io_pending, as it gives EISCONN. + // Instead, check the socket error to see if it has been connected successfully.
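+                    // First attempt: a non-blocking connect() typically fails with EINPROGRESS,
+                    // which surfaces as error.WouldBlock and requeues this completion onto
+                    // io_pending. When kevent() reports the socket writable, the retry takes
+                    // the `initiated == true` branch and reads SO_ERROR via getsockoptError().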
+ const result = switch (op.initiated) { + true => os.getsockoptError(op.socket), + else => os.connect(op.socket, &op.address.any, op.address.getOsSockLen()), + }; + + op.initiated = true; + return result; + } + }, + ); + } + + pub const ReadError = error{ + WouldBlock, + NotOpenForReading, + ConnectionResetByPeer, + Alignment, + InputOutput, + IsDir, + SystemResources, + Unseekable, + ConnectionTimedOut, + } || os.UnexpectedError; + + pub fn read( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ReadError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []u8, + offset: u64, + ) void { + self.submit( + context, + callback, + completion, + .read, + .{ + .fd = fd, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .offset = offset, + }, + struct { + fn do_operation(op: anytype) ReadError!usize { + while (true) { + const rc = os.system.pread( + op.fd, + op.buf, + op.len, + @as(isize, @bitCast(op.offset)), + ); + return switch (os.errno(rc)) { + .SUCCESS => @as(usize, @intCast(rc)), + .INTR => continue, + .AGAIN => error.WouldBlock, + .BADF => error.NotOpenForReading, + .CONNRESET => error.ConnectionResetByPeer, + .FAULT => unreachable, + .INVAL => error.Alignment, + .IO => error.InputOutput, + .ISDIR => error.IsDir, + .NOBUFS => error.SystemResources, + .NOMEM => error.SystemResources, + .NXIO => error.Unseekable, + .OVERFLOW => error.Unseekable, + .SPIPE => error.Unseekable, + .TIMEDOUT => error.ConnectionTimedOut, + else => |err| os.unexpectedErrno(err), + }; + } + } + }, + ); + } + + pub const RecvError = os.RecvFromError; + + pub fn recv( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: RecvError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []u8, + ) void { + self.submit( + context, + callback, + completion, + .recv, + .{ + .socket = socket, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + }, + struct { + fn do_operation(op: anytype) RecvError!usize { + return os.recv(op.socket, op.buf[0..op.len], 0); + } + }, + ); + } + + pub const SendError = os.SendError; + + pub fn send( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: SendError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []const u8, + ) void { + self.submit( + context, + callback, + completion, + .send, + .{ + .socket = socket, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + }, + struct { + fn do_operation(op: anytype) SendError!usize { + return os.send(op.socket, op.buf[0..op.len], 0); + } + }, + ); + } + + pub const TimeoutError = error{Canceled} || os.UnexpectedError; + + pub fn timeout( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: TimeoutError!void, + ) void, + completion: *Completion, + nanoseconds: u63, + ) void { + // Special case a zero timeout as a yield. 
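+        // The completion goes straight onto `completed`, so its callback runs on the next
+        // flush() without a kevent() round-trip or an entry in the timeouts queue.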
+ if (nanoseconds == 0) { + completion.* = .{ + .next = null, + .context = context, + .operation = undefined, + .callback = struct { + fn on_complete(_io: *IO, _completion: *Completion) void { + _ = _io; + const _context: Context = @ptrCast(@alignCast(_completion.context)); + callback(_context, _completion, {}); + } + }.on_complete, + }; + + self.completed.push(completion); + return; + } + + self.submit( + context, + callback, + completion, + .timeout, + .{ + .expires = self.time.monotonic() + nanoseconds, + }, + struct { + fn do_operation(_: anytype) TimeoutError!void { + return; // timeouts don't have errors for now + } + }, + ); + } + + pub const WriteError = os.PWriteError; + + pub fn write( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: WriteError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []const u8, + offset: u64, + ) void { + self.submit( + context, + callback, + completion, + .write, + .{ + .fd = fd, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .offset = offset, + }, + struct { + fn do_operation(op: anytype) WriteError!usize { + return os.pwrite(op.fd, op.buf[0..op.len], op.offset); + } + }, + ); + } + + pub const INVALID_SOCKET = -1; + + /// Creates a socket that can be used for async operations with the IO instance. + pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t { + _ = self; + + const fd = try os.socket(family, sock_type | os.SOCK.NONBLOCK, protocol); + errdefer os.closeSocket(fd); + + // darwin doesn't support os.MSG_NOSIGNAL, but instead a socket option to avoid SIGPIPE. + try os.setsockopt(fd, os.SOL.SOCKET, os.SO.NOSIGPIPE, &mem.toBytes(@as(c_int, 1))); + return fd; + } + + /// Opens a directory with read only access. + pub fn open_dir(dir_path: []const u8) !os.fd_t { + return os.open(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0); + } + + pub const INVALID_FILE: os.fd_t = -1; + + /// Opens or creates a journal file: + /// - For reading and writing. + /// - For Direct I/O (required on darwin). + /// - Obtains an advisory exclusive lock to the file descriptor. + /// - Allocates the file contiguously on disk if this is supported by the file system. + /// - Ensures that the file data (and file inode in the parent directory) is durable on disk. + /// The caller is responsible for ensuring that the parent directory inode is durable. + /// - Verifies that the file size matches the expected file size before returning. + pub fn open_file( + dir_fd: os.fd_t, + relative_path: []const u8, + size: u64, + method: enum { create, create_or_open, open }, + ) !os.fd_t { + assert(relative_path.len > 0); + assert(size % sector_size == 0); + + // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock. + // This is much stronger than an advisory exclusive lock, and is required on some platforms. + + // Opening with O_DSYNC is essential for both durability and correctness. + // O_DSYNC enables us to omit fsync() calls in the data plane, since we sync to the disk on every write. + var flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC; + var mode: os.mode_t = 0; + + // TODO Document this and investigate whether this is in fact correct to set here. 
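+        // Note: O.LARGEFILE only matters for 32-bit file offsets; the @hasDecl() guard keeps
+        // this line a no-op on targets (such as darwin) whose `os.O` does not define it.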
+ if (@hasDecl(os.O, "LARGEFILE")) flags |= os.O.LARGEFILE; + + switch (method) { + .create => { + flags |= os.O.CREAT; + flags |= os.O.EXCL; + mode = 0o666; + log.info("creating \"{s}\"...", .{relative_path}); + }, + .create_or_open => { + flags |= os.O.CREAT; + mode = 0o666; + log.info("opening or creating \"{s}\"...", .{relative_path}); + }, + .open => { + log.info("opening \"{s}\"...", .{relative_path}); + }, + } + + // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file: + assert((flags & os.O.DSYNC) > 0); + + // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page) + assert(!std.fs.path.isAbsolute(relative_path)); + const fd = try os.openat(dir_fd, relative_path, flags, mode); + // TODO Return a proper error message when the path exists or does not exist (init/start). + errdefer os.close(fd); + + // TODO Check that the file is actually a file. + + // On darwin assume that Direct I/O is always supported. + // Use F_NOCACHE to disable the page cache as O_DIRECT doesn't exist. + if (direct_io) { + _ = try os.fcntl(fd, os.F.NOCACHE, 1); + } + + // Obtain an advisory exclusive lock that works only if all processes actually use flock(). + // LOCK_NB means that we want to fail the lock without waiting if another process has it. + os.flock(fd, os.LOCK.EX | os.LOCK.NB) catch |err| switch (err) { + error.WouldBlock => @panic("another process holds the data file lock"), + else => return err, + }; + + // Ask the file system to allocate contiguous sectors for the file (if possible): + // If the file system does not support `fallocate()`, then this could mean more seeks or a + // panic if we run out of disk space (ENOSPC). + if (method == .create) try fs_allocate(fd, size); + + // The best fsync strategy is always to fsync before reading because this prevents us from + // making decisions on data that was never durably written by a previously crashed process. + // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC. + // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out. + try fs_sync(fd); + + // We fsync the parent directory to ensure that the file inode is durably written. + // The caller is responsible for the parent directory inode stored under the grandparent. + // We always do this when opening because we don't know if this was done before crashing. + try fs_sync(dir_fd); + + // TODO Document that `size` is now `data_file_size_min` from `main.zig`. + const stat = try os.fstat(fd); + if (stat.size < size) @panic("data file inode size was truncated or corrupted"); + + return fd; + } + + /// Darwin's fsync() syscall does not flush past the disk cache. We must use F_FULLFSYNC instead. + /// https://twitter.com/TigerBeetleDB/status/1422491736224436225 + fn fs_sync(fd: os.fd_t) !void { + _ = os.fcntl(fd, os.F.FULLFSYNC, 1) catch return os.fsync(fd); + } + + /// Allocates a file contiguously using fallocate() if supported. + /// Alternatively, writes to the last sector so that at least the file size is correct. + fn fs_allocate(fd: os.fd_t, size: u64) !void { + log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)}); + + // Darwin doesn't have fallocate() but we can simulate it using fcntl()s. 
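+        // (F_PREALLOCATE below reserves the space; the ftruncate() at the end then extends
+        // the logical file size so that fstat().size reflects it.)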
+ // + // https://stackoverflow.com/a/11497568 + // https://api.kde.org/frameworks/kcoreaddons/html/posix__fallocate__mac_8h_source.html + // http://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61 + + const F_ALLOCATECONTIG = 0x2; // Allocate contiguous space. + const F_ALLOCATEALL = 0x4; // Allocate all or nothing. + const F_PEOFPOSMODE = 3; // Use relative offset from the seek pos mode. + const fstore_t = extern struct { + fst_flags: c_uint, + fst_posmode: c_int, + fst_offset: os.off_t, + fst_length: os.off_t, + fst_bytesalloc: os.off_t, + }; + + var store = fstore_t{ + .fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL, + .fst_posmode = F_PEOFPOSMODE, + .fst_offset = 0, + .fst_length = @as(os.off_t, @intCast(size)), + .fst_bytesalloc = 0, + }; + + // Try to pre-allocate contiguous space and fall back to default non-contiguous. + var res = os.system.fcntl(fd, os.F.PREALLOCATE, @intFromPtr(&store)); + if (os.errno(res) != .SUCCESS) { + store.fst_flags = F_ALLOCATEALL; + res = os.system.fcntl(fd, os.F.PREALLOCATE, @intFromPtr(&store)); + } + + switch (os.errno(res)) { + .SUCCESS => {}, + .ACCES => unreachable, // F_SETLK or F_SETSIZE of F_WRITEBOOTSTRAP + .BADF => return error.FileDescriptorInvalid, + .DEADLK => unreachable, // F_SETLKW + .INTR => unreachable, // F_SETLKW + .INVAL => return error.ArgumentsInvalid, // for F_PREALLOCATE (offset invalid) + .MFILE => unreachable, // F_DUPFD or F_DUPED + .NOLCK => unreachable, // F_SETLK or F_SETLKW + .OVERFLOW => return error.FileTooBig, + .SRCH => unreachable, // F_SETOWN + .OPNOTSUPP => return error.OperationNotSupported, // not reported but need same error union + else => |errno| return os.unexpectedErrno(errno), + } + + // Now actually perform the allocation. + return os.ftruncate(fd, size) catch |err| switch (err) { + error.AccessDenied => error.PermissionDenied, + else => |e| e, + }; + } +}; diff --git a/src/io/linux.zig b/src/io/linux.zig new file mode 100644 index 0000000..eefb08c --- /dev/null +++ b/src/io/linux.zig @@ -0,0 +1,1126 @@ +const std = @import("std"); +const assert = std.debug.assert; +const os = std.os; +const linux = os.linux; +const IO_Uring = linux.IO_Uring; +const io_uring_cqe = linux.io_uring_cqe; +const io_uring_sqe = linux.io_uring_sqe; +const log = std.log.scoped(.io); + +const stdx = @import("../stdx.zig"); +const FIFO = @import("../fifo.zig").FIFO; +const bufferLimit = @import("../io.zig").bufferLimit; +const parse_dirty_semver = stdx.parse_dirty_semver; + +const direct_io = true; +const direct_io_required = true; +const sector_size = 4096; + +pub const IO = struct { + ring: IO_Uring, + + /// Operations not yet submitted to the kernel and waiting on available space in the + /// submission queue. + unqueued: FIFO(Completion) = .{ .name = "io_unqueued" }, + + /// Completions that are ready to have their callbacks run. + completed: FIFO(Completion) = .{ .name = "io_completed" }, + + ios_queued: u64 = 0, + ios_in_kernel: u64 = 0, + + pub fn init(entries: u12, flags: u32) !IO { + // Detect the linux version to ensure that we support all io_uring ops used. 
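+        // uname().release is a string like "5.15.0-91-generic"; parse_dirty_semver is
+        // expected to tolerate the distribution suffix after "major.minor.patch"
+        // (hence "dirty").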
+ const uts = std.os.uname(); + const version = try parse_dirty_semver(&uts.release); + if (version.order(std.SemanticVersion{ .major = 5, .minor = 5, .patch = 0 }) == .lt) { + @panic("Linux kernel 5.5 or greater is required for io_uring OP_ACCEPT"); + } + + return IO{ .ring = try IO_Uring.init(entries, flags) }; + } + + pub fn deinit(self: *IO) void { + self.ring.deinit(); + } + + /// Pass all queued submissions to the kernel and peek for completions. + pub fn tick(self: *IO) !void { + // We assume that all timeouts submitted by `run_for_ns()` will be reaped by `run_for_ns()` + // and that `tick()` and `run_for_ns()` cannot be run concurrently. + // Therefore `timeouts` here will never be decremented and `etime` will always be false. + var timeouts: usize = 0; + var etime = false; + + try self.flush(0, &timeouts, &etime); + assert(etime == false); + + // Flush any SQEs that were queued while running completion callbacks in `flush()`: + // This is an optimization to avoid delaying submissions until the next tick. + // At the same time, we do not flush any ready CQEs since SQEs may complete synchronously. + // We guard against an io_uring_enter() syscall if we know we do not have any queued SQEs. + // We cannot use `self.ring.sq_ready()` here since this counts flushed and unflushed SQEs. + const queued = self.ring.sq.sqe_tail -% self.ring.sq.sqe_head; + if (queued > 0) { + try self.flush_submissions(0, &timeouts, &etime); + assert(etime == false); + } + } + + /// Pass all queued submissions to the kernel and run for `nanoseconds`. + /// The `nanoseconds` argument is a u63 to allow coercion to the i64 used + /// in the kernel_timespec struct. + pub fn run_for_ns(self: *IO, nanoseconds: u63) !void { + // We must use the same clock source used by io_uring (CLOCK_MONOTONIC) since we specify the + // timeout below as an absolute value. Otherwise, we may deadlock if the clock sources are + // dramatically different. Any kernel that supports io_uring will support CLOCK_MONOTONIC. + var current_ts: os.timespec = undefined; + os.clock_gettime(os.CLOCK.MONOTONIC, ¤t_ts) catch unreachable; + // The absolute CLOCK_MONOTONIC time after which we may return from this function: + const timeout_ts: os.linux.kernel_timespec = .{ + .tv_sec = current_ts.tv_sec, + .tv_nsec = current_ts.tv_nsec + nanoseconds, + }; + var timeouts: usize = 0; + var etime = false; + while (!etime) { + const timeout_sqe = self.ring.get_sqe() catch blk: { + // The submission queue is full, so flush submissions to make space: + try self.flush_submissions(0, &timeouts, &etime); + break :blk self.ring.get_sqe() catch unreachable; + }; + // Submit an absolute timeout that will be canceled if any other SQE completes first: + linux.io_uring_prep_timeout(timeout_sqe, &timeout_ts, 1, os.linux.IORING_TIMEOUT_ABS); + timeout_sqe.user_data = 0; + timeouts += 1; + + // We don't really want to count this timeout as an io, + // but it's tricky to track separately. + self.ios_queued += 1; + + // The amount of time this call will block is bounded by the timeout we just submitted: + try self.flush(1, &timeouts, &etime); + } + // Reap any remaining timeouts, which reference the timespec in the current stack frame. + // The busy loop here is required to avoid a potential deadlock, as the kernel determines + // when the timeouts are pushed to the completion queue, not us. 
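+        // flush_completions() decrements `timeouts` each time it reaps a CQE with
+        // user_data == 0, which is what eventually terminates this loop.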
+ while (timeouts > 0) _ = try self.flush_completions(0, &timeouts, &etime); + } + + fn flush(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void { + // Flush any queued SQEs and reuse the same syscall to wait for completions if required: + try self.flush_submissions(wait_nr, timeouts, etime); + // We can now just peek for any CQEs without waiting and without another syscall: + try self.flush_completions(0, timeouts, etime); + + // The SQE array is empty from flush_submissions(). Fill it up with unqueued completions. + // This runs before `self.completed` is flushed below to prevent new IO from reserving SQE + // slots and potentially starving those in `self.unqueued`. + // Loop over a copy to avoid an infinite loop of `enqueue()` re-adding to `self.unqueued`. + { + var copy = self.unqueued; + self.unqueued.reset(); + while (copy.pop()) |completion| self.enqueue(completion); + } + + // Run completions only after all completions have been flushed: + // Loop until all completions are processed. Calls to complete() may queue more work + // and extend the duration of the loop, but this is fine as it 1) executes completions + // that become ready without going through another syscall from flush_submissions() and + // 2) potentially queues more SQEs to take advantage more of the next flush_submissions(). + while (self.completed.pop()) |completion| completion.complete(); + + // At this point, unqueued could have completions either by 1) those who didn't get an SQE + // during the popping of unqueued or 2) completion.complete() which start new IO. These + // unqueued completions will get priority to acquiring SQEs on the next flush(). + } + + fn flush_completions(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void { + var cqes: [256]io_uring_cqe = undefined; + var wait_remaining = wait_nr; + while (true) { + // Guard against waiting indefinitely (if there are too few requests inflight), + // especially if this is not the first time round the loop: + const completed = self.ring.copy_cqes(&cqes, wait_remaining) catch |err| switch (err) { + error.SignalInterrupt => continue, + else => return err, + }; + if (completed > wait_remaining) wait_remaining = 0 else wait_remaining -= completed; + for (cqes[0..completed]) |cqe| { + self.ios_in_kernel -= 1; + + if (cqe.user_data == 0) { + timeouts.* -= 1; + // We are only done if the timeout submitted was completed due to time, not if + // it was completed due to the completion of an event, in which case `cqe.res` + // would be 0. It is possible for multiple timeout operations to complete at the + // same time if the nanoseconds value passed to `run_for_ns()` is very short. + if (-cqe.res == @intFromEnum(os.E.TIME)) etime.* = true; + continue; + } + const completion = @as(*Completion, @ptrFromInt(@as(usize, @intCast(cqe.user_data)))); + completion.result = cqe.res; + // We do not run the completion here (instead appending to a linked list) to avoid: + // * recursion through `flush_submissions()` and `flush_completions()`, + // * unbounded stack usage, and + // * confusing stack traces. + self.completed.push(completion); + } + + if (completed < cqes.len) break; + } + } + + fn flush_submissions(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void { + while (true) { + const submitted = self.ring.submit_and_wait(wait_nr) catch |err| switch (err) { + error.SignalInterrupt => continue, + // Wait for some completions and then try again: + // See https://github.com/axboe/liburing/issues/281 re: error.SystemResources. 
+ // Be careful also that copy_cqes() will flush before entering to wait (it does): + // https://github.com/axboe/liburing/commit/35c199c48dfd54ad46b96e386882e7ac341314c5 + error.CompletionQueueOvercommitted, error.SystemResources => { + try self.flush_completions(1, timeouts, etime); + continue; + }, + else => return err, + }; + + self.ios_queued -= submitted; + self.ios_in_kernel += submitted; + + break; + } + } + + fn enqueue(self: *IO, completion: *Completion) void { + const sqe = self.ring.get_sqe() catch |err| switch (err) { + error.SubmissionQueueFull => { + self.unqueued.push(completion); + return; + }, + }; + completion.prep(sqe); + + self.ios_queued += 1; + } + + /// This struct holds the data needed for a single io_uring operation + pub const Completion = struct { + io: *IO, + result: i32 = undefined, + next: ?*Completion = null, + operation: Operation, + context: ?*anyopaque, + callback: *const fn (context: ?*anyopaque, completion: *Completion, result: *const anyopaque) void, + + fn prep(completion: *Completion, sqe: *io_uring_sqe) void { + switch (completion.operation) { + .accept => |*op| { + linux.io_uring_prep_accept( + sqe, + op.socket, + &op.address, + &op.address_size, + os.SOCK.CLOEXEC, + ); + }, + .close => |op| { + linux.io_uring_prep_close(sqe, op.fd); + }, + .connect => |*op| { + linux.io_uring_prep_connect( + sqe, + op.socket, + &op.address.any, + op.address.getOsSockLen(), + ); + }, + .read => |op| { + linux.io_uring_prep_read( + sqe, + op.fd, + op.buffer[0..bufferLimit(op.buffer.len)], + op.offset, + ); + }, + .recv => |op| { + linux.io_uring_prep_recv(sqe, op.socket, op.buffer, os.MSG.NOSIGNAL); + }, + .send => |op| { + linux.io_uring_prep_send(sqe, op.socket, op.buffer, os.MSG.NOSIGNAL); + }, + .timeout => |*op| { + linux.io_uring_prep_timeout(sqe, &op.timespec, 0, 0); + }, + .write => |op| { + linux.io_uring_prep_write( + sqe, + op.fd, + op.buffer[0..bufferLimit(op.buffer.len)], + op.offset, + ); + }, + } + sqe.user_data = @intFromPtr(completion); + } + + fn complete(completion: *Completion) void { + switch (completion.operation) { + .accept => { + const result: anyerror!os.socket_t = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .AGAIN => error.WouldBlock, + .BADF => error.FileDescriptorInvalid, + .CONNABORTED => error.ConnectionAborted, + .FAULT => unreachable, + .INVAL => error.SocketNotListening, + .MFILE => error.ProcessFdQuotaExceeded, + .NFILE => error.SystemFdQuotaExceeded, + .NOBUFS => error.SystemResources, + .NOMEM => error.SystemResources, + .NOTSOCK => error.FileDescriptorNotASocket, + .OPNOTSUPP => error.OperationNotSupported, + .PERM => error.PermissionDenied, + .PROTO => error.ProtocolFailure, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + break :blk @as(os.socket_t, @intCast(completion.result)); + } + }; + call_callback(completion, &result); + }, + .close => { + const result: anyerror!void = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => {}, // A success, see https://github.com/ziglang/zig/issues/2425 + .BADF => error.FileDescriptorInvalid, + .DQUOT => error.DiskQuota, + .IO => error.InputOutput, + .NOSPC => error.NoSpaceLeft, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + assert(completion.result == 0); + } + }; + call_callback(completion, &result); + }, + .connect => { + const 
result: anyerror!void = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .ACCES => error.AccessDenied, + .ADDRINUSE => error.AddressInUse, + .ADDRNOTAVAIL => error.AddressNotAvailable, + .AFNOSUPPORT => error.AddressFamilyNotSupported, + .AGAIN, .INPROGRESS => error.WouldBlock, + .ALREADY => error.OpenAlreadyInProgress, + .BADF => error.FileDescriptorInvalid, + .CONNREFUSED => error.ConnectionRefused, + .CONNRESET => error.ConnectionResetByPeer, + .FAULT => unreachable, + .ISCONN => error.AlreadyConnected, + .NETUNREACH => error.NetworkUnreachable, + .NOENT => error.FileNotFound, + .NOTSOCK => error.FileDescriptorNotASocket, + .PERM => error.PermissionDenied, + .PROTOTYPE => error.ProtocolNotSupported, + .TIMEDOUT => error.ConnectionTimedOut, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + assert(completion.result == 0); + } + }; + call_callback(completion, &result); + }, + .read => { + const result: anyerror!usize = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .AGAIN => error.WouldBlock, + .BADF => error.NotOpenForReading, + .CONNRESET => error.ConnectionResetByPeer, + .FAULT => unreachable, + .INVAL => error.Alignment, + .IO => error.InputOutput, + .ISDIR => error.IsDir, + .NOBUFS => error.SystemResources, + .NOMEM => error.SystemResources, + .NXIO => error.Unseekable, + .OVERFLOW => error.Unseekable, + .SPIPE => error.Unseekable, + .TIMEDOUT => error.ConnectionTimedOut, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + break :blk @as(usize, @intCast(completion.result)); + } + }; + call_callback(completion, &result); + }, + .recv => { + const result: anyerror!usize = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .AGAIN => error.WouldBlock, + .BADF => error.FileDescriptorInvalid, + .CONNREFUSED => error.ConnectionRefused, + .FAULT => unreachable, + .INVAL => unreachable, + .NOMEM => error.SystemResources, + .NOTCONN => error.SocketNotConnected, + .NOTSOCK => error.FileDescriptorNotASocket, + .CONNRESET => error.ConnectionResetByPeer, + .TIMEDOUT => error.ConnectionTimedOut, + .OPNOTSUPP => error.OperationNotSupported, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + break :blk @as(usize, @intCast(completion.result)); + } + }; + call_callback(completion, &result); + }, + .send => { + const result: anyerror!usize = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .ACCES => error.AccessDenied, + .AGAIN => error.WouldBlock, + .ALREADY => error.FastOpenAlreadyInProgress, + .AFNOSUPPORT => error.AddressFamilyNotSupported, + .BADF => error.FileDescriptorInvalid, + .CONNRESET => error.ConnectionResetByPeer, + .DESTADDRREQ => unreachable, + .FAULT => unreachable, + .INVAL => unreachable, + .ISCONN => unreachable, + .MSGSIZE => error.MessageTooBig, + .NOBUFS => error.SystemResources, + .NOMEM => error.SystemResources, + .NOTCONN => error.SocketNotConnected, + .NOTSOCK => error.FileDescriptorNotASocket, + .OPNOTSUPP => error.OperationNotSupported, + .PIPE => error.BrokenPipe, + .TIMEDOUT => 
error.ConnectionTimedOut, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + break :blk @as(usize, @intCast(completion.result)); + } + }; + call_callback(completion, &result); + }, + .timeout => { + assert(completion.result < 0); + const result: anyerror!void = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .CANCELED => error.Canceled, + .TIME => {}, // A success. + else => |errno| os.unexpectedErrno(errno), + }; + call_callback(completion, &result); + }, + .write => { + const result: anyerror!usize = blk: { + if (completion.result < 0) { + const err = switch (@as(os.E, @enumFromInt(-completion.result))) { + .INTR => { + completion.io.enqueue(completion); + return; + }, + .AGAIN => error.WouldBlock, + .BADF => error.NotOpenForWriting, + .DESTADDRREQ => error.NotConnected, + .DQUOT => error.DiskQuota, + .FAULT => unreachable, + .FBIG => error.FileTooBig, + .INVAL => error.Alignment, + .IO => error.InputOutput, + .NOSPC => error.NoSpaceLeft, + .NXIO => error.Unseekable, + .OVERFLOW => error.Unseekable, + .PERM => error.AccessDenied, + .PIPE => error.BrokenPipe, + .SPIPE => error.Unseekable, + else => |errno| os.unexpectedErrno(errno), + }; + break :blk err; + } else { + break :blk @as(usize, @intCast(completion.result)); + } + }; + call_callback(completion, &result); + }, + } + } + }; + + fn call_callback( + completion: *Completion, + result: *const anyopaque, + ) void { + completion.callback(completion.context, completion, result); + } + + /// This union encodes the set of operations supported as well as their arguments. + const Operation = union(enum) { + accept: struct { + socket: os.socket_t, + address: os.sockaddr = undefined, + address_size: os.socklen_t = @sizeOf(os.sockaddr), + }, + close: struct { + fd: os.fd_t, + }, + connect: struct { + socket: os.socket_t, + address: std.net.Address, + }, + read: struct { + fd: os.fd_t, + buffer: []u8, + offset: u64, + }, + recv: struct { + socket: os.socket_t, + buffer: []u8, + }, + send: struct { + socket: os.socket_t, + buffer: []const u8, + }, + timeout: struct { + timespec: os.linux.kernel_timespec, + }, + write: struct { + fd: os.fd_t, + buffer: []const u8, + offset: u64, + }, + }; + + pub const AcceptError = error{ + WouldBlock, + FileDescriptorInvalid, + ConnectionAborted, + SocketNotListening, + ProcessFdQuotaExceeded, + SystemFdQuotaExceeded, + SystemResources, + FileDescriptorNotASocket, + OperationNotSupported, + PermissionDenied, + ProtocolFailure, + } || os.UnexpectedError; + + pub fn accept( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: AcceptError!os.socket_t, + ) void, + completion: *Completion, + socket: os.socket_t, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const AcceptError!os.socket_t, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .accept = .{ + .socket = socket, + .address = undefined, + .address_size = @sizeOf(os.sockaddr), + }, + }, + }; + self.enqueue(completion); + } + + pub const CloseError = error{ + FileDescriptorInvalid, + DiskQuota, + InputOutput, + NoSpaceLeft, + } || os.UnexpectedError; + + pub fn close( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + 
completion: *Completion, + result: CloseError!void, + ) void, + completion: *Completion, + fd: os.fd_t, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const CloseError!void, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .close = .{ .fd = fd }, + }, + }; + self.enqueue(completion); + } + + pub const ConnectError = error{ + AccessDenied, + AddressInUse, + AddressNotAvailable, + AddressFamilyNotSupported, + WouldBlock, + OpenAlreadyInProgress, + FileDescriptorInvalid, + ConnectionRefused, + AlreadyConnected, + NetworkUnreachable, + FileNotFound, + FileDescriptorNotASocket, + PermissionDenied, + ProtocolNotSupported, + ConnectionTimedOut, + SystemResources, + } || os.UnexpectedError; + + pub fn connect( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ConnectError!void, + ) void, + completion: *Completion, + socket: os.socket_t, + address: std.net.Address, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const ConnectError!void, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .connect = .{ + .socket = socket, + .address = address, + }, + }, + }; + self.enqueue(completion); + } + + pub const ReadError = error{ + WouldBlock, + NotOpenForReading, + ConnectionResetByPeer, + Alignment, + InputOutput, + IsDir, + SystemResources, + Unseekable, + ConnectionTimedOut, + } || os.UnexpectedError; + + pub fn read( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ReadError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []u8, + offset: u64, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const ReadError!usize, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .read = .{ + .fd = fd, + .buffer = buffer, + .offset = offset, + }, + }, + }; + self.enqueue(completion); + } + + pub const RecvError = error{ + WouldBlock, + FileDescriptorInvalid, + ConnectionRefused, + SystemResources, + SocketNotConnected, + FileDescriptorNotASocket, + ConnectionTimedOut, + OperationNotSupported, + } || os.UnexpectedError; + + pub fn recv( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: RecvError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []u8, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const RecvError!usize, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .recv = .{ + .socket = socket, + .buffer = buffer, + }, + }, + }; + self.enqueue(completion); + } + + pub const SendError = error{ + AccessDenied, + WouldBlock, + FastOpenAlreadyInProgress, + AddressFamilyNotSupported, + FileDescriptorInvalid, + 
ConnectionResetByPeer, + MessageTooBig, + SystemResources, + SocketNotConnected, + FileDescriptorNotASocket, + OperationNotSupported, + BrokenPipe, + ConnectionTimedOut, + } || os.UnexpectedError; + + pub fn send( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: SendError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []const u8, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const SendError!usize, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .send = .{ + .socket = socket, + .buffer = buffer, + }, + }, + }; + self.enqueue(completion); + } + + pub const TimeoutError = error{Canceled} || os.UnexpectedError; + + pub fn timeout( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: TimeoutError!void, + ) void, + completion: *Completion, + nanoseconds: u63, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const TimeoutError!void, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .timeout = .{ + .timespec = .{ .tv_sec = 0, .tv_nsec = nanoseconds }, + }, + }, + }; + + // Special case a zero timeout as a yield. + if (nanoseconds == 0) { + completion.result = -@as(i32, @intCast(@intFromEnum(std.os.E.TIME))); + self.completed.push(completion); + return; + } + + self.enqueue(completion); + } + + pub const WriteError = error{ + WouldBlock, + NotOpenForWriting, + NotConnected, + DiskQuota, + FileTooBig, + Alignment, + InputOutput, + NoSpaceLeft, + Unseekable, + AccessDenied, + BrokenPipe, + } || os.UnexpectedError; + + pub fn write( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: WriteError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []const u8, + offset: u64, + ) void { + completion.* = .{ + .io = self, + .context = context, + .callback = struct { + fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void { + callback( + @ptrCast(@alignCast(ctx)), + comp, + @as(*const WriteError!usize, @ptrCast(@alignCast(res))).*, + ); + } + }.wrapper, + .operation = .{ + .write = .{ + .fd = fd, + .buffer = buffer, + .offset = offset, + }, + }, + }; + self.enqueue(completion); + } + + pub const INVALID_SOCKET = -1; + + /// Creates a socket that can be used for async operations with the IO instance. + pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t { + _ = self; + return os.socket(family, sock_type, protocol); + } + + /// Opens a directory with read only access. + pub fn open_dir(dir_path: []const u8) !os.fd_t { + return os.open(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0); + } + + pub const INVALID_FILE: os.fd_t = -1; + + /// Opens or creates a journal file: + /// - For reading and writing. + /// - For Direct I/O (if possible in development mode, but required in production mode). + /// - Obtains an advisory exclusive lock to the file descriptor. + /// - Allocates the file contiguously on disk if this is supported by the file system. 
+ /// - Ensures that the file data (and file inode in the parent directory) is durable on disk. + /// The caller is responsible for ensuring that the parent directory inode is durable. + /// - Verifies that the file size matches the expected file size before returning. + pub fn open_file( + dir_fd: os.fd_t, + relative_path: []const u8, + size: u64, + method: enum { create, create_or_open, open }, + ) !os.fd_t { + assert(relative_path.len > 0); + assert(size % sector_size == 0); + + // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock. + // This is much stronger than an advisory exclusive lock, and is required on some platforms. + + var flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC; + var mode: os.mode_t = 0; + + // TODO Document this and investigate whether this is in fact correct to set here. + if (@hasDecl(os.O, "LARGEFILE")) flags |= os.O.LARGEFILE; + + var direct_io_supported = false; + var dir_on_tmpfs = try fs_is_tmpfs(dir_fd); + + if (dir_on_tmpfs) { + log.warn("tmpfs is not durable, and your data will be lost on reboot", .{}); + } + + // Special case. tmpfs doesn't support Direct I/O. Normally we would panic here (see below) + // but being able to benchmark production workloads on tmpfs is very useful for removing + // disk speed from the equation. + if (direct_io and !dir_on_tmpfs) { + direct_io_supported = try fs_supports_direct_io(dir_fd); + if (direct_io_supported) { + flags |= os.O.DIRECT; + } else if (!direct_io_required) { + log.warn("file system does not support Direct I/O", .{}); + } else { + // We require Direct I/O for safety to handle fsync failure correctly, and therefore + // panic in production if it is not supported. + @panic("file system does not support Direct I/O"); + } + } + + switch (method) { + .create => { + flags |= os.O.CREAT; + flags |= os.O.EXCL; + mode = 0o666; + log.info("creating \"{s}\"...", .{relative_path}); + }, + .create_or_open => { + flags |= os.O.CREAT; + mode = 0o666; + log.info("opening or creating \"{s}\"...", .{relative_path}); + }, + .open => { + log.info("opening \"{s}\"...", .{relative_path}); + }, + } + + // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file: + assert((flags & os.O.DSYNC) > 0); + + // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page) + assert(!std.fs.path.isAbsolute(relative_path)); + const fd = try os.openat(dir_fd, relative_path, flags, mode); + // TODO Return a proper error message when the path exists or does not exist (init/start). + errdefer os.close(fd); + + // TODO Check that the file is actually a file. + + // Obtain an advisory exclusive lock that works only if all processes actually use flock(). + // LOCK_NB means that we want to fail the lock without waiting if another process has it. + os.flock(fd, os.LOCK.EX | os.LOCK.NB) catch |err| switch (err) { + error.WouldBlock => @panic("another process holds the data file lock"), + else => return err, + }; + + // Ask the file system to allocate contiguous sectors for the file (if possible): + // If the file system does not support `fallocate()`, then this could mean more seeks or a + // panic if we run out of disk space (ENOSPC). 
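+        // Note for callers: O_DIRECT also constrains the IO itself. Reads and
+        // writes against this fd must use sector-aligned buffers, offsets, and
+        // lengths, e.g. (illustrative only):
+        //
+        //     var buffer: [sector_size]u8 align(sector_size) = undefined;
+        //
+        // The fallocate() fallback below writes exactly such a sector.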
+ if (method == .create) { + log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)}); + fs_allocate(fd, size) catch |err| switch (err) { + error.OperationNotSupported => { + log.warn("file system does not support fallocate(), an ENOSPC will panic", .{}); + log.info("allocating by writing to the last sector of the file instead...", .{}); + + const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size; + + // Handle partial writes where the physical sector is less than a logical sector: + const write_offset = size - sector.len; + var written: usize = 0; + while (written < sector.len) { + written += try os.pwrite(fd, sector[written..], write_offset + written); + } + }, + else => |e| return e, + }; + } + + // The best fsync strategy is always to fsync before reading because this prevents us from + // making decisions on data that was never durably written by a previously crashed process. + // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC. + // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out. + try os.fsync(fd); + + // We fsync the parent directory to ensure that the file inode is durably written. + // The caller is responsible for the parent directory inode stored under the grandparent. + // We always do this when opening because we don't know if this was done before crashing. + try os.fsync(dir_fd); + + const stat = try os.fstat(fd); + if (stat.size < size) @panic("data file inode size was truncated or corrupted"); + + return fd; + } + + /// Detects whether the underlying file system for a given directory fd is tmpfs. This is used + /// to relax our Direct I/O check - running on tmpfs for benchmarking is useful. + fn fs_is_tmpfs(dir_fd: std.os.fd_t) !bool { + var statfs: stdx.StatFs = undefined; + + while (true) { + const res = stdx.fstatfs(dir_fd, &statfs); + switch (os.linux.getErrno(res)) { + .SUCCESS => { + return statfs.f_type == stdx.TmpfsMagic; + }, + .INTR => continue, + else => |err| return os.unexpectedErrno(err), + } + } + } + + /// Detects whether the underlying file system for a given directory fd supports Direct I/O. + /// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume. + fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool { + if (!@hasDecl(std.os.O, "DIRECT")) return false; + + const path = "fs_supports_direct_io"; + const dir = std.fs.Dir{ .fd = dir_fd }; + const fd = try os.openatZ(dir_fd, path, os.O.CLOEXEC | os.O.CREAT | os.O.TRUNC, 0o666); + defer os.close(fd); + defer dir.deleteFile(path) catch {}; + + while (true) { + const res = os.linux.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0); + switch (os.linux.getErrno(res)) { + .SUCCESS => { + os.close(@as(os.fd_t, @intCast(res))); + return true; + }, + .INTR => continue, + .INVAL => return false, + else => |err| return os.unexpectedErrno(err), + } + } + } + + /// Allocates a file contiguously using fallocate() if supported. + /// Alternatively, writes to the last sector so that at least the file size is correct. 
+ fn fs_allocate(fd: os.fd_t, size: u64) !void { + const mode: i32 = 0; + const offset: i64 = 0; + const length = @as(i64, @intCast(size)); + + while (true) { + const rc = os.linux.fallocate(fd, mode, offset, length); + switch (os.linux.getErrno(rc)) { + .SUCCESS => return, + .BADF => return error.FileDescriptorInvalid, + .FBIG => return error.FileTooBig, + .INTR => continue, + .INVAL => return error.ArgumentsInvalid, + .IO => return error.InputOutput, + .NODEV => return error.NoDevice, + .NOSPC => return error.NoSpaceLeft, + .NOSYS => return error.SystemOutdated, + .OPNOTSUPP => return error.OperationNotSupported, + .PERM => return error.PermissionDenied, + .SPIPE => return error.Unseekable, + .TXTBSY => return error.FileBusy, + else => |errno| return os.unexpectedErrno(errno), + } + } + } +}; diff --git a/src/io/windows.zig b/src/io/windows.zig new file mode 100644 index 0000000..206b380 --- /dev/null +++ b/src/io/windows.zig @@ -0,0 +1,1209 @@ +const std = @import("std"); +const os = std.os; +const assert = std.debug.assert; +const log = std.log.scoped(.io); + +const FIFO = @import("../fifo.zig").FIFO; +const Time = @import("../time.zig").Time; +const bufferLimit = @import("../io.zig").bufferLimit; + +const sector_size = 4096; + +pub const IO = struct { + iocp: os.windows.HANDLE, + timer: Time = .{}, + io_pending: usize = 0, + timeouts: FIFO(Completion) = .{ .name = "io_timeouts" }, + completed: FIFO(Completion) = .{ .name = "io_completed" }, + + pub fn init(entries: u12, flags: u32) !IO { + _ = entries; + _ = flags; + + _ = try os.windows.WSAStartup(2, 2); + errdefer os.windows.WSACleanup() catch unreachable; + + const iocp = try os.windows.CreateIoCompletionPort(os.windows.INVALID_HANDLE_VALUE, null, 0, 0); + return IO{ .iocp = iocp }; + } + + pub fn deinit(self: *IO) void { + assert(self.iocp != os.windows.INVALID_HANDLE_VALUE); + os.windows.CloseHandle(self.iocp); + self.iocp = os.windows.INVALID_HANDLE_VALUE; + + os.windows.WSACleanup() catch unreachable; + } + + pub fn tick(self: *IO) !void { + return self.flush(.non_blocking); + } + + pub fn run_for_ns(self: *IO, nanoseconds: u63) !void { + const Callback = struct { + fn on_timeout(timed_out: *bool, completion: *Completion, result: TimeoutError!void) void { + _ = result catch unreachable; + _ = completion; + timed_out.* = true; + } + }; + + var timed_out = false; + var completion: Completion = undefined; + self.timeout(*bool, &timed_out, Callback.on_timeout, &completion, nanoseconds); + + while (!timed_out) { + try self.flush(.blocking); + } + } + + const FlushMode = enum { + blocking, + non_blocking, + }; + + fn flush(self: *IO, mode: FlushMode) !void { + if (self.completed.empty()) { + // Compute how long to poll by flushing timeout completions. 
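+            // (flush_timeouts() returns the smallest remaining deadline in
+            // nanoseconds across all pending timeouts, or null if none are pending.)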
+            // NOTE: this may push to the completed queue.
+            var timeout_ms: ?os.windows.DWORD = null;
+            if (self.flush_timeouts()) |expires_ns| {
+                // A 0ns expiry should have been completed, not returned.
+                assert(expires_ns != 0);
+                // Round the expiry time to the nearest millisecond.
+                const expires_ms = (expires_ns + (std.time.ns_per_ms / 2)) / std.time.ns_per_ms;
+                // Saturating cast to DWORD milliseconds.
+                const expires = std.math.cast(os.windows.DWORD, expires_ms) orelse std.math.maxInt(os.windows.DWORD);
+                // The max DWORD value is reserved for INFINITE, so cap the cast at max - 1.
+                timeout_ms = if (expires == os.windows.INFINITE) expires - 1 else expires;
+            }
+
+            // Poll for IO iff there's IO pending and flush_timeouts() found no ready completions.
+            if (self.io_pending > 0 and self.completed.empty()) {
+                // In blocking mode, we're always waiting at least until the timeout by run_for_ns.
+                // In non-blocking mode, we shouldn't wait at all.
+                const io_timeout = switch (mode) {
+                    .blocking => timeout_ms orelse @panic("IO.flush blocking unbounded"),
+                    .non_blocking => 0,
+                };
+
+                var events: [64]os.windows.OVERLAPPED_ENTRY = undefined;
+                const num_events: u32 = os.windows.GetQueuedCompletionStatusEx(
+                    self.iocp,
+                    &events,
+                    io_timeout,
+                    false, // non-alertable wait
+                ) catch |err| switch (err) {
+                    error.Timeout => 0,
+                    error.Aborted => unreachable,
+                    else => |e| return e,
+                };
+
+                assert(self.io_pending >= num_events);
+                self.io_pending -= num_events;
+
+                for (events[0..num_events]) |event| {
+                    const raw_overlapped = event.lpOverlapped;
+                    const overlapped = @fieldParentPtr(Completion.Overlapped, "raw", raw_overlapped);
+                    const completion = overlapped.completion;
+                    completion.next = null;
+                    self.completed.push(completion);
+                }
+            }
+        }
+
+        // Dequeue and invoke all the completions currently ready.
+        // Must read all `completions` before invoking the callbacks
+ var completed = self.completed; + self.completed.reset(); + while (completed.pop()) |completion| { + (completion.callback)(Completion.Context{ + .io = self, + .completion = completion, + }); + } + } + + fn flush_timeouts(self: *IO) ?u64 { + var min_expires: ?u64 = null; + var current_time: ?u64 = null; + var timeouts: ?*Completion = self.timeouts.peek(); + + // iterate through the timeouts, returning min_expires at the end + while (timeouts) |completion| { + timeouts = completion.next; + + // lazily get the current time + const now = current_time orelse self.timer.monotonic(); + current_time = now; + + // move the completion to completed if it expired + if (now >= completion.operation.timeout.deadline) { + self.timeouts.remove(completion); + self.completed.push(completion); + continue; + } + + // if it's still waiting, update min_timeout + const expires = completion.operation.timeout.deadline - now; + if (min_expires) |current_min_expires| { + min_expires = @min(expires, current_min_expires); + } else { + min_expires = expires; + } + } + + return min_expires; + } + + /// This struct holds the data needed for a single IO operation + pub const Completion = struct { + next: ?*Completion, + context: ?*anyopaque, + callback: *const fn (Context) void, + operation: Operation, + + const Context = struct { + io: *IO, + completion: *Completion, + }; + + const Overlapped = struct { + raw: os.windows.OVERLAPPED, + completion: *Completion, + }; + + const Transfer = struct { + socket: os.socket_t, + buf: os.windows.ws2_32.WSABUF, + overlapped: Overlapped, + pending: bool, + }; + + const Operation = union(enum) { + accept: struct { + overlapped: Overlapped, + listen_socket: os.socket_t, + client_socket: os.socket_t, + addr_buffer: [(@sizeOf(std.net.Address) + 16) * 2]u8 align(4), + }, + connect: struct { + socket: os.socket_t, + address: std.net.Address, + overlapped: Overlapped, + pending: bool, + }, + send: Transfer, + recv: Transfer, + read: struct { + fd: os.fd_t, + buf: [*]u8, + len: u32, + offset: u64, + }, + write: struct { + fd: os.fd_t, + buf: [*]const u8, + len: u32, + offset: u64, + }, + close: struct { + fd: os.fd_t, + }, + timeout: struct { + deadline: u64, + }, + }; + }; + + fn submit( + self: *IO, + context: anytype, + comptime callback: anytype, + completion: *Completion, + comptime op_tag: std.meta.Tag(Completion.Operation), + op_data: anytype, + comptime OperationImpl: type, + ) void { + const Callback = struct { + fn onComplete(ctx: Completion.Context) void { + // Perform the operation and get the result + const data = &@field(ctx.completion.operation, @tagName(op_tag)); + const result = OperationImpl.do_operation(ctx, data); + + // For OVERLAPPED IO, error.WouldBlock assumes that it will be completed by IOCP. 
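+                // In other words, WouldBlock here is a handoff rather than a
+                // failure: the kernel now owns the OVERLAPPED, and flush() will
+                // surface the completion later via GetQueuedCompletionStatusEx().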
+ switch (op_tag) { + .accept, .read, .recv, .connect, .write, .send => { + _ = result catch |err| switch (err) { + error.WouldBlock => { + ctx.io.io_pending += 1; + return; + }, + else => {}, + }; + }, + else => {}, + } + + // The completion is finally ready to invoke the callback + callback( + @ptrCast(@alignCast(ctx.completion.context)), + ctx.completion, + result, + ); + } + }; + + // Setup the completion with the callback wrapper above + completion.* = .{ + .next = null, + .context = @as(?*anyopaque, @ptrCast(context)), + .callback = Callback.onComplete, + .operation = @unionInit(Completion.Operation, @tagName(op_tag), op_data), + }; + + // Submit the completion onto the right queue + switch (op_tag) { + .timeout => self.timeouts.push(completion), + else => self.completed.push(completion), + } + } + + pub const AcceptError = os.AcceptError || os.SetSockOptError; + + pub fn accept( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: AcceptError!os.socket_t, + ) void, + completion: *Completion, + socket: os.socket_t, + ) void { + self.submit( + context, + callback, + completion, + .accept, + .{ + .overlapped = undefined, + .listen_socket = socket, + .client_socket = INVALID_SOCKET, + .addr_buffer = undefined, + }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) AcceptError!os.socket_t { + var flags: os.windows.DWORD = undefined; + var transferred: os.windows.DWORD = undefined; + + const rc = switch (op.client_socket) { + // When first called, the client_socket is invalid so we start the op. + INVALID_SOCKET => blk: { + // Create the socket that will be used for accept. + op.client_socket = ctx.io.open_socket( + os.AF.INET, + os.SOCK.STREAM, + os.IPPROTO.TCP, + ) catch |err| switch (err) { + error.AddressFamilyNotSupported, error.ProtocolNotSupported => unreachable, + else => |e| return e, + }; + + var sync_bytes_read: os.windows.DWORD = undefined; + op.overlapped = .{ + .raw = std.mem.zeroes(os.windows.OVERLAPPED), + .completion = ctx.completion, + }; + + // Start the asynchronous accept with the created socket. + break :blk os.windows.ws2_32.AcceptEx( + op.listen_socket, + op.client_socket, + &op.addr_buffer, + 0, + @sizeOf(std.net.Address) + 16, + @sizeOf(std.net.Address) + 16, + &sync_bytes_read, + &op.overlapped.raw, + ); + }, + // Called after accept was started, so get the result + else => os.windows.ws2_32.WSAGetOverlappedResult( + op.listen_socket, + &op.overlapped.raw, + &transferred, + os.windows.FALSE, // dont wait + &flags, + ), + }; + + // return the socket if we succeed in accepting. 
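+                    // (rc comes either from the initial AcceptEx() call or from a
+                    // later WSAGetOverlappedResult() poll; both report success the
+                    // same way, so a single path below handles both phases.)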
+ if (rc != os.windows.FALSE) { + // enables getsockopt, setsockopt, getsockname, getpeername + _ = os.windows.ws2_32.setsockopt( + op.client_socket, + os.windows.ws2_32.SOL.SOCKET, + os.windows.ws2_32.SO.UPDATE_ACCEPT_CONTEXT, + null, + 0, + ); + + return op.client_socket; + } + + // destroy the client_socket we created if we get a non WouldBlock error + errdefer |err| switch (err) { + error.WouldBlock => {}, + else => { + os.closeSocket(op.client_socket); + op.client_socket = INVALID_SOCKET; + }, + }; + + return switch (os.windows.ws2_32.WSAGetLastError()) { + .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock, + .WSANOTINITIALISED => unreachable, // WSAStartup() was called + .WSAENETDOWN => unreachable, // WinSock error + .WSAENOTSOCK => error.FileDescriptorNotASocket, + .WSAEOPNOTSUPP => error.OperationNotSupported, + .WSA_INVALID_HANDLE => unreachable, // we dont use hEvent in OVERLAPPED + .WSAEFAULT, .WSA_INVALID_PARAMETER => unreachable, // params should be ok + .WSAECONNRESET => error.ConnectionAborted, + .WSAEMFILE => unreachable, // we create our own descriptor so its available + .WSAENOBUFS => error.SystemResources, + .WSAEINTR, .WSAEINPROGRESS => unreachable, // no blocking calls + else => |err| os.windows.unexpectedWSAError(err), + }; + } + }, + ); + } + + pub const CloseError = error{ + FileDescriptorInvalid, + DiskQuota, + InputOutput, + NoSpaceLeft, + } || os.UnexpectedError; + + pub const ConnectError = os.ConnectError || error{FileDescriptorNotASocket}; + + pub fn connect( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ConnectError!void, + ) void, + completion: *Completion, + socket: os.socket_t, + address: std.net.Address, + ) void { + self.submit( + context, + callback, + completion, + .connect, + .{ + .socket = socket, + .address = address, + .overlapped = undefined, + .pending = false, + }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) ConnectError!void { + var flags: os.windows.DWORD = undefined; + var transferred: os.windows.DWORD = undefined; + + const rc = blk: { + // Poll for the result if we've already started the connect op. + if (op.pending) { + break :blk os.windows.ws2_32.WSAGetOverlappedResult( + op.socket, + &op.overlapped.raw, + &transferred, + os.windows.FALSE, // dont wait + &flags, + ); + } + + // ConnectEx requires the socket to be initially bound (INADDR_ANY) + const inaddr_any = std.mem.zeroes([4]u8); + const bind_addr = std.net.Address.initIp4(inaddr_any, 0); + os.bind( + op.socket, + &bind_addr.any, + bind_addr.getOsSockLen(), + ) catch |err| switch (err) { + error.AccessDenied => unreachable, + error.SymLinkLoop => unreachable, + error.NameTooLong => unreachable, + error.NotDir => unreachable, + error.ReadOnlyFileSystem => unreachable, + error.NetworkSubsystemFailed => unreachable, + error.AlreadyBound => unreachable, + else => |e| return e, + }; + + const LPFN_CONNECTEX = *const fn ( + Socket: os.windows.ws2_32.SOCKET, + SockAddr: *const os.windows.ws2_32.sockaddr, + SockLen: os.socklen_t, + SendBuf: ?*const anyopaque, + SendBufLen: os.windows.DWORD, + BytesSent: *os.windows.DWORD, + Overlapped: *os.windows.OVERLAPPED, + ) callconv(os.windows.WINAPI) os.windows.BOOL; + + // Find the ConnectEx function by dynamically looking it up on the socket. + // TODO: use `os.windows.loadWinsockExtensionFunction` once the function + // pointer is no longer required to be comptime. 
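+                    // (SIO_GET_EXTENSION_FUNCTION_POINTER with WSAID_CONNECTEX is
+                    // the documented way to obtain ConnectEx(), which, unlike
+                    // connect(), is not exported directly by ws2_32.dll.)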
+ var connect_ex: LPFN_CONNECTEX = undefined; + var num_bytes: os.windows.DWORD = undefined; + const guid = os.windows.ws2_32.WSAID_CONNECTEX; + switch (os.windows.ws2_32.WSAIoctl( + op.socket, + os.windows.ws2_32.SIO_GET_EXTENSION_FUNCTION_POINTER, + @as(*const anyopaque, @ptrCast(&guid)), + @sizeOf(os.windows.GUID), + @as(*anyopaque, @ptrCast(&connect_ex)), + @sizeOf(LPFN_CONNECTEX), + &num_bytes, + null, + null, + )) { + os.windows.ws2_32.SOCKET_ERROR => switch (os.windows.ws2_32.WSAGetLastError()) { + .WSAEOPNOTSUPP => unreachable, + .WSAENOTSOCK => unreachable, + else => |err| return os.windows.unexpectedWSAError(err), + }, + else => assert(num_bytes == @sizeOf(LPFN_CONNECTEX)), + } + + op.pending = true; + op.overlapped = .{ + .raw = std.mem.zeroes(os.windows.OVERLAPPED), + .completion = ctx.completion, + }; + + // Start the connect operation. + break :blk (connect_ex)( + op.socket, + &op.address.any, + op.address.getOsSockLen(), + null, + 0, + &transferred, + &op.overlapped.raw, + ); + }; + + // return if we succeeded in connecting + if (rc != os.windows.FALSE) { + // enables getsockopt, setsockopt, getsockname, getpeername + _ = os.windows.ws2_32.setsockopt( + op.socket, + os.windows.ws2_32.SOL.SOCKET, + os.windows.ws2_32.SO.UPDATE_CONNECT_CONTEXT, + null, + 0, + ); + + return; + } + + return switch (os.windows.ws2_32.WSAGetLastError()) { + .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE, .WSAEALREADY => error.WouldBlock, + .WSANOTINITIALISED => unreachable, // WSAStartup() was called + .WSAENETDOWN => unreachable, // network subsystem is down + .WSAEADDRNOTAVAIL => error.AddressNotAvailable, + .WSAEAFNOSUPPORT => error.AddressFamilyNotSupported, + .WSAECONNREFUSED => error.ConnectionRefused, + .WSAEFAULT => unreachable, // all addresses should be valid + .WSAEINVAL => unreachable, // invalid socket type + .WSAEHOSTUNREACH, .WSAENETUNREACH => error.NetworkUnreachable, + .WSAENOBUFS => error.SystemResources, + .WSAENOTSOCK => unreachable, // socket is not bound or is listening + .WSAETIMEDOUT => error.ConnectionTimedOut, + .WSA_INVALID_HANDLE => unreachable, // we dont use hEvent in OVERLAPPED + else => |err| os.windows.unexpectedWSAError(err), + }; + } + }, + ); + } + + pub const SendError = os.SendError; + + pub fn send( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: SendError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []const u8, + ) void { + const transfer = Completion.Transfer{ + .socket = socket, + .buf = os.windows.ws2_32.WSABUF{ + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .buf = @constCast(buffer.ptr), + }, + .overlapped = undefined, + .pending = false, + }; + + self.submit( + context, + callback, + completion, + .send, + transfer, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) SendError!usize { + var flags: os.windows.DWORD = undefined; + var transferred: os.windows.DWORD = undefined; + + const rc = blk: { + // Poll for the result if we've already started the send op. + if (op.pending) { + break :blk os.windows.ws2_32.WSAGetOverlappedResult( + op.socket, + &op.overlapped.raw, + &transferred, + os.windows.FALSE, // dont wait + &flags, + ); + } + + op.pending = true; + op.overlapped = .{ + .raw = std.mem.zeroes(os.windows.OVERLAPPED), + .completion = ctx.completion, + }; + + // Start the send operation. 
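+                    // (WSASend() returns 0 on immediate success and SOCKET_ERROR
+                    // otherwise; WSA_IO_PENDING is mapped to error.WouldBlock
+                    // below, meaning the completion will arrive via the IOCP.)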
+ break :blk switch (os.windows.ws2_32.WSASend( + op.socket, + @as([*]os.windows.ws2_32.WSABUF, @ptrCast(&op.buf)), + 1, // one buffer + &transferred, + 0, // no flags + &op.overlapped.raw, + null, + )) { + os.windows.ws2_32.SOCKET_ERROR => @as(os.windows.BOOL, os.windows.FALSE), + 0 => os.windows.TRUE, + else => unreachable, + }; + }; + + // Return bytes transferred on success. + if (rc != os.windows.FALSE) + return transferred; + + return switch (os.windows.ws2_32.WSAGetLastError()) { + .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock, + .WSANOTINITIALISED => unreachable, // WSAStartup() was called + .WSA_INVALID_HANDLE => unreachable, // we dont use OVERLAPPED.hEvent + .WSA_INVALID_PARAMETER => unreachable, // parameters are fine + .WSAECONNABORTED => error.ConnectionResetByPeer, + .WSAECONNRESET => error.ConnectionResetByPeer, + .WSAEFAULT => unreachable, // invalid buffer + .WSAEINTR => unreachable, // this is non blocking + .WSAEINPROGRESS => unreachable, // this is non blocking + .WSAEINVAL => unreachable, // invalid socket type + .WSAEMSGSIZE => error.MessageTooBig, + .WSAENETDOWN => error.NetworkSubsystemFailed, + .WSAENETRESET => error.ConnectionResetByPeer, + .WSAENOBUFS => error.SystemResources, + .WSAENOTCONN => error.FileDescriptorNotASocket, + .WSAEOPNOTSUPP => unreachable, // we dont use MSG_OOB or MSG_PARTIAL + .WSAESHUTDOWN => error.BrokenPipe, + .WSA_OPERATION_ABORTED => unreachable, // operation was cancelled + else => |err| os.windows.unexpectedWSAError(err), + }; + } + }, + ); + } + + pub const RecvError = os.RecvFromError; + + pub fn recv( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: RecvError!usize, + ) void, + completion: *Completion, + socket: os.socket_t, + buffer: []u8, + ) void { + const transfer = Completion.Transfer{ + .socket = socket, + .buf = os.windows.ws2_32.WSABUF{ + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .buf = buffer.ptr, + }, + .overlapped = undefined, + .pending = false, + }; + + self.submit( + context, + callback, + completion, + .recv, + transfer, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) RecvError!usize { + var flags: os.windows.DWORD = 0; // used both as input and output + var transferred: os.windows.DWORD = undefined; + + const rc = blk: { + // Poll for the result if we've already started the recv op. + if (op.pending) { + break :blk os.windows.ws2_32.WSAGetOverlappedResult( + op.socket, + &op.overlapped.raw, + &transferred, + os.windows.FALSE, // dont wait + &flags, + ); + } + + op.pending = true; + op.overlapped = .{ + .raw = std.mem.zeroes(os.windows.OVERLAPPED), + .completion = ctx.completion, + }; + + // Start the recv operation. + break :blk switch (os.windows.ws2_32.WSARecv( + op.socket, + @as([*]os.windows.ws2_32.WSABUF, @ptrCast(&op.buf)), + 1, // one buffer + &transferred, + &flags, + &op.overlapped.raw, + null, + )) { + os.windows.ws2_32.SOCKET_ERROR => @as(os.windows.BOOL, os.windows.FALSE), + 0 => os.windows.TRUE, + else => unreachable, + }; + }; + + // Return bytes received on success. 
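+                    // (A successful receive of zero bytes on a stream socket
+                    // indicates that the peer performed an orderly shutdown.)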
+ if (rc != os.windows.FALSE) + return transferred; + + return switch (os.windows.ws2_32.WSAGetLastError()) { + .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock, + .WSANOTINITIALISED => unreachable, // WSAStartup() was called + .WSA_INVALID_HANDLE => unreachable, // we dont use OVERLAPPED.hEvent + .WSA_INVALID_PARAMETER => unreachable, // parameters are fine + .WSAECONNABORTED => error.ConnectionRefused, + .WSAECONNRESET => error.ConnectionResetByPeer, + .WSAEDISCON => unreachable, // we only stream sockets + .WSAEFAULT => unreachable, // invalid buffer + .WSAEINTR => unreachable, // this is non blocking + .WSAEINPROGRESS => unreachable, // this is non blocking + .WSAEINVAL => unreachable, // invalid socket type + .WSAEMSGSIZE => error.MessageTooBig, + .WSAENETDOWN => error.NetworkSubsystemFailed, + .WSAENETRESET => error.ConnectionResetByPeer, + .WSAENOTCONN => error.SocketNotConnected, + .WSAEOPNOTSUPP => unreachable, // we dont use MSG_OOB or MSG_PARTIAL + .WSAESHUTDOWN => error.SocketNotConnected, + .WSAETIMEDOUT => error.ConnectionRefused, + .WSA_OPERATION_ABORTED => unreachable, // operation was cancelled + else => |err| os.windows.unexpectedWSAError(err), + }; + } + }, + ); + } + + pub const ReadError = error{ + WouldBlock, + NotOpenForReading, + ConnectionResetByPeer, + Alignment, + InputOutput, + IsDir, + SystemResources, + Unseekable, + ConnectionTimedOut, + } || os.UnexpectedError; + + pub fn read( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: ReadError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []u8, + offset: u64, + ) void { + self.submit( + context, + callback, + completion, + .read, + .{ + .fd = fd, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .offset = offset, + }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) ReadError!usize { + // Do a synchronous read for now. + _ = ctx; + return os.pread(op.fd, op.buf[0..op.len], op.offset) catch |err| switch (err) { + error.OperationAborted => unreachable, + error.BrokenPipe => unreachable, + error.ConnectionTimedOut => unreachable, + error.AccessDenied => error.InputOutput, + error.NetNameDeleted => unreachable, + else => |e| e, + }; + } + }, + ); + } + + pub const WriteError = os.PWriteError; + + pub fn write( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: WriteError!usize, + ) void, + completion: *Completion, + fd: os.fd_t, + buffer: []const u8, + offset: u64, + ) void { + self.submit( + context, + callback, + completion, + .write, + .{ + .fd = fd, + .buf = buffer.ptr, + .len = @as(u32, @intCast(bufferLimit(buffer.len))), + .offset = offset, + }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) WriteError!usize { + // Do a synchronous write for now. 
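+                    // As with read() above, this blocks the event loop: the
+                    // pwrite() below runs synchronously when the completion is
+                    // dequeued, rather than going through the IOCP.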
+ _ = ctx; + return os.pwrite(op.fd, op.buf[0..op.len], op.offset); + } + }, + ); + } + + pub fn close( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: CloseError!void, + ) void, + completion: *Completion, + fd: os.fd_t, + ) void { + self.submit( + context, + callback, + completion, + .close, + .{ .fd = fd }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) CloseError!void { + _ = ctx; + + // Check if the fd is a SOCKET by seeing if getsockopt() returns ENOTSOCK + // https://stackoverflow.com/a/50981652 + const socket: os.socket_t = @ptrCast(op.fd); + getsockoptError(socket) catch |err| switch (err) { + error.FileDescriptorNotASocket => return os.windows.CloseHandle(op.fd), + else => {}, + }; + + os.closeSocket(socket); + } + }, + ); + } + + pub const TimeoutError = error{Canceled} || os.UnexpectedError; + + pub fn timeout( + self: *IO, + comptime Context: type, + context: Context, + comptime callback: fn ( + context: Context, + completion: *Completion, + result: TimeoutError!void, + ) void, + completion: *Completion, + nanoseconds: u63, + ) void { + // Special case a zero timeout as a yield. + if (nanoseconds == 0) { + completion.* = .{ + .next = null, + .context = @ptrCast(context), + .operation = undefined, + .callback = struct { + fn on_complete(ctx: Completion.Context) void { + const _context: Context = @ptrCast(@alignCast(ctx.completion.context)); + callback(_context, ctx.completion, {}); + } + }.on_complete, + }; + + self.completed.push(completion); + return; + } + + self.submit( + context, + callback, + completion, + .timeout, + .{ .deadline = self.timer.monotonic() + nanoseconds }, + struct { + fn do_operation(ctx: Completion.Context, op: anytype) TimeoutError!void { + _ = ctx; + _ = op; + return; + } + }, + ); + } + + pub const INVALID_SOCKET = os.windows.ws2_32.INVALID_SOCKET; + + /// Creates a socket that can be used for async operations with the IO instance. + pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t { + // SOCK_NONBLOCK | SOCK_CLOEXEC + var flags: os.windows.DWORD = 0; + flags |= os.windows.ws2_32.WSA_FLAG_OVERLAPPED; + flags |= os.windows.ws2_32.WSA_FLAG_NO_HANDLE_INHERIT; + + const socket = try os.windows.WSASocketW( + @as(i32, @bitCast(family)), + @as(i32, @bitCast(sock_type)), + @as(i32, @bitCast(protocol)), + null, + 0, + flags, + ); + errdefer os.closeSocket(socket); + + const socket_iocp = try os.windows.CreateIoCompletionPort(socket, self.iocp, 0, 0); + assert(socket_iocp == self.iocp); + + // Ensure that synchronous IO completion doesn't queue an unneeded overlapped + // and that the event for the socket (WaitForSingleObject) doesn't need to be set. + var mode: os.windows.BYTE = 0; + mode |= os.windows.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS; + mode |= os.windows.FILE_SKIP_SET_EVENT_ON_HANDLE; + + const handle = @as(os.windows.HANDLE, @ptrCast(socket)); + try os.windows.SetFileCompletionNotificationModes(handle, mode); + + return socket; + } + + /// Opens a directory with read only access. 
+ pub fn open_dir(dir_path: []const u8) !os.fd_t { + const dir = try std.fs.cwd().openDir(dir_path, .{}); + return dir.fd; + } + + pub const INVALID_FILE = os.windows.INVALID_HANDLE_VALUE; + + fn open_file_handle(relative_path: []const u8, method: enum { create, open }) !os.fd_t { + const path_w = try os.windows.sliceToPrefixedFileW(relative_path); + + // FILE_CREATE = O_CREAT | O_EXCL + var creation_disposition: os.windows.DWORD = 0; + switch (method) { + .create => { + creation_disposition = os.windows.FILE_CREATE; + log.info("creating \"{s}\"...", .{relative_path}); + }, + .open => { + creation_disposition = os.windows.OPEN_EXISTING; + log.info("opening \"{s}\"...", .{relative_path}); + }, + } + + // O_EXCL + var shared_mode: os.windows.DWORD = 0; + + // O_RDWR + var access_mask: os.windows.DWORD = 0; + access_mask |= os.windows.GENERIC_READ; + access_mask |= os.windows.GENERIC_WRITE; + + // O_DIRECT | O_DSYNC + var attributes: os.windows.DWORD = 0; + attributes |= os.windows.FILE_FLAG_NO_BUFFERING; + attributes |= os.windows.FILE_FLAG_WRITE_THROUGH; + + // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file: + assert((attributes & os.windows.FILE_FLAG_WRITE_THROUGH) > 0); + + // TODO: Add ReadFileEx/WriteFileEx support. + // Not currently needed for O_DIRECT disk IO. + // attributes |= os.windows.FILE_FLAG_OVERLAPPED; + + const handle = os.windows.kernel32.CreateFileW( + path_w.span(), + access_mask, + shared_mode, + null, // no security attributes required + creation_disposition, + attributes, + null, // no existing template file + ); + + if (handle == os.windows.INVALID_HANDLE_VALUE) { + return switch (os.windows.kernel32.GetLastError()) { + .FILE_NOT_FOUND => error.FileNotFound, + .SHARING_VIOLATION, .ACCESS_DENIED => error.AccessDenied, + else => |err| { + log.warn("CreateFileW(): {}", .{err}); + return os.windows.unexpectedError(err); + }, + }; + } + + return handle; + } + + /// Opens or creates a journal file: + /// - For reading and writing. + /// - For Direct I/O (required on windows). + /// - Obtains an advisory exclusive lock to the file descriptor. + /// - Allocates the file contiguously on disk if this is supported by the file system. + /// - Ensures that the file data is durable on disk. + /// The caller is responsible for ensuring that the parent directory inode is durable. + /// - Verifies that the file size matches the expected file size before returning. + pub fn open_file( + dir_handle: os.fd_t, + relative_path: []const u8, + size: u64, + method: enum { create, create_or_open, open }, + ) !os.fd_t { + assert(relative_path.len > 0); + assert(size % sector_size == 0); + + const handle = switch (method) { + .open => try open_file_handle(relative_path, .open), + .create => try open_file_handle(relative_path, .create), + .create_or_open => open_file_handle(relative_path, .open) catch |err| switch (err) { + error.FileNotFound => try open_file_handle(relative_path, .create), + else => return err, + }, + }; + errdefer os.windows.CloseHandle(handle); + + // Obtain an advisory exclusive lock + // even when we haven't given shared access to other processes. 
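+        // (shared_mode == 0 above already denies other opens of this handle; the
+        // explicit lock below mirrors the advisory flock() used by the Linux
+        // backend, so both platforms fail the same way on a held data file.)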
+ fs_lock(handle, size) catch |err| switch (err) { + error.WouldBlock => @panic("another process holds the data file lock"), + else => return err, + }; + + // Ask the file system to allocate contiguous sectors for the file (if possible): + if (method == .create) { + log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)}); + fs_allocate(handle, size) catch { + log.warn("file system failed to preallocate the file memory", .{}); + log.info("allocating by writing to the last sector of the file instead...", .{}); + + const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size; + + // Handle partial writes where the physical sector is less than a logical sector: + const write_offset = size - sector.len; + var written: usize = 0; + while (written < sector.len) { + written += try os.pwrite(handle, sector[written..], write_offset + written); + } + }; + } + + // The best fsync strategy is always to fsync before reading because this prevents us from + // making decisions on data that was never durably written by a previously crashed process. + // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC. + // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out. + try os.fsync(handle); + + // We cannot fsync the directory handle on Windows. + // We have no way to open a directory with write access. + // + // try os.fsync(dir_handle); + _ = dir_handle; + + const file_size = try os.windows.GetFileSizeEx(handle); + if (file_size < size) @panic("data file inode size was truncated or corrupted"); + + return handle; + } + + fn fs_lock(handle: os.fd_t, size: u64) !void { + // TODO: Look into using SetFileIoOverlappedRange() for better unbuffered async IO perf + // NOTE: Requires SeLockMemoryPrivilege. 
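+        // (LockFileEx() is declared locally below rather than taken from the
+        // standard library; the two flags used give LOCK_EX | LOCK_NB semantics,
+        // matching the advisory flock() on the Linux side.)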
+ + const kernel32 = struct { + const LOCKFILE_EXCLUSIVE_LOCK = 0x2; + const LOCKFILE_FAIL_IMMEDIATELY = 0x1; + + extern "kernel32" fn LockFileEx( + hFile: os.windows.HANDLE, + dwFlags: os.windows.DWORD, + dwReserved: os.windows.DWORD, + nNumberOfBytesToLockLow: os.windows.DWORD, + nNumberOfBytesToLockHigh: os.windows.DWORD, + lpOverlapped: ?*os.windows.OVERLAPPED, + ) callconv(os.windows.WINAPI) os.windows.BOOL; + }; + + // hEvent = null + // Offset & OffsetHigh = 0 + var lock_overlapped = std.mem.zeroes(os.windows.OVERLAPPED); + + // LOCK_EX | LOCK_NB + var lock_flags: os.windows.DWORD = 0; + lock_flags |= kernel32.LOCKFILE_EXCLUSIVE_LOCK; + lock_flags |= kernel32.LOCKFILE_FAIL_IMMEDIATELY; + + const locked = kernel32.LockFileEx( + handle, + lock_flags, + 0, // reserved param is always zero + @as(u32, @truncate(size)), // low bits of size + @as(u32, @truncate(size >> 32)), // high bits of size + &lock_overlapped, + ); + + if (locked == os.windows.FALSE) { + return switch (os.windows.kernel32.GetLastError()) { + .IO_PENDING => error.WouldBlock, + else => |err| os.windows.unexpectedError(err), + }; + } + } + + fn fs_allocate(handle: os.fd_t, size: u64) !void { + // TODO: Look into using SetFileValidData() instead + // NOTE: Requires SE_MANAGE_VOLUME_NAME privilege + + // Move the file pointer to the start + size + const seeked = os.windows.kernel32.SetFilePointerEx( + handle, + @as(i64, @intCast(size)), + null, // no reference to new file pointer + os.windows.FILE_BEGIN, + ); + + if (seeked == os.windows.FALSE) { + return switch (os.windows.kernel32.GetLastError()) { + .INVALID_HANDLE => unreachable, + .INVALID_PARAMETER => unreachable, + else => |err| os.windows.unexpectedError(err), + }; + } + + // Mark the moved file pointer (start + size) as the physical EOF. + const allocated = os.windows.kernel32.SetEndOfFile(handle); + if (allocated == os.windows.FALSE) { + const err = os.windows.kernel32.GetLastError(); + return os.windows.unexpectedError(err); + } + } +}; + +// TODO: use os.getsockoptError when fixed for windows in stdlib +fn getsockoptError(socket: os.socket_t) IO.ConnectError!void { + var err_code: u32 = undefined; + var size: i32 = @sizeOf(u32); + const rc = os.windows.ws2_32.getsockopt( + socket, + os.SOL.SOCKET, + os.SO.ERROR, + std.mem.asBytes(&err_code), + &size, + ); + + if (rc != 0) { + switch (os.windows.ws2_32.WSAGetLastError()) { + .WSAENETDOWN => return error.NetworkUnreachable, + .WSANOTINITIALISED => unreachable, // WSAStartup() was never called + .WSAEFAULT => unreachable, // The address pointed to by optval or optlen is not in a valid part of the process address space. + .WSAEINVAL => unreachable, // The level parameter is unknown or invalid + .WSAENOPROTOOPT => unreachable, // The option is unknown at the level indicated. 
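+            // (This switch classifies failures of the getsockopt() call itself;
+            // the socket's own pending error, if any, is decoded from err_code
+            // after the switch.)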
+ .WSAENOTSOCK => return error.FileDescriptorNotASocket, + else => |err| return os.windows.unexpectedWSAError(err), + } + } + + assert(size == 4); + if (err_code == 0) + return; + + const ws_err = @as(os.windows.ws2_32.WinsockError, @enumFromInt(@as(u16, @intCast(err_code)))); + return switch (ws_err) { + .WSAEACCES => error.PermissionDenied, + .WSAEADDRINUSE => error.AddressInUse, + .WSAEADDRNOTAVAIL => error.AddressNotAvailable, + .WSAEAFNOSUPPORT => error.AddressFamilyNotSupported, + .WSAEALREADY => error.ConnectionPending, + .WSAEBADF => unreachable, + .WSAECONNREFUSED => error.ConnectionRefused, + .WSAEFAULT => unreachable, + .WSAEISCONN => unreachable, // error.AlreadyConnected, + .WSAENETUNREACH => error.NetworkUnreachable, + .WSAENOTSOCK => error.FileDescriptorNotASocket, + .WSAEPROTOTYPE => unreachable, + .WSAETIMEDOUT => error.ConnectionTimedOut, + .WSAECONNRESET => error.ConnectionResetByPeer, + else => |e| os.windows.unexpectedWSAError(e), + }; +} diff --git a/src/low_level_hash_vectors.zig b/src/low_level_hash_vectors.zig new file mode 100644 index 0000000..9d15e46 --- /dev/null +++ b/src/low_level_hash_vectors.zig @@ -0,0 +1,142 @@ +//! Test vectors for `stdx.inline_hash` from +//! +//! + +pub const Case = struct { seed: u64, hash: u64, b64: []const u8 }; + +pub const cases = [_]Case{ + .{ .seed = 0xec42b7ab404b8acb, .hash = 0xe5a40d39ab796423, .b64 = "" }, + .{ .seed = 0, .hash = 0x1766974bf7527d81, .b64 = "ICAg" }, + .{ .seed = 0, .hash = 0x5c3bbbe230db17a8, .b64 = "YWFhYQ==" }, + .{ .seed = 0, .hash = 0xa6630143a7e6aa6f, .b64 = "AQID" }, + .{ .seed = 0, .hash = 0x8787cb2d04b0c984, .b64 = "AQIDBA==" }, + .{ .seed = 0, .hash = 0x33603654ff574ac2, .b64 = "dGhpcmRfcGFydHl8d3loYXNofDY0" }, + .{ .seed = 0xeeee074043a3ee0f, .hash = 0xa6564b468248c683, .b64 = "Zw==" }, + .{ .seed = 0x857902089c393de, .hash = 0xef192f401b116e1c, .b64 = "xmk=" }, + .{ .seed = 0x993df040024ca3af, .hash = 0xbe8dc0c54617639d, .b64 = "c1H/" }, + .{ .seed = 0xc4e4c2acea740e96, .hash = 0x93d7f665b5521c8e, .b64 = "SuwpzQ==" }, + .{ .seed = 0x6a214b3db872d0cf, .hash = 0x646d70bb42445f28, .b64 = "uqvy++M=" }, + .{ .seed = 0x44343db6a89dba4d, .hash = 0x96a7b1e3cc9bd426, .b64 = "RnzCVPgb" }, + .{ .seed = 0x77b5d6d1ae1dd483, .hash = 0x76020289ab0790c4, .b64 = "6OeNdlouYw==" }, + .{ .seed = 0x89ab8ecb44d221f1, .hash = 0x39f842e4133b9b44, .b64 = "M5/JmmYyDbc=" }, + .{ .seed = 0x60244b17577ca81b, .hash = 0x2b8d7047be4bcaab, .b64 = "MVijWiVdBRdY" }, + .{ .seed = 0x59a08dcee0717067, .hash = 0x99628abef6716a97, .b64 = "6V7Uq7LNxpu0VA==" }, + .{ .seed = 0xf5f20db3ade57396, .hash = 0x4432e02ba42b2740, .b64 = "EQ6CdEEhPdyHcOk=" }, + .{ .seed = 0xbf8dee0751ad3efb, .hash = 0x74d810efcad7918a, .b64 = "PqFB4fxnPgF+l+rc" }, + .{ .seed = 0x6b7a06b268d63e30, .hash = 0x88c84e986002507f, .b64 = "a5aPOFwq7LA7+zKvPA==" }, + .{ .seed = 0xb8c37f0ae0f54c82, .hash = 0x4f99acf193cf39b9, .b64 = "VOwY21wCGv5D+/qqOvs=" }, + .{ .seed = 0x9fcbed0c38e50eef, .hash = 0xd90e7a3655891e37, .b64 = "KdHmBTx8lHXYvmGJ+Vy7" }, + .{ .seed = 0x2af4bade1d8e3a1d, .hash = 0x3bb378b1d4df8fcf, .b64 = "qJkPlbHr8bMF7/cA6aE65Q==" }, + .{ .seed = 0x714e3aa912da2f2c, .hash = 0xf78e94045c052d47, .b64 = "ygvL0EhHZL0fIx6oHHtkxRQ=" }, + .{ .seed = 0xf5ee75e3cbb82c1c, .hash = 0x26da0b2130da6b40, .b64 = "c1rFXkt5YztwZCQRngncqtSs" }, + .{ .seed = 0x620e7007321b93b9, .hash = 0x30b4d426af8c6986, .b64 = "8hsQrzszzeNQSEcVXLtvIhm6mw==" }, + .{ .seed = 0xc08528cac2e551fc, .hash = 0x5413b4aaf3baaeae, .b64 = "ffUL4RocfyP4KfikGxO1yk7omDI=" }, + .{ .seed = 0x6a1debf9cc3ad39, 
.hash = 0x756ab265370a1597, .b64 = "OOB5TT00vF9Od/rLbAWshiErqhpV" }, + .{ .seed = 0x7e0a3c88111fc226, .hash = 0xdaf5f4b7d09814fb, .b64 = "or5wtXM7BFzTNpSzr+Lw5J5PMhVJ/Q==" }, + .{ .seed = 0x1301fef15df39edb, .hash = 0x8f874ae37742b75e, .b64 = "gk6pCHDUsoopVEiaCrzVDhioRKxb844=" }, + .{ .seed = 0x64e181f3d5817ab, .hash = 0x8fecd03956121ce8, .b64 = "TNctmwlC5QbEM6/No4R/La3UdkfeMhzs" }, + .{ .seed = 0xafafc44961078ecb, .hash = 0x229c292ea7a08285, .b64 = "SsQw9iAjhWz7sgcE9OwLuSC6hsM+BfHs2Q==" }, + .{ .seed = 0x4f7bb45549250094, .hash = 0xbb4bf0692d14bae, .b64 = "ZzO3mVCj4xTT2TT3XqDyEKj2BZQBvrS8RHg=" }, + .{ .seed = 0xa30061abaa2818c, .hash = 0x207b24ca3bdac1db, .b64 = "+klp5iPQGtppan5MflEls0iEUzqU+zGZkDJX" }, + .{ .seed = 0xd902ee3e44a5705f, .hash = 0x64f6cd6745d3825b, .b64 = "RO6bvOnlJc8I9eniXlNgqtKy0IX6VNg16NRmgg==" }, + .{ .seed = 0x316d36da516f583, .hash = 0xa2b2e1656b58df1e, .b64 = "ZJjZqId1ZXBaij9igClE3nyliU5XWdNRrayGlYA=" }, + .{ .seed = 0x402d83f9f834f616, .hash = 0xd01d30d9ee7a148, .b64 = "7BfkhfGMDGbxfMB8uyL85GbaYQtjr2K8g7RpLzr/" }, + .{ .seed = 0x9c604164c016b72c, .hash = 0x1cb4cd00ab804e3b, .b64 = "rycWk6wHH7htETQtje9PidS2YzXBx+Qkg2fY7ZYS7A==" }, + .{ .seed = 0x3f4507e01f9e73ba, .hash = 0x4697f2637fd90999, .b64 = "RTkC2OUK+J13CdGllsH0H5WqgspsSa6QzRZouqx6pvI=" }, + .{ .seed = 0xc3fe0d5be8d2c7c7, .hash = 0x8383a756b5688c07, .b64 = "tKjKmbLCNyrLCM9hycOAXm4DKNpM12oZ7dLTmUx5iwAi" }, + .{ .seed = 0x531858a40bfa7ea1, .hash = 0x695c29cb3696a975, .b64 = "VprUGNH+5NnNRaORxgH/ySrZFQFDL+4VAodhfBNinmn8cg==" }, + .{ .seed = 0x86689478a7a7e8fa, .hash = 0xda2e5a5a5e971521, .b64 = "gc1xZaY+q0nPcUvOOnWnT3bqfmT/geth/f7Dm2e/DemMfk4=" }, + .{ .seed = 0x4ec948b8e7f27288, .hash = 0x7935d4befa056b2b, .b64 = "Mr35fIxqx1ukPAL0su1yFuzzAU3wABCLZ8+ZUFsXn47UmAph" }, + .{ .seed = 0xce46c7213c10032, .hash = 0x38dd541ca95420fe, .b64 = "A9G8pw2+m7+rDtWYAdbl8tb2fT7FFo4hLi2vAsa5Y8mKH3CX3g==" }, + .{ .seed = 0xf63e96ee6f32a8b6, .hash = 0xcc06c7a4963f967f, .b64 = "DFaJGishGwEHDdj9ixbCoaTjz9KS0phLNWHVVdFsM93CvPft3hM=" }, + .{ .seed = 0x1cfe85e65fc5225, .hash = 0xbf0f6f66e232fb20, .b64 = "7+Ugx+Kr3aRNgYgcUxru62YkTDt5Hqis+2po81hGBkcrJg4N0uuy" }, + .{ .seed = 0x45c474f1cee1d2e8, .hash = 0xf7efb32d373fe71a, .b64 = "H2w6O8BUKqu6Tvj2xxaecxEI2wRgIgqnTTG1WwOgDSINR13Nm4d4Vg==" }, + .{ .seed = 0x6e024e14015f329c, .hash = 0xe2e64634b1c12660, .b64 = "1XBMnIbqD5jy65xTDaf6WtiwtdtQwv1dCVoqpeKj+7cTR1SaMWMyI04=" }, + .{ .seed = 0x760c40502103ae1c, .hash = 0x285b8fd1638e306d, .b64 = "znZbdXG2TSFrKHEuJc83gPncYpzXGbAebUpP0XxzH0rpe8BaMQ17nDbt" }, + .{ .seed = 0x17fd05c3c560c320, .hash = 0x658e8a4e3b714d6c, .b64 = "ylu8Atu13j1StlcC1MRMJJXIl7USgDDS22HgVv0WQ8hx/8pNtaiKB17hCQ==" }, + .{ .seed = 0x8b34200a6f8e90d9, .hash = 0xf391fb968e0eb398, .b64 = "M6ZVVzsd7vAvbiACSYHioH/440dp4xG2mLlBnxgiqEvI/aIEGpD0Sf4VS0g=" }, + .{ .seed = 0x6be89e50818bdf69, .hash = 0x744a9ea0cc144bf2, .b64 = "li3oFSXLXI+ubUVGJ4blP6mNinGKLHWkvGruun85AhVn6iuMtocbZPVhqxzn" }, + .{ .seed = 0xfb389773315b47d8, .hash = 0x12636f2be11012f1, .b64 = "kFuQHuUCqBF3Tc3hO4dgdIp223ShaCoog48d5Do5zMqUXOh5XpGK1t5XtxnfGA==" }, + .{ .seed = 0x4f2512a23f61efee, .hash = 0x29c57de825948f80, .b64 = "jWmOad0v0QhXVJd1OdGuBZtDYYS8wBVHlvOeTQx9ZZnm8wLEItPMeihj72E0nWY=" }, + .{ .seed = 0x59ccd92fc16c6fda, .hash = 0x58c6f99ab0d1c021, .b64 = "z+DHU52HaOQdW4JrZwDQAebEA6rm13Zg/9lPYA3txt3NjTBqFZlOMvTRnVzRbl23" }, + .{ .seed = 0x25c5a7f5bd330919, .hash = 0x13e7b5a7b82fe3bb, .b64 = "MmBiGDfYeTayyJa/tVycg+rN7f9mPDFaDc+23j0TlW9094er0ADigsl4QX7V3gG/qw==" }, + .{ .seed = 0x51df4174d34c97d7, .hash = 
0x10fbc87901e02b63, .b64 = "774RK+9rOL4iFvs1q2qpo/JVc/I39buvNjqEFDtDvyoB0FXxPI2vXqOrk08VPfIHkmU=" }, + .{ .seed = 0x80ce6d76f89cb57, .hash = 0xa24c9184901b748b, .b64 = "+slatXiQ7/2lK0BkVUI1qzNxOOLP3I1iK6OfHaoxgqT63FpzbElwEXSwdsryq3UlHK0I" }, + .{ .seed = 0x20961c911965f684, .hash = 0xcac4fd4c5080e581, .b64 = "64mVTbQ47dHjHlOHGS/hjJwr/K2frCNpn87exOqMzNUVYiPKmhCbfS7vBUce5tO6Ec9osQ==" }, + .{ .seed = 0x4e5b926ec83868e7, .hash = 0xc38bdb7483ba68e1, .b64 = "fIsaG1r530SFrBqaDj1kqE0AJnvvK8MNEZbII2Yw1OK77v0V59xabIh0B5axaz/+a2V5WpA=" }, + .{ .seed = 0x3927b30b922eecef, .hash = 0xdb2a8069b2ceaffa, .b64 = "PGih0zDEOWCYGxuHGDFu9Ivbff/iE7BNUq65tycTR2R76TerrXALRosnzaNYO5fjFhTi+CiS" }, + .{ .seed = 0xbd0291284a49b61c, .hash = 0xdf9fe91d0d1c7887, .b64 = "RnpA/zJnEnnLjmICORByRVb9bCOgxF44p3VMiW10G7PvW7IhwsWajlP9kIwNA9FjAD2GoQHk2Q==" }, + .{ .seed = 0x73a77c575bcc956, .hash = 0xe83f49e96e2e6a08, .b64 = "qFklMceaTHqJpy2qavJE+EVBiNFOi6OxjOA3LeIcBop1K7w8xQi3TrDk+BrWPRIbfprszSaPfrI=" }, + .{ .seed = 0x766a0e2ade6d09a6, .hash = 0xc69e61b62ca2b62, .b64 = "cLbfUtLl3EcQmITWoTskUR8da/VafRDYF/ylPYwk7/zazk6ssyrzxMN3mmSyvrXR2yDGNZ3WDrTT" }, + .{ .seed = 0x2599f4f905115869, .hash = 0xb4a4f3f85f8298fe, .b64 = "s/Jf1+FbsbCpXWPTUSeWyMH6e4CvTFvPE5Fs6Z8hvFITGyr0dtukHzkI84oviVLxhM1xMxrMAy1dbw==" }, + .{ .seed = 0xd8256e5444d21e53, .hash = 0x167a1b39e1e95f41, .b64 = "FvyQ00+j7nmYZVQ8hI1Edxd0AWplhTfWuFGiu34AK5X8u2hLX1bE97sZM0CmeLe+7LgoUT1fJ/axybE=" }, + .{ .seed = 0xf664a91333fb8dfd, .hash = 0xf8a2a5649855ee41, .b64 = "L8ncxMaYLBH3g9buPu8hfpWZNlOF7nvWLNv9IozH07uQsIBWSKxoPy8+LW4tTuzC6CIWbRGRRD1sQV/4" }, + .{ .seed = 0x9625b859be372cd1, .hash = 0x27992565b595c498, .b64 = "CDK0meI07yrgV2kQlZZ+wuVqhc2NmzqeLH7bmcA6kchsRWFPeVF5Wqjjaj556ABeUoUr3yBmfU3kWOakkg==" }, + .{ .seed = 0x7b99940782e29898, .hash = 0x3e08cca5b71f9346, .b64 = "d23/vc5ONh/HkMiq+gYk4gaCNYyuFKwUkvn46t+dfVcKfBTYykr4kdvAPNXGYLjM4u1YkAEFpJP+nX7eOvs=" }, + .{ .seed = 0x4fe12fa5383b51a8, .hash = 0xad406b10c770a6d2, .b64 = "NUR3SRxBkxTSbtQORJpu/GdR6b/h6sSGfsMj/KFd99ahbh+9r7LSgSGmkGVB/mGoT0pnMTQst7Lv2q6QN6Vm" }, + .{ .seed = 0xe2ccb09ac0f5b4b6, .hash = 0xd1713ce6e552bcf2, .b64 = "2BOFlcI3Z0RYDtS9T9Ie9yJoXlOdigpPeeT+CRujb/O39Ih5LPC9hP6RQk1kYESGyaLZZi3jtabHs7DiVx/VDg==" }, + .{ .seed = 0x7d0a37adbd7b753b, .hash = 0x753b287194c73ad3, .b64 = "FF2HQE1FxEvWBpg6Z9zAMH+Zlqx8S1JD/wIlViL6ZDZY63alMDrxB0GJQahmAtjlm26RGLnjW7jmgQ4Ie3I+014=" }, + .{ .seed = 0xd3ae96ef9f7185f2, .hash = 0x5ae41a95f600af1c, .b64 = "tHmO7mqVL/PX11nZrz50Hc+M17Poj5lpnqHkEN+4bpMx/YGbkrGOaYjoQjgmt1X2QyypK7xClFrjeWrCMdlVYtbW" }, + .{ .seed = 0x4fb88ea63f79a0d8, .hash = 0x4a61163b86a8bb4c, .b64 = "/WiHi9IQcxRImsudkA/KOTqGe8/gXkhKIHkjddv5S9hi02M049dIK3EUyAEjkjpdGLUs+BN0QzPtZqjIYPOgwsYE9g==" }, + .{ .seed = 0xed564e259bb5ebe9, .hash = 0x42eeaa79e760c7e4, .b64 = "qds+1ExSnU11L4fTSDz/QE90g4Jh6ioqSh3KDOTOAo2pQGL1k/9CCC7J23YF27dUTzrWsCQA2m4epXoCc3yPHb3xElA=" }, + .{ .seed = 0x3e3256b60c428000, .hash = 0x698df622ef465b0a, .b64 = "8FVYHx40lSQPTHheh08Oq0/pGm2OlG8BEf8ezvAxHuGGdgCkqpXIueJBF2mQJhTfDy5NncO8ntS7vaKs7sCNdDaNGOEi" }, + .{ .seed = 0xfb05bad59ec8705, .hash = 0x157583111e1a6026, .b64 = "4ZoEIrJtstiCkeew3oRzmyJHVt/pAs2pj0HgHFrBPztbQ10NsQ/lM6DM439QVxpznnBSiHMgMQJhER+70l72LqFTO1JiIQ==" }, + .{ .seed = 0xafdc251dbf97b5f8, .hash = 0xaa1388f078e793e0, .b64 = "hQPtaYI+wJyxXgwD5n8jGIKFKaFA/P83KqCKZfPthnjwdOFysqEOYwAaZuaaiv4cDyi9TyS8hk5cEbNP/jrI7q6pYGBLbsM=" }, + .{ .seed = 0x10ec9c92ddb5dcbc, .hash = 0xf10d68d0f3309360, .b64 = "S4gpMSKzMD7CWPsSfLeYyhSpfWOntyuVZdX1xSBjiGvsspwOZcxNKCRIOqAA0moUfOh3I5+juQV4rsqYElMD/gWfDGpsWZKQ" 
}, + .{ .seed = 0x9a767d5822c7dac4, .hash = 0x2af056184457a3de, .b64 = "oswxop+bthuDLT4j0PcoSKby4LhF47ZKg8K17xxHf74UsGCzTBbOz0MM8hQEGlyqDT1iUiAYnaPaUpL2mRK0rcIUYA4qLt5uOw==" }, + .{ .seed = 0xee46254080d6e2db, .hash = 0x6d0058e1590b2489, .b64 = "0II/697p+BtLSjxj5989OXI004TogEb94VUnDzOVSgMXie72cuYRvTFNIBgtXlKfkiUjeqVpd4a+n5bxNOD1TGrjQtzKU5r7obo=" }, + .{ .seed = 0xbbb669588d8bf398, .hash = 0x638f287f68817f12, .b64 = "E84YZW2qipAlMPmctrg7TKlwLZ68l4L+c0xRDUfyyFrA4MAti0q9sHq3TDFviH0Y+Kq3tEE5srWFA8LM9oomtmvm5PYxoaarWPLc" }, + .{ .seed = 0xdc2afaa529beef44, .hash = 0xc46b71fecefd5467, .b64 = "x3pa4HIElyZG0Nj7Vdy9IdJIR4izLmypXw5PCmZB5y68QQ4uRaVVi3UthsoJROvbjDJkP2DQ6L/eN8pFeLFzNPKBYzcmuMOb5Ull7w==" }, + .{ .seed = 0xf1f67391d45013a8, .hash = 0x2c8e94679d964e0a, .b64 = "jVDKGYIuWOP/QKLdd2wi8B2VJA8Wh0c8PwrXJVM8FOGM3voPDVPyDJOU6QsBDPseoR8uuKd19OZ/zAvSCB+zlf6upAsBlheUKgCfKww=" }, + .{ .seed = 0x16fce2b8c65a3429, .hash = 0x8612b797ce22503a, .b64 = "mkquunhmYe1aR2wmUz4vcvLEcKBoe6H+kjUok9VUn2+eTSkWs4oDDtJvNCWtY5efJwg/j4PgjRYWtqnrCkhaqJaEvkkOwVfgMIwF3e+d" }, + .{ .seed = 0xf4b096699f49fe67, .hash = 0x59f929babfba7170, .b64 = "fRelvKYonTQ+s+rnnvQw+JzGfFoPixtna0vzcSjiDqX5s2Kg2//UGrK+AVCyMUhO98WoB1DDbrsOYSw2QzrcPe0+3ck9sePvb+Q/IRaHbw==" }, + .{ .seed = 0xca584c4bc8198682, .hash = 0x9527556923fb49a0, .b64 = "DUwXFJzagljo44QeJ7/6ZKw4QXV18lhkYT2jglMr8WB3CHUU4vdsytvw6AKv42ZcG6fRkZkq9fpnmXy6xG0aO3WPT1eHuyFirAlkW+zKtwg=" }, + .{ .seed = 0xed269fc3818b6aad, .hash = 0x1039ab644f5e150b, .b64 = "cYmZCrOOBBongNTr7e4nYn52uQUy2mfe48s50JXx2AZ6cRAt/xRHJ5QbEoEJOeOHsJyM4nbzwFm++SlT6gFZZHJpkXJ92JkR86uS/eV1hJUR" }, + .{ .seed = 0x33f253cbb8fe66a8, .hash = 0x7816c83f3aa05e6d, .b64 = "EXeHBDfhwzAKFhsMcH9+2RHwV+mJaN01+9oacF6vgm8mCXRd6jeN9U2oAb0of5c5cO4i+Vb/LlHZSMI490SnHU0bejhSCC2gsC5d2K30ER3iNA==" }, + .{ .seed = 0xd0b76b2c1523d99c, .hash = 0xf51d2f564518c619, .b64 = "FzkzRYoNjkxFhZDso94IHRZaJUP61nFYrh5MwDwv9FNoJ5jyNCY/eazPZk+tbmzDyJIGw2h3GxaWZ9bSlsol/vK98SbkMKCQ/wbfrXRLcDzdd/8=" }, + .{ .seed = 0xfd28f0811a2a237f, .hash = 0x67d494cff03ac004, .b64 = "Re4aXISCMlYY/XsX7zkIFR04ta03u4zkL9dVbLXMa/q6hlY/CImVIIYRN3VKP4pnd0AUr/ugkyt36JcstAInb4h9rpAGQ7GMVOgBniiMBZ/MGU7H" }, + .{ .seed = 0x6261fb136482e84, .hash = 0x2802d636ced1cfbb, .b64 = "ueLyMcqJXX+MhO4UApylCN9WlTQ+ltJmItgG7vFUtqs2qNwBMjmAvr5u0sAKd8jpzV0dDPTwchbIeAW5zbtkA2NABJV6hFM48ib4/J3A5mseA3cS8w==" }, + .{ .seed = 0x458efc750bca7c3a, .hash = 0xf64e20bad771cb12, .b64 = "6Si7Yi11L+jZMkwaN+GUuzXMrlvEqviEkGOilNq0h8TdQyYKuFXzkYc/q74gP3pVCyiwz9KpVGMM9vfnq36riMHRknkmhQutxLZs5fbmOgEO69HglCU=" }, + .{ .seed = 0xa7e69ff84e5e7c27, .hash = 0xb9a6cf84a83e15e, .b64 = "Q6AbOofGuTJOegPh9Clm/9crtUMQqylKrTc1fhfJo1tqvpXxhU4k08kntL1RG7woRnFrVh2UoMrL1kjin+s9CanT+y4hHwLqRranl9FjvxfVKm3yvg68" }, + .{ .seed = 0x3c59bfd0c29efe9e, .hash = 0x8da6630319609301, .b64 = "ieQEbIPvqY2YfIjHnqfJiO1/MIVRk0RoaG/WWi3kFrfIGiNLCczYoklgaecHMm/1sZ96AjO+a5stQfZbJQwS7Sc1ODABEdJKcTsxeW2hbh9A6CFzpowP1A==" }, + .{ .seed = 0x10befacc6afd298d, .hash = 0x40946a86e2a996f3, .b64 = "zQUv8hFB3zh2GGl3KTvCmnfzE+SUgQPVaSVIELFX5H9cE3FuVFGmymkPQZJLAyzC90Cmi8GqYCvPqTuAAB//XTJxy4bCcVArgZG9zJXpjowpNBfr3ngWrSE=" }, + .{ .seed = 0x41d5320b0a38efa7, .hash = 0xcab7f5997953fa76, .b64 = "US4hcC1+op5JKGC7eIs8CUgInjKWKlvKQkapulxW262E/B2ye79QxOexf188u2mFwwe3WTISJHRZzS61IwljqAWAWoBAqkUnW8SHmIDwHUP31J0p5sGdP47L" }, + .{ .seed = 0x58db1c7450fe17f3, .hash = 0x39129ca0e04fc465, .b64 = "9bHUWFna2LNaGF6fQLlkx1Hkt24nrkLE2CmFdWgTQV3FFbUe747SSqYw6ebpTa07MWSpWRPsHesVo2B9tqHbe7eQmqYebPDFnNqrhSdZwFm9arLQVs+7a3Ic6A==" }, + .{ .seed = 0x6098c055a335b7a6, .hash = 
0x5238221fd685e1b8, .b64 = "Kb3DpHRUPhtyqgs3RuXjzA08jGb59hjKTOeFt1qhoINfYyfTt2buKhD6YVffRCPsgK9SeqZqRPJSyaqsa0ovyq1WnWW8jI/NhvAkZTVHUrX2pC+cD3OPYT05Dag=" }, + .{ .seed = 0x1bbacec67845a801, .hash = 0x175130c407dbcaab, .b64 = "gzxyMJIPlU+bJBwhFUCHSofZ/319LxqMoqnt3+L6h2U2+ZXJCSsYpE80xmR0Ta77Jq54o92SMH87HV8dGOaCTuAYF+lDL42SY1P316Cl0sZTS2ow3ZqwGbcPNs/1" }, + .{ .seed = 0xc419cfc7442190, .hash = 0x2f20e7536c0b0df, .b64 = "uR7V0TW+FGVMpsifnaBAQ3IGlr1wx5sKd7TChuqRe6OvUXTlD4hKWy8S+8yyOw8lQabism19vOQxfmocEOW/vzY0pEa87qHrAZy4s9fH2Bltu8vaOIe+agYohhYORQ==" }, + .{ .seed = 0xc95e510d94ba270c, .hash = 0x2742cb488a04ad56, .b64 = "1UR5eoo2aCwhacjZHaCh9bkOsITp6QunUxHQ2SfeHv0imHetzt/Z70mhyWZBalv6eAx+YfWKCUib2SHDtz/A2dc3hqUWX5VfAV7FQsghPUAtu6IiRatq4YSLpDvKZBQ=" }, + .{ .seed = 0xff1ae05c98089c3f, .hash = 0xd6afb593879ff93b, .b64 = "opubR7H63BH7OtY+Avd7QyQ25UZ8kLBdFDsBTwZlY6gA/u+x+czC9AaZMgmQrUy15DH7YMGsvdXnviTtI4eVI4aF1H9Rl3NXMKZgwFOsdTfdcZeeHVRzBBKX8jUfh1il" }, + .{ .seed = 0x90c02b8dceced493, .hash = 0xf50ad64caac0ca7f, .b64 = "DC0kXcSXtfQ9FbSRwirIn5tgPri0sbzHSa78aDZVDUKCMaBGyFU6BmrulywYX8yzvwprdLsoOwTWN2wMjHlPDqrvVHNEjnmufRDblW+nSS+xtKNs3N5xsxXdv6JXDrAB/Q==" }, + .{ .seed = 0x9f8a76697ab1aa36, .hash = 0x2ade95c4261364ae, .b64 = "BXRBk+3wEP3Lpm1y75wjoz+PgB0AMzLe8tQ1AYU2/oqrQB2YMC6W+9QDbcOfkGbeH+b7IBkt/gwCMw2HaQsRFEsurXtcQ3YwRuPz5XNaw5NAvrNa67Fm7eRzdE1+hWLKtA8=" }, + .{ .seed = 0x6ba1bf3d811a531d, .hash = 0x5c4f3299faacd07a, .b64 = "RRBSvEGYnzR9E45Aps/+WSnpCo/X7gJLO4DRnUqFrJCV/kzWlusLE/6ZU6RoUf2ROwcgEvUiXTGjLs7ts3t9SXnJHxC1KiOzxHdYLMhVvgNd3hVSAXODpKFSkVXND55G2L1W" }, + .{ .seed = 0x6a418974109c67b4, .hash = 0xfffe3bff0ae5e9bc, .b64 = "jeh6Qazxmdi57pa9S3XSnnZFIRrnc6s8QLrah5OX3SB/V2ErSPoEAumavzQPkdKF1/SfvmdL+qgF1C+Yawy562QaFqwVGq7+tW0yxP8FStb56ZRgNI4IOmI30s1Ei7iops9Uuw==" }, + .{ .seed = 0x8472f1c2b3d230a3, .hash = 0x1db785c0005166e4, .b64 = "6QO5nnDrY2/wrUXpltlKy2dSBcmK15fOY092CR7KxAjNfaY+aAmtWbbzQk3MjBg03x39afSUN1fkrWACdyQKRaGxgwq6MGNxI6W+8DLWJBHzIXrntrE/ml6fnNXEpxplWJ1vEs4=" }, + .{ .seed = 0x5e06068f884e73a7, .hash = 0xea000d962ad18418, .b64 = "0oPxeEHhqhcFuwonNfLd5jF3RNATGZS6NPoS0WklnzyokbTqcl4BeBkMn07+fDQv83j/BpGUwcWO05f3+DYzocfnizpFjLJemFGsls3gxcBYxcbqWYev51tG3lN9EvRE+X9+Pwww" }, + .{ .seed = 0x55290b1a8f170f59, .hash = 0xe42aef38359362d9, .b64 = "naSBSjtOKgAOg8XVbR5cHAW3Y+QL4Pb/JO9/oy6L08wvVRZqo0BrssMwhzBP401Um7A4ppAupbQeJFdMrysY34AuSSNvtNUy5VxjNECwiNtgwYHw7yakDUv8WvonctmnoSPKENegQg==" }, + .{ .seed = 0x5501cfd83dfe706a, .hash = 0xc8e95657348a3891, .b64 = "vPyl8DxVeRe1OpilKb9KNwpGkQRtA94UpAHetNh+95V7nIW38v7PpzhnTWIml5kw3So1Si0TXtIUPIbsu32BNhoH7QwFvLM+JACgSpc5e3RjsL6Qwxxi11npwxRmRUqATDeMUfRAjxg=" }, + .{ .seed = 0xe43ed13d13a66990, .hash = 0xc162eca864f238c6, .b64 = "QC9i2GjdTMuNC1xQJ74ngKfrlA4w3o58FhvNCltdIpuMhHP1YsDA78scQPLbZ3OCUgeQguYf/vw6zAaVKSgwtaykqg5ka/4vhz4hYqWU5ficdXqClHl+zkWEY26slCNYOM5nnDlly8Cj" }, + .{ .seed = 0xdf43bc375cf5283f, .hash = 0xbe1fb373e20579ad, .b64 = "7CNIgQhAHX27nxI0HeB5oUTnTdgKpRDYDKwRcXfSFGP1XeT9nQF6WKCMjL1tBV6x7KuJ91GZz11F4c+8s+MfqEAEpd4FHzamrMNjGcjCyrVtU6y+7HscMVzr7Q/ODLcPEFztFnwjvCjmHw==" }, + .{ .seed = 0x8112b806d288d7b5, .hash = 0x628a1d4f40aa6ffd, .b64 = "Qa/hC2RPXhANSospe+gUaPfjdK/yhQvfm4cCV6/pdvCYWPv8p1kMtKOX3h5/8oZ31fsmx4Axphu5qXJokuhZKkBUJueuMpxRyXpwSWz2wELx5glxF7CM0Fn+OevnkhUn5jsPlG2r5jYlVn8=" }, + .{ .seed = 0xd52a18abb001cb46, .hash = 0xa87bdb7456340f90, .b64 = "kUw/0z4l3a89jTwN5jpG0SHY5km/IVhTjgM5xCiPRLncg40aqWrJ5vcF891AOq5hEpSq0bUCJUMFXgct7kvnys905HjerV7Vs1Gy84tgVJ70/2+pAZTsB/PzNOE/G6sOj4+GbTzkQu819OLB" }, + .{ .seed = 0xe12b76a2433a1236, .hash = 
0x5960ef3ba982c801, .b64 = "VDdfSDbO8Tdj3T5W0XM3EI7iHh5xpIutiM6dvcJ/fhe23V/srFEkDy5iZf/VnA9kfi2C79ENnFnbOReeuZW1b3MUXB9lgC6U4pOTuC+jHK3Qnpyiqzj7h3ISJSuo2pob7vY6VHZo6Fn7exEqHg==" }, + .{ .seed = 0x175bf7319cf1fa00, .hash = 0x5026586df9a431ec, .b64 = "Ldfvy3ORdquM/R2fIkhH/ONi69mcP1AEJ6n/oropwecAsLJzQSgezSY8bEiEs0VnFTBBsW+RtZY6tDj03fnb3amNUOq1b7jbqyQkL9hpl+2Z2J8IaVSeownWl+bQcsR5/xRktIMckC5AtF4YHfU=" }, + .{ .seed = 0xd63d57b3f67525ae, .hash = 0xfe4b8a20fdf0840b, .b64 = "BrbNpb42+VzZAjJw6QLirXzhweCVRfwlczzZ0VX2xluskwBqyfnGovz5EuX79JJ31VNXa5hTkAyQat3lYKRADTdAdwE5PqM1N7YaMqqsqoAAAeuYVXuk5eWCykYmClNdSspegwgCuT+403JigBzi" }, + .{ .seed = 0x933faea858832b73, .hash = 0xdcb761867da7072f, .b64 = "gB3NGHJJvVcuPyF0ZSvHwnWSIfmaI7La24VMPQVoIIWF7Z74NltPZZpx2f+cocESM+ILzQW9p+BC8x5IWz7N4Str2WLGKMdgmaBfNkEhSHQDU0IJEOnpUt0HmjhFaBlx0/LTmhua+rQ6Wup8ezLwfg==" }, + .{ .seed = 0x53d061e5f8e7c04f, .hash = 0xc10d4653667275b7, .b64 = "hTKHlRxx6Pl4gjG+6ksvvj0CWFicUg3WrPdSJypDpq91LUWRni2KF6+81ZoHBFhEBrCdogKqeK+hy9bLDnx7g6rAFUjtn1+cWzQ2YjiOpz4+ROBB7lnwjyTGWzJD1rXtlso1g2qVH8XJVigC5M9AIxM=" }, + .{ .seed = 0xdb4124556dd515e0, .hash = 0x727720deec13110b, .b64 = "IWQBelSQnhrr0F3BhUpXUIDauhX6f95Qp+A0diFXiUK7irwPG1oqBiqHyK/SH/9S+rln9DlFROAmeFdH0OCJi2tFm4afxYzJTFR4HnR4cG4x12JqHaZLQx6iiu6CE3rtWBVz99oAwCZUOEXIsLU24o2Y" }, + .{ .seed = 0x4fb31a0dd681ee71, .hash = 0x710b009662858dc9, .b64 = "TKo+l+1dOXdLvIrFqeLaHdm0HZnbcdEgOoLVcGRiCbAMR0j5pIFw8D36tefckAS1RCFOH5IgP8yiFT0Gd0a2hI3+fTKA7iK96NekxWeoeqzJyctc6QsoiyBlkZerRxs5RplrxoeNg29kKDTM0K94mnhD9g==" }, + .{ .seed = 0x27cc72eefa138e4c, .hash = 0xfbf8f7a3ecac1eb7, .b64 = "YU4e7G6EfQYvxCFoCrrT0EFgVLHFfOWRTJQJ5gxM3G2b+1kJf9YPrpsxF6Xr6nYtS8reEEbDoZJYqnlk9lXSkVArm88Cqn6d25VCx3+49MqC0trIlXtb7SXUUhwpJK16T0hJUfPH7s5cMZXc6YmmbFuBNPE=" }, + .{ .seed = 0x44bc2dfba4bd3ced, .hash = 0xb6fc4fcd0722e3df, .b64 = "/I/eImMwPo1U6wekNFD1Jxjk9XQVi1D+FPdqcHifYXQuP5aScNQfxMAmaPR2XhuOQhADV5tTVbBKwCDCX4E3jcDNHzCiPvViZF1W27txaf2BbFQdwKrNCmrtzcluBFYu0XZfc7RU1RmxK/RtnF1qHsq/O4pp" }, + .{ .seed = 0x242da1e3a439bed8, .hash = 0x7cb86dcc55104aac, .b64 = "CJTT9WGcY2XykTdo8KodRIA29qsqY0iHzWZRjKHb9alwyJ7RZAE3V5Juv4MY3MeYEr1EPCCMxO7yFXqT8XA8YTjaMp3bafRt17Pw8JC4iKJ1zN+WWKOESrj+3aluGQqn8z1EzqY4PH7rLG575PYeWsP98BugdA==" }, + .{ .seed = 0xdc559c746e35c139, .hash = 0x19e71e9b45c3a51e, .b64 = "ZlhyQwLhXQyIUEnMH/AEW27vh9xrbNKJxpWGtrEmKhd+nFqAfbeNBQjW0SfG1YI0xQkQMHXjuTt4P/EpZRtA47ibZDVS8TtaxwyBjuIDwqcN09eCtpC+Ls+vWDTLmBeDM3u4hmzz4DQAYsLiZYSJcldg9Q3wszw=" }, + .{ .seed = 0xd0b0350275b9989, .hash = 0x51de38573c2bea48, .b64 = "v2KU8y0sCrBghmnm8lzGJlwo6D6ObccAxCf10heoDtYLosk4ztTpLlpSFEyu23MLA1tJkcgRko04h19QMG0mOw/wc93EXAweriBqXfvdaP85sZABwiKO+6rtS9pacRVpYYhHJeVTQ5NzrvBvi1huxAr+xswhVMfL" }, + .{ .seed = 0xb04489e41d17730c, .hash = 0xa73ab6996d6df158, .b64 = "QhKlnIS6BuVCTQsnoE67E/yrgogE8EwO7xLaEGei26m0gEU4OksefJgppDh3X0x0Cs78Dr9IHK5b977CmZlrTRmwhlP8pM+UzXPNRNIZuN3ntOum/QhUWP8SGpirheXENWsXMQ/nxtxakyEtrNkKk471Oov9juP8oQ==" }, + .{ .seed = 0x2217285eb4572156, .hash = 0x55ef2b8c930817b2, .b64 = "/ZRMgnoRt+Uo6fUPr9FqQvKX7syhgVqWu+WUSsiQ68UlN0efSP6Eced5gJZL6tg9gcYJIkhjuQNITU0Q3TjVAnAcobgbJikCn6qZ6pRxKBY4MTiAlfGD3T7R7hwJwx554MAy++Zb/YUFlnCaCJiwQMnowF7aQzwYFCo=" }, + .{ .seed = 0x12c2e8e68aede73b, .hash = 0xb2850bf5fae87157, .b64 = "NB7tU5fNE8nI+SXGfipc7sRkhnSkUF1krjeo6k+8FITaAtdyz+o7mONgXmGLulBPH9bEwyYhKNVY0L+njNQrZ9YC2aXsFD3PdZsxAFaBT3VXEzh+NGBTjDASNL3mXyS8Yv1iThGfHoY7T4aR0NYGJ+k+pR6f+KrPC96M" }, + .{ .seed = 0x4d612125bdc4fd00, .hash = 0xecf3de1acd04651f, .b64 = 
"8T6wrqCtEO6/rwxF6lvMeyuigVOLwPipX/FULvwyu+1wa5sQGav/2FsLHUVn6cGSi0LlFwLewGHPFJDLR0u4t7ZUyM//x6da0sWgOa5hzDqjsVGmjxEHXiaXKW3i4iSZNuxoNbMQkIbVML+DkYu9ND0O2swg4itGeVSzXA==" }, + .{ .seed = 0x81826b553954464e, .hash = 0xcc0a40552559ff32, .b64 = "Ntf1bMRdondtMv1CYr3G80iDJ4WSAlKy5H34XdGruQiCrnRGDBa+eUi7vKp4gp3BBcVGl8eYSasVQQjn7MLvb3BjtXx6c/bCL7JtpzQKaDnPr9GWRxpBXVxKREgMM7d8lm35EODv0w+hQLfVSh8OGs7fsBb68nNWPLeeSOo=" }, + .{ .seed = 0xc2e5d345dc0ddd2d, .hash = 0xc385c374f20315b1, .b64 = "VsSAw72Ro6xks02kaiLuiTEIWBC5bgqr4WDnmP8vglXzAhixk7td926rm9jNimL+kroPSygZ9gl63aF5DCPOACXmsbmhDrAQuUzoh9ZKhWgElLQsrqo1KIjWoZT5b5QfVUXY9lSIBg3U75SqORoTPq7HalxxoIT5diWOcJQi" }, + .{ .seed = 0x3da6830a9e32631e, .hash = 0xb90208a4c7234183, .b64 = "j+loZ+C87+bJxNVebg94gU0mSLeDulcHs84tQT7BZM2rzDSLiCNxUedHr1ZWJ9ejTiBa0dqy2I2ABc++xzOLcv+//YfibtjKtYggC6/3rv0XCc7xu6d/O6xO+XOBhOWAQ+IHJVHf7wZnDxIXB8AUHsnjEISKj7823biqXjyP3g==" }, + .{ .seed = 0xc9ae5c8759b4877a, .hash = 0x58aa1ca7a4c075d9, .b64 = "f3LlpcPElMkspNtDq5xXyWU62erEaKn7RWKlo540gR6mZsNpK1czV/sOmqaq8XAQLEn68LKj6/cFkJukxRzCa4OF1a7cCAXYFp9+wZDu0bw4y63qbpjhdCl8GO6Z2lkcXy7KOzbPE01ukg7+gN+7uKpoohgAhIwpAKQXmX5xtd0=" }, +}; diff --git a/src/stdx.zig b/src/stdx.zig new file mode 100644 index 0000000..33fac5e --- /dev/null +++ b/src/stdx.zig @@ -0,0 +1,728 @@ +//! Extensions to the standard library -- things which could have been in std, but aren't. + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; + +pub const BoundedArray = @import("bounded_array.zig").BoundedArray; + +pub inline fn div_ceil(numerator: anytype, denominator: anytype) @TypeOf(numerator, denominator) { + comptime { + switch (@typeInfo(@TypeOf(numerator))) { + .Int => |int| assert(int.signedness == .unsigned), + .ComptimeInt => assert(numerator >= 0), + else => @compileError("div_ceil: invalid numerator type"), + } + + switch (@typeInfo(@TypeOf(denominator))) { + .Int => |int| assert(int.signedness == .unsigned), + .ComptimeInt => assert(denominator > 0), + else => @compileError("div_ceil: invalid denominator type"), + } + } + + assert(denominator > 0); + + if (numerator == 0) return 0; + return @divFloor(numerator - 1, denominator) + 1; +} + +test "div_ceil" { + // Comptime ints. + try std.testing.expectEqual(div_ceil(0, 8), 0); + try std.testing.expectEqual(div_ceil(1, 8), 1); + try std.testing.expectEqual(div_ceil(7, 8), 1); + try std.testing.expectEqual(div_ceil(8, 8), 1); + try std.testing.expectEqual(div_ceil(9, 8), 2); + + // Unsized ints + const max = std.math.maxInt(u64); + try std.testing.expectEqual(div_ceil(@as(u64, 0), 8), 0); + try std.testing.expectEqual(div_ceil(@as(u64, 1), 8), 1); + try std.testing.expectEqual(div_ceil(@as(u64, max), 2), max / 2 + 1); + try std.testing.expectEqual(div_ceil(@as(u64, max) - 1, 2), max / 2); + try std.testing.expectEqual(div_ceil(@as(u64, max) - 2, 2), max / 2); +} + +pub const CopyPrecision = enum { exact, inexact }; + +pub inline fn copy_left( + comptime precision: CopyPrecision, + comptime T: type, + target: []T, + source: []const T, +) void { + switch (precision) { + .exact => assert(target.len == source.len), + .inexact => assert(target.len >= source.len), + } + + if (!disjoint_slices(T, T, target, source)) { + assert(@intFromPtr(target.ptr) < @intFromPtr(source.ptr)); + } + std.mem.copy(T, target, source); +} + +test "copy_left" { + const a = try std.testing.allocator.alloc(usize, 8); + defer std.testing.allocator.free(a); + + for (a, 0..) 
|*v, i| v.* = i;
+    copy_left(.exact, usize, a[0..6], a[2..]);
+    try std.testing.expect(std.mem.eql(usize, a, &.{ 2, 3, 4, 5, 6, 7, 6, 7 }));
+}
+
+pub inline fn copy_right(
+    comptime precision: CopyPrecision,
+    comptime T: type,
+    target: []T,
+    source: []const T,
+) void {
+    switch (precision) {
+        .exact => assert(target.len == source.len),
+        .inexact => assert(target.len >= source.len),
+    }
+
+    if (!disjoint_slices(T, T, target, source)) {
+        assert(@intFromPtr(target.ptr) > @intFromPtr(source.ptr));
+    }
+    std.mem.copyBackwards(T, target, source);
+}
+
+test "copy_right" {
+    const a = try std.testing.allocator.alloc(usize, 8);
+    defer std.testing.allocator.free(a);
+
+    for (a, 0..) |*v, i| v.* = i;
+    copy_right(.exact, usize, a[2..], a[0..6]);
+    try std.testing.expect(std.mem.eql(usize, a, &.{ 0, 1, 0, 1, 2, 3, 4, 5 }));
+}
+
+pub inline fn copy_disjoint(
+    comptime precision: CopyPrecision,
+    comptime T: type,
+    target: []T,
+    source: []const T,
+) void {
+    switch (precision) {
+        .exact => assert(target.len == source.len),
+        .inexact => assert(target.len >= source.len),
+    }
+
+    assert(disjoint_slices(T, T, target, source));
+    std.mem.copy(T, target, source);
+}
+
+pub inline fn disjoint_slices(comptime A: type, comptime B: type, a: []const A, b: []const B) bool {
+    return @intFromPtr(a.ptr) + a.len * @sizeOf(A) <= @intFromPtr(b.ptr) or
+        @intFromPtr(b.ptr) + b.len * @sizeOf(B) <= @intFromPtr(a.ptr);
+}
+
+test "disjoint_slices" {
+    const a = try std.testing.allocator.alignedAlloc(u8, @sizeOf(u32), 8 * @sizeOf(u32));
+    defer std.testing.allocator.free(a);
+
+    const b = try std.testing.allocator.alloc(u32, 8);
+    defer std.testing.allocator.free(b);
+
+    try std.testing.expectEqual(true, disjoint_slices(u8, u32, a, b));
+    try std.testing.expectEqual(true, disjoint_slices(u32, u8, b, a));
+
+    try std.testing.expectEqual(true, disjoint_slices(u8, u8, a, a[0..0]));
+    try std.testing.expectEqual(true, disjoint_slices(u32, u32, b, b[0..0]));
+
+    try std.testing.expectEqual(false, disjoint_slices(u8, u8, a, a[0..1]));
+    try std.testing.expectEqual(false, disjoint_slices(u8, u8, a, a[a.len - 1 .. a.len]));
+
+    try std.testing.expectEqual(false, disjoint_slices(u32, u32, b, b[0..1]));
+    try std.testing.expectEqual(false, disjoint_slices(u32, u32, b, b[b.len - 1 .. b.len]));
+
+    try std.testing.expectEqual(false, disjoint_slices(u8, u32, a, std.mem.bytesAsSlice(u32, a)));
+    try std.testing.expectEqual(false, disjoint_slices(u32, u8, b, std.mem.sliceAsBytes(b)));
+}
+
+/// Checks that a byte slice is zeroed.
+pub fn zeroed(bytes: []const u8) bool {
+    // This implementation already gets vectorized
+    // https://godbolt.org/z/46cMsPKPc
+    var byte_bits: u8 = 0;
+    for (bytes) |byte| {
+        byte_bits |= byte;
+    }
+    return byte_bits == 0;
+}
+
+const Cut = struct {
+    prefix: []const u8,
+    suffix: []const u8,
+};
+
+/// Splits the `haystack` around the first occurrence of `needle`, returning parts before and after.
+///
+/// This is a Zig version of Go's `strings.Cut` / Rust's `str::split_once`. Cut turns out to be a
+/// surprisingly versatile primitive for ad-hoc string processing. Often `std.mem.indexOf` and
+/// `std.mem.split` can be replaced with shorter and clearer code using `cut`.
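+///
+/// A hypothetical usage sketch (illustration only, not from the upstream docs):
+///
+///   if (cut("key=value", "=")) |kv| {
+///       assert(std.mem.eql(u8, kv.prefix, "key"));
+///       assert(std.mem.eql(u8, kv.suffix, "value"));
+///   }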
+pub fn cut(haystack: []const u8, needle: []const u8) ?Cut { + const index = std.mem.indexOf(u8, haystack, needle) orelse return null; + + return Cut{ + .prefix = haystack[0..index], + .suffix = haystack[index + needle.len ..], + }; +} + +/// `maybe` is the dual of `assert`: it signals that condition is sometimes true +/// and sometimes false. +/// +/// Currently we use it for documentation, but maybe one day we plug it into +/// coverage. +pub fn maybe(ok: bool) void { + assert(ok or !ok); +} + +/// Signal that something is not yet fully implemented, and abort the process. +/// +/// In VOPR, this will exit with status 0, to make it easy to find "real" failures by running +/// the simulator in a loop. +pub fn unimplemented(comptime message: []const u8) noreturn { + const full_message = "unimplemented: " ++ message; + const root = @import("root"); + if (@hasDecl(root, "Simulator")) { + root.output.info(full_message, .{}); + root.output.info("not crashing in VOPR", .{}); + std.process.exit(0); + } + @panic(full_message); +} + +/// Utility function for ad-hoc profiling. +/// +/// A thin wrapper around `std.time.Timer` which handles the boilerplate of +/// printing to stderr and formatting times in some (unspecified) readable way. +pub fn timeit() TimeIt { + return TimeIt{ .inner = std.time.Timer.start() catch unreachable }; +} + +const TimeIt = struct { + inner: std.time.Timer, + + /// Prints elapsed time to stderr and resets the internal timer. + pub fn lap(self: *TimeIt, comptime label: []const u8) void { + const label_alignment = comptime " " ** (1 + (12 -| label.len)); + + const nanos = self.inner.lap(); + std.debug.print( + label ++ ":" ++ label_alignment ++ "{}\n", + .{std.fmt.fmtDuration(nanos)}, + ); + } +}; + +pub const log = if (builtin.is_test) + // Downgrade `err` to `warn` for tests. + // Zig fails any test that does `log.err`, but we want to test those code paths here. + struct { + pub fn scoped(comptime scope: @Type(.EnumLiteral)) type { + const base = std.log.scoped(scope); + return struct { + pub const err = warn; + pub const warn = base.warn; + pub const info = base.info; + pub const debug = base.debug; + }; + } + } +else + std.log; + +/// Compare two values by directly comparing the underlying memory. +/// +/// Assert at compile time that this is a reasonable thing to do for a given `T`. That is, check +/// that: +/// - `T` doesn't have any non-deterministic padding, +/// - `T` doesn't embed any pointers. +pub fn equal_bytes(comptime T: type, a: *const T, b: *const T) bool { + comptime assert(has_unique_representation(T)); + comptime assert(!has_pointers(T)); + comptime assert(@sizeOf(T) * 8 == @bitSizeOf(T)); + + // Pick the biggest "word" for word-wise comparison, and don't try to early-return on the first + // mismatch, so that a compiler can vectorize the loop. + + const Word = inline for (.{ u64, u32, u16, u8 }) |Word| { + if (@alignOf(T) >= @alignOf(Word) and @sizeOf(T) % @sizeOf(Word) == 0) break Word; + } else unreachable; + + const a_words = std.mem.bytesAsSlice(Word, std.mem.asBytes(a)); + const b_words = std.mem.bytesAsSlice(Word, std.mem.asBytes(b)); + assert(a_words.len == b_words.len); + + var total: Word = 0; + for (a_words, 0..) |a_word, i| { + const b_word = b_words[i]; + total |= a_word ^ b_word; + } + + return total == 0; +} + +fn has_pointers(comptime T: type) bool { + switch (@typeInfo(T)) { + .Pointer => return true, + // Be conservative. 
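+        // Optionals, unions, and any other types not explicitly listed below
+        // might embed pointers, so assume that they do.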
+ else => return true, + + .Bool, .Int, .Enum => return false, + + .Array => |info| return comptime has_pointers(info.child), + .Struct => |info| { + inline for (info.fields) |field| { + if (comptime has_pointers(field.type)) return true; + } + return false; + }, + } +} + +/// Checks that a type does not have implicit padding. +pub fn no_padding(comptime T: type) bool { + comptime switch (@typeInfo(T)) { + .Int => return @bitSizeOf(T) == 8 * @sizeOf(T), + .Array => |info| return no_padding(info.child), + .Struct => |info| { + switch (info.layout) { + .Auto => return false, + .Extern => { + for (info.fields) |field| { + if (!no_padding(field.type)) return false; + } + + // Check offsets of u128 and pseudo-u256 fields. + for (info.fields) |field| { + if (field.type == u128) { + const offset = @offsetOf(T, field.name); + if (offset % @sizeOf(u128) != 0) return false; + + if (@hasField(T, field.name ++ "_padding")) { + if (offset % @sizeOf(u256) != 0) return false; + if (offset + @sizeOf(u128) != + @offsetOf(T, field.name ++ "_padding")) + { + return false; + } + } + } + } + + var offset = 0; + for (info.fields) |field| { + const field_offset = @offsetOf(T, field.name); + if (offset != field_offset) return false; + offset += @sizeOf(field.type); + } + return offset == @sizeOf(T); + }, + .Packed => return @bitSizeOf(T) == 8 * @sizeOf(T), + } + }, + .Enum => |info| { + maybe(info.is_exhaustive); + return no_padding(info.tag_type); + }, + .Pointer => return false, + .Union => return false, + else => return false, + }; +} + +test no_padding { + comptime for (.{ + u8, + extern struct { x: u8 }, + packed struct { x: u7, y: u1 }, + extern struct { x: extern struct { y: u64, z: u64 } }, + enum(u8) { x }, + }) |T| { + assert(no_padding(T)); + }; + + comptime for (.{ + u7, + struct { x: u7 }, + struct { x: u8 }, + struct { x: u64, y: u32 }, + extern struct { x: extern struct { y: u64, z: u32 } }, + packed struct { x: u7 }, + enum(u7) { x }, + }) |T| { + assert(!no_padding(T)); + }; +} + +pub inline fn hash_inline(value: anytype) u64 { + comptime { + assert(no_padding(@TypeOf(value))); + assert(has_unique_representation(@TypeOf(value))); + } + return low_level_hash(0, switch (@typeInfo(@TypeOf(value))) { + .Struct, .Int => std.mem.asBytes(&value), + else => @compileError("unsupported hashing for " ++ @typeName(@TypeOf(value))), + }); +} + +/// Inline version of Google Abseil "LowLevelHash" (inspired by wyhash). +/// https://github.com/abseil/abseil-cpp/blob/master/absl/hash/internal/low_level_hash.cc +inline fn low_level_hash(seed: u64, input: anytype) u64 { + const salt = [_]u64{ + 0xa0761d6478bd642f, + 0xe7037ed1a0b428db, + 0x8ebc6af09c88c6e3, + 0x589965cc75374cc3, + 0x1d8e4e27c47d124f, + }; + + var in: []const u8 = input; + var state = seed ^ salt[0]; + const starting_len = input.len; + + if (in.len > 64) { + var dup = [_]u64{ state, state }; + defer state = dup[0] ^ dup[1]; + + while (in.len > 64) : (in = in[64..]) { + for (@as([2][4]u64, @bitCast(in[0..64].*)), 0..) 
|chunk, i| { + const mix1 = @as(u128, chunk[0] ^ salt[(i * 2) + 1]) *% (chunk[1] ^ dup[i]); + const mix2 = @as(u128, chunk[2] ^ salt[(i * 2) + 2]) *% (chunk[3] ^ dup[i]); + dup[i] = @as(u64, @truncate(mix1 ^ (mix1 >> 64))); + dup[i] ^= @as(u64, @truncate(mix2 ^ (mix2 >> 64))); + } + } + } + + while (in.len > 16) : (in = in[16..]) { + const chunk = @as([2]u64, @bitCast(in[0..16].*)); + const mixed = @as(u128, chunk[0] ^ salt[1]) *% (chunk[1] ^ state); + state = @as(u64, @truncate(mixed ^ (mixed >> 64))); + } + + var chunk = std.mem.zeroes([2]u64); + if (in.len > 8) { + chunk[0] = @as(u64, @bitCast(in[0..8].*)); + chunk[1] = @as(u64, @bitCast(in[in.len - 8 ..][0..8].*)); + } else if (in.len > 3) { + chunk[0] = @as(u32, @bitCast(in[0..4].*)); + chunk[1] = @as(u32, @bitCast(in[in.len - 4 ..][0..4].*)); + } else if (in.len > 0) { + chunk[0] = (@as(u64, in[0]) << 16) | (@as(u64, in[in.len / 2]) << 8) | in[in.len - 1]; + } + + var mixed = @as(u128, chunk[0] ^ salt[1]) *% (chunk[1] ^ state); + mixed = @as(u64, @truncate(mixed ^ (mixed >> 64))); + mixed *%= (@as(u64, starting_len) ^ salt[1]); + return @as(u64, @truncate(mixed ^ (mixed >> 64))); +} + +test "hash_inline" { + for (@import("low_level_hash_vectors.zig").cases) |case| { + var buffer: [0x100]u8 = undefined; + + const b64 = std.base64.standard; + const input = buffer[0..try b64.Decoder.calcSizeForSlice(case.b64)]; + try b64.Decoder.decode(input, case.b64); + + const hash = low_level_hash(case.seed, input); + try std.testing.expectEqual(case.hash, hash); + } +} + +/// Returns a copy of `base` with fields changed according to `diff`. +/// +/// Intended exclusively for table-driven prototype-based tests. Write +/// updates explicitly in production code. +pub fn update(base: anytype, diff: anytype) @TypeOf(base) { + assert(builtin.is_test); + assert(@typeInfo(@TypeOf(base)) == .Struct); + + var updated = base; + inline for (std.meta.fields(@TypeOf(diff))) |f| { + @field(updated, f.name) = @field(diff, f.name); + } + return updated; +} + +// std.SemanticVersion requires there be no extra characters after the +// major/minor/patch numbers. But when we try to parse `uname +// --kernel-release` (note: while Linux doesn't follow semantic +// versioning, it doesn't violate it either), some distributions have +// extra characters, such as this Fedora one: 6.3.8-100.fc37.x86_64, and +// this WSL one has more than three dots: +// 5.15.90.1-microsoft-standard-WSL2. +pub fn parse_dirty_semver(dirty_release: []const u8) !std.SemanticVersion { + const release = blk: { + var last_valid_version_character_index: usize = 0; + var dots_found: u8 = 0; + for (dirty_release) |c| { + if (c == '.') dots_found += 1; + if (dots_found == 3) { + break; + } + + if (c == '.' 
or (c >= '0' and c <= '9')) { + last_valid_version_character_index += 1; + continue; + } + + break; + } + + break :blk dirty_release[0..last_valid_version_character_index]; + }; + + return std.SemanticVersion.parse(release); +} + +test "stdx.zig: parse_dirty_semver" { + const SemverTestCase = struct { + dirty_release: []const u8, + expected_version: std.SemanticVersion, + }; + + const cases = &[_]SemverTestCase{ + .{ + .dirty_release = "1.2.3", + .expected_version = std.SemanticVersion{ .major = 1, .minor = 2, .patch = 3 }, + }, + .{ + .dirty_release = "1001.843.909", + .expected_version = std.SemanticVersion{ .major = 1001, .minor = 843, .patch = 909 }, + }, + .{ + .dirty_release = "6.3.8-100.fc37.x86_64", + .expected_version = std.SemanticVersion{ .major = 6, .minor = 3, .patch = 8 }, + }, + .{ + .dirty_release = "5.15.90.1-microsoft-standard-WSL2", + .expected_version = std.SemanticVersion{ .major = 5, .minor = 15, .patch = 90 }, + }, + }; + for (cases) |case| { + const version = try parse_dirty_semver(case.dirty_release); + try std.testing.expectEqual(case.expected_version, version); + } +} + +// TODO(zig): Zig 0.11 doesn't have the statfs / fstatfs syscalls to get the type of a filesystem. +// Once those are available, this can be removed. +// The `statfs` definition used by the Linux kernel, and the magic number for tmpfs, from +// `man 2 fstatfs`. +const fsblkcnt64_t = u64; +const fsfilcnt64_t = u64; +const fsword_t = i64; +const fsid_t = u64; + +pub const TmpfsMagic = 0x01021994; +pub const StatFs = extern struct { + f_type: fsword_t, + f_bsize: fsword_t, + f_blocks: fsblkcnt64_t, + f_bfree: fsblkcnt64_t, + f_bavail: fsblkcnt64_t, + f_files: fsfilcnt64_t, + f_ffree: fsfilcnt64_t, + f_fsid: fsid_t, + f_namelen: fsword_t, + f_frsize: fsword_t, + f_flags: fsword_t, + f_spare: [4]fsword_t, +}; + +pub fn fstatfs(fd: i32, statfs_buf: *StatFs) usize { + return std.os.linux.syscall2( + if (@hasField(std.os.linux.SYS, "fstatfs64")) .fstatfs64 else .fstatfs, + @as(usize, @bitCast(@as(isize, fd))), + @intFromPtr(statfs_buf), + ); +} + +// TODO(Zig): https://github.com/ziglang/zig/issues/17592. +/// True if every value of the type `T` has a unique bit pattern representing it. +/// In other words, `T` has no unused bits and no padding. +pub fn has_unique_representation(comptime T: type) bool { + switch (@typeInfo(T)) { + else => return false, // TODO can we know if it's true for some of these types ? + + .AnyFrame, + .Enum, + .ErrorSet, + .Fn, + => return true, + + .Bool => return false, + + .Int => |info| return @sizeOf(T) * 8 == info.bits, + + .Pointer => |info| return info.size != .Slice, + + .Array => |info| return comptime has_unique_representation(info.child), + + .Struct => |info| { + // Only consider packed structs unique if they are byte aligned. + if (info.backing_integer) |backing_integer| { + return @sizeOf(T) * 8 == @bitSizeOf(backing_integer); + } + + var sum_size = @as(usize, 0); + + inline for (info.fields) |field| { + const FieldType = field.type; + if (comptime !has_unique_representation(FieldType)) return false; + sum_size += @sizeOf(FieldType); + } + + return @sizeOf(T) == sum_size; + }, + + .Vector => |info| return comptime has_unique_representation(info.child) and + @sizeOf(T) == @sizeOf(info.child) * info.len, + } +} + +// Test vectors mostly from upstream, with some added to test the packed struct case. 
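+// (`equal_bytes` and `hash_inline` above rely on this check: a type qualifies only if
+// comparing or hashing its raw bytes cannot observe undefined padding bits.)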
+test "has_unique_representation" { + const TestStruct1 = struct { + a: u32, + b: u32, + }; + + try std.testing.expect(has_unique_representation(TestStruct1)); + + const TestStruct2 = struct { + a: u32, + b: u16, + }; + + try std.testing.expect(!has_unique_representation(TestStruct2)); + + const TestStruct3 = struct { + a: u32, + b: u32, + }; + + try std.testing.expect(has_unique_representation(TestStruct3)); + + const TestStruct4 = struct { a: []const u8 }; + + try std.testing.expect(!has_unique_representation(TestStruct4)); + + const TestStruct5 = struct { a: TestStruct4 }; + + try std.testing.expect(!has_unique_representation(TestStruct5)); + + const TestStruct6 = packed struct { + a: u32, + b: u31, + }; + + try std.testing.expect(!has_unique_representation(TestStruct6)); + + const TestStruct7 = struct { + a: u64, + b: TestStruct6, + }; + + try std.testing.expect(!has_unique_representation(TestStruct7)); + + const TestStruct8 = packed struct { + a: u32, + b: u32, + }; + + try std.testing.expect(has_unique_representation(TestStruct8)); + + const TestStruct9 = struct { + a: u64, + b: TestStruct8, + }; + + try std.testing.expect(has_unique_representation(TestStruct9)); + + const TestStruct10 = packed struct { + a: TestStruct8, + b: TestStruct8, + }; + + try std.testing.expect(has_unique_representation(TestStruct10)); + + const TestUnion1 = packed union { + a: u32, + b: u16, + }; + + try std.testing.expect(!has_unique_representation(TestUnion1)); + + const TestUnion2 = extern union { + a: u32, + b: u16, + }; + + try std.testing.expect(!has_unique_representation(TestUnion2)); + + const TestUnion3 = union { + a: u32, + b: u16, + }; + + try std.testing.expect(!has_unique_representation(TestUnion3)); + + const TestUnion4 = union(enum) { + a: u32, + b: u16, + }; + + try std.testing.expect(!has_unique_representation(TestUnion4)); + + inline for ([_]type{ i0, u8, i16, u32, i64 }) |T| { + try std.testing.expect(has_unique_representation(T)); + } + inline for ([_]type{ i1, u9, i17, u33, i24 }) |T| { + try std.testing.expect(!has_unique_representation(T)); + } + + try std.testing.expect(!has_unique_representation([]u8)); + try std.testing.expect(!has_unique_representation([]const u8)); + + try std.testing.expect(has_unique_representation(@Vector(4, u16))); +} + +/// Construct a `union(Enum)` type, where each union "value" type is defined in terms of the +/// variant. 
+/// +/// That is, `EnumUnionType(Enum, TypeForVariant)` is equivalent to: +/// +/// union(Enum) { +/// // For every `e` in `Enum`: +/// e: TypeForVariant(e), +/// } +/// +pub fn EnumUnionType( + comptime Enum: type, + comptime TypeForVariant: fn (comptime variant: Enum) type, +) type { + const UnionField = std.builtin.Type.UnionField; + + var fields: []const UnionField = &[_]UnionField{}; + for (std.enums.values(Enum)) |enum_variant| { + fields = fields ++ &[_]UnionField{.{ + .name = @tagName(enum_variant), + .type = TypeForVariant(enum_variant), + .alignment = @alignOf(TypeForVariant(enum_variant)), + }}; + } + + return @Type(.{ .Union = .{ + .layout = .Auto, + .fields = fields, + .decls = &.{}, + .tag_type = Enum, + } }); +} diff --git a/src/test.zig b/src/test.zig new file mode 100644 index 0000000..8e5fd4b --- /dev/null +++ b/src/test.zig @@ -0,0 +1,654 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const os = std.os; +const testing = std.testing; +const assert = std.debug.assert; + +const Time = @import("time.zig").Time; +const IO = @import("io.zig").IO; + +test "write/read/close" { + try struct { + const Context = @This(); + + io: IO, + done: bool = false, + fd: os.fd_t, + + write_buf: [20]u8 = [_]u8{97} ** 20, + read_buf: [20]u8 = [_]u8{98} ** 20, + + written: usize = 0, + read: usize = 0, + + fn run_test() !void { + const path = "test_io_write_read_close"; + const file = try std.fs.cwd().createFile(path, .{ .read = true, .truncate = true }); + defer std.fs.cwd().deleteFile(path) catch {}; + + var self: Context = .{ + .io = try IO.init(32, 0), + .fd = file.handle, + }; + defer self.io.deinit(); + + var completion: IO.Completion = undefined; + + self.io.write( + *Context, + &self, + write_callback, + &completion, + self.fd, + &self.write_buf, + 10, + ); + while (!self.done) try self.io.tick(); + + try testing.expectEqual(self.write_buf.len, self.written); + try testing.expectEqual(self.read_buf.len, self.read); + try testing.expectEqualSlices(u8, &self.write_buf, &self.read_buf); + } + + fn write_callback( + self: *Context, + completion: *IO.Completion, + result: IO.WriteError!usize, + ) void { + self.written = result catch @panic("write error"); + self.io.read(*Context, self, read_callback, completion, self.fd, &self.read_buf, 10); + } + + fn read_callback( + self: *Context, + completion: *IO.Completion, + result: IO.ReadError!usize, + ) void { + self.read = result catch @panic("read error"); + self.io.close(*Context, self, close_callback, completion, self.fd); + } + + fn close_callback( + self: *Context, + completion: *IO.Completion, + result: IO.CloseError!void, + ) void { + _ = completion; + _ = result catch @panic("close error"); + + self.done = true; + } + }.run_test(); +} + +test "accept/connect/send/receive" { + try struct { + const Context = @This(); + + io: *IO, + done: bool = false, + server: os.socket_t, + client: os.socket_t, + + accepted_sock: os.socket_t = undefined, + + send_buf: [10]u8 = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }, + recv_buf: [5]u8 = [_]u8{ 0, 1, 0, 1, 0 }, + + sent: usize = 0, + received: usize = 0, + + fn run_test() !void { + var io = try IO.init(32, 0); + defer io.deinit(); + + const address = try std.net.Address.parseIp4("127.0.0.1", 0); + const kernel_backlog = 1; + const server = try io.open_socket(address.any.family, os.SOCK.STREAM, os.IPPROTO.TCP); + defer os.closeSocket(server); + + const client = try io.open_socket(address.any.family, os.SOCK.STREAM, os.IPPROTO.TCP); + defer os.closeSocket(client); + + try os.setsockopt( 
+ server, + os.SOL.SOCKET, + os.SO.REUSEADDR, + &std.mem.toBytes(@as(c_int, 1)), + ); + try os.bind(server, &address.any, address.getOsSockLen()); + try os.listen(server, kernel_backlog); + + var client_address = std.net.Address.initIp4(undefined, undefined); + var client_address_len = client_address.getOsSockLen(); + try os.getsockname(server, &client_address.any, &client_address_len); + + var self: Context = .{ + .io = &io, + .server = server, + .client = client, + }; + + var client_completion: IO.Completion = undefined; + self.io.connect( + *Context, + &self, + connect_callback, + &client_completion, + client, + client_address, + ); + + var server_completion: IO.Completion = undefined; + self.io.accept(*Context, &self, accept_callback, &server_completion, server); + + while (!self.done) try self.io.tick(); + + try testing.expectEqual(self.send_buf.len, self.sent); + try testing.expectEqual(self.recv_buf.len, self.received); + + try testing.expectEqualSlices(u8, self.send_buf[0..self.received], &self.recv_buf); + } + + fn connect_callback( + self: *Context, + completion: *IO.Completion, + result: IO.ConnectError!void, + ) void { + _ = result catch @panic("connect error"); + + self.io.send( + *Context, + self, + send_callback, + completion, + self.client, + &self.send_buf, + ); + } + + fn send_callback( + self: *Context, + completion: *IO.Completion, + result: IO.SendError!usize, + ) void { + _ = completion; + + self.sent = result catch @panic("send error"); + } + + fn accept_callback( + self: *Context, + completion: *IO.Completion, + result: IO.AcceptError!os.socket_t, + ) void { + self.accepted_sock = result catch @panic("accept error"); + self.io.recv( + *Context, + self, + recv_callback, + completion, + self.accepted_sock, + &self.recv_buf, + ); + } + + fn recv_callback( + self: *Context, + completion: *IO.Completion, + result: IO.RecvError!usize, + ) void { + _ = completion; + + self.received = result catch @panic("recv error"); + self.done = true; + } + }.run_test(); +} + +test "timeout" { + const ms = 20; + const margin = 5; + const count = 10; + + try struct { + const Context = @This(); + + io: IO, + timer: *Time, + count: u32 = 0, + stop_time: u64 = 0, + + fn run_test() !void { + var timer = Time{}; + const start_time = timer.monotonic(); + var self: Context = .{ + .timer = &timer, + .io = try IO.init(32, 0), + }; + defer self.io.deinit(); + + var completions: [count]IO.Completion = undefined; + for (&completions) |*completion| { + self.io.timeout( + *Context, + &self, + timeout_callback, + completion, + ms * std.time.ns_per_ms, + ); + } + while (self.count < count) try self.io.tick(); + + try self.io.tick(); + try testing.expectEqual(@as(u32, count), self.count); + + try testing.expectApproxEqAbs( + @as(f64, ms), + @as(f64, @floatFromInt((self.stop_time - start_time) / std.time.ns_per_ms)), + margin, + ); + } + + fn timeout_callback( + self: *Context, + completion: *IO.Completion, + result: IO.TimeoutError!void, + ) void { + _ = completion; + _ = result catch @panic("timeout error"); + + if (self.stop_time == 0) self.stop_time = self.timer.monotonic(); + self.count += 1; + } + }.run_test(); +} + +test "submission queue full" { + const ms = 20; + const count = 10; + + try struct { + const Context = @This(); + + io: IO, + count: u32 = 0, + + fn run_test() !void { + var self: Context = .{ .io = try IO.init(1, 0) }; + defer self.io.deinit(); + + var completions: [count]IO.Completion = undefined; + for (&completions) |*completion| { + self.io.timeout( + *Context, + &self, + 
timeout_callback,
+                    completion,
+                    ms * std.time.ns_per_ms,
+                );
+            }
+            while (self.count < count) try self.io.tick();
+
+            try self.io.tick();
+            try testing.expectEqual(@as(u32, count), self.count);
+        }
+
+        fn timeout_callback(
+            self: *Context,
+            completion: *IO.Completion,
+            result: IO.TimeoutError!void,
+        ) void {
+            _ = completion;
+            _ = result catch @panic("timeout error");
+
+            self.count += 1;
+        }
+    }.run_test();
+}
+
+test "tick to wait" {
+    // Use only IO.tick() to see if pending IO is actually processed
+
+    try struct {
+        const Context = @This();
+
+        io: IO,
+        accepted: os.socket_t = IO.INVALID_SOCKET,
+        connected: bool = false,
+        received: bool = false,
+
+        fn run_test() !void {
+            var self: Context = .{ .io = try IO.init(1, 0) };
+            defer self.io.deinit();
+
+            const address = try std.net.Address.parseIp4("127.0.0.1", 0);
+            const kernel_backlog = 1;
+
+            const server = try self.io.open_socket(address.any.family, os.SOCK.STREAM, os.IPPROTO.TCP);
+            defer os.closeSocket(server);
+
+            try os.setsockopt(
+                server,
+                os.SOL.SOCKET,
+                os.SO.REUSEADDR,
+                &std.mem.toBytes(@as(c_int, 1)),
+            );
+            try os.bind(server, &address.any, address.getOsSockLen());
+            try os.listen(server, kernel_backlog);
+
+            var client_address = std.net.Address.initIp4(undefined, undefined);
+            var client_address_len = client_address.getOsSockLen();
+            try os.getsockname(server, &client_address.any, &client_address_len);
+
+            const client = try self.io.open_socket(client_address.any.family, os.SOCK.STREAM, os.IPPROTO.TCP);
+            defer os.closeSocket(client);
+
+            // Start the accept
+            var server_completion: IO.Completion = undefined;
+            self.io.accept(*Context, &self, accept_callback, &server_completion, server);
+
+            // Start the connect
+            var client_completion: IO.Completion = undefined;
+            self.io.connect(
+                *Context,
+                &self,
+                connect_callback,
+                &client_completion,
+                client,
+                client_address,
+            );
+
+            // Tick the IO to drain the accept & connect completions
+            assert(!self.connected);
+            assert(self.accepted == IO.INVALID_SOCKET);
+
+            while (self.accepted == IO.INVALID_SOCKET or !self.connected)
+                try self.io.tick();
+
+            assert(self.connected);
+            assert(self.accepted != IO.INVALID_SOCKET);
+            defer os.closeSocket(self.accepted);
+
+            // Start receiving on the client
+            var recv_completion: IO.Completion = undefined;
+            var recv_buffer: [64]u8 = undefined;
+            @memset(&recv_buffer, 0xaa);
+            self.io.recv(
+                *Context,
+                &self,
+                recv_callback,
+                &recv_completion,
+                client,
+                &recv_buffer,
+            );
+
+            // Drain out the recv completion from any internal IO queues
+            try self.io.tick();
+            try self.io.tick();
+            try self.io.tick();
+
+            // Complete the recv() *outside* of the IO instance.
+            // Other tests already check .tick() with IO based completions.
+            // This simulates IO being completed by an external system
+            var send_buf = std.mem.zeroes([64]u8);
+            const wrote = try os_send(self.accepted, &send_buf, 0);
+            try testing.expectEqual(wrote, send_buf.len);
+
+            // Wait for the recv() to complete using only IO.tick().
+ // If tick is broken, then this will deadlock + assert(!self.received); + while (!self.received) { + try self.io.tick(); + } + + // Make sure the receive actually happened + assert(self.received); + try testing.expect(std.mem.eql(u8, &recv_buffer, &send_buf)); + } + + fn accept_callback( + self: *Context, + completion: *IO.Completion, + result: IO.AcceptError!os.socket_t, + ) void { + _ = completion; + + assert(self.accepted == IO.INVALID_SOCKET); + self.accepted = result catch @panic("accept error"); + } + + fn connect_callback( + self: *Context, + completion: *IO.Completion, + result: IO.ConnectError!void, + ) void { + _ = completion; + _ = result catch @panic("connect error"); + + assert(!self.connected); + self.connected = true; + } + + fn recv_callback( + self: *Context, + completion: *IO.Completion, + result: IO.RecvError!usize, + ) void { + _ = completion; + _ = result catch |err| std.debug.panic("recv error: {}", .{err}); + + assert(!self.received); + self.received = true; + } + + // TODO: use os.send() instead when it gets fixed for windows + fn os_send(sock: os.socket_t, buf: []const u8, flags: u32) !usize { + if (builtin.target.os.tag != .windows) { + return os.send(sock, buf, flags); + } + + const rc = os.windows.sendto(sock, buf.ptr, buf.len, flags, null, 0); + if (rc == os.windows.ws2_32.SOCKET_ERROR) { + switch (os.windows.ws2_32.WSAGetLastError()) { + .WSAEACCES => return error.AccessDenied, + .WSAEADDRNOTAVAIL => return error.AddressNotAvailable, + .WSAECONNRESET => return error.ConnectionResetByPeer, + .WSAEMSGSIZE => return error.MessageTooBig, + .WSAENOBUFS => return error.SystemResources, + .WSAENOTSOCK => return error.FileDescriptorNotASocket, + .WSAEAFNOSUPPORT => return error.AddressFamilyNotSupported, + .WSAEDESTADDRREQ => unreachable, // A destination address is required. + .WSAEFAULT => unreachable, // The lpBuffers, lpTo, lpOverlapped, lpNumberOfBytesSent, or lpCompletionRoutine parameters are not part of the user address space, or the lpTo parameter is too small. + .WSAEHOSTUNREACH => return error.NetworkUnreachable, + // TODO: WSAEINPROGRESS, WSAEINTR + .WSAEINVAL => unreachable, + .WSAENETDOWN => return error.NetworkSubsystemFailed, + .WSAENETRESET => return error.ConnectionResetByPeer, + .WSAENETUNREACH => return error.NetworkUnreachable, + .WSAENOTCONN => return error.SocketNotConnected, + .WSAESHUTDOWN => unreachable, // The socket has been shut down; it is not possible to WSASendTo on a socket after shutdown has been invoked with how set to SD_SEND or SD_BOTH. + .WSAEWOULDBLOCK => return error.WouldBlock, + .WSANOTINITIALISED => unreachable, // A successful WSAStartup call must occur before using this function. 
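+                    // Any other code is unexpected for a blocking send on a connected
+                    // TCP socket; let the standard library report it.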
+ else => |err| return os.windows.unexpectedWSAError(err), + } + } else { + return @as(usize, @intCast(rc)); + } + } + }.run_test(); +} + +test "pipe data over socket" { + try struct { + io: IO, + tx: Pipe, + rx: Pipe, + server: Socket = .{}, + + const buffer_size = 1 * 1024 * 1024; + + const Context = @This(); + const Socket = struct { + fd: os.socket_t = IO.INVALID_SOCKET, + completion: IO.Completion = undefined, + }; + const Pipe = struct { + socket: Socket = .{}, + buffer: []u8, + transferred: usize = 0, + }; + + fn run() !void { + const tx_buf = try testing.allocator.alloc(u8, buffer_size); + defer testing.allocator.free(tx_buf); + const rx_buf = try testing.allocator.alloc(u8, buffer_size); + defer testing.allocator.free(rx_buf); + + @memset(tx_buf, 1); + @memset(rx_buf, 0); + var self = Context{ + .io = try IO.init(32, 0), + .tx = .{ .buffer = tx_buf }, + .rx = .{ .buffer = rx_buf }, + }; + defer self.io.deinit(); + + self.server.fd = try self.io.open_socket(os.AF.INET, os.SOCK.STREAM, os.IPPROTO.TCP); + defer os.closeSocket(self.server.fd); + + const address = try std.net.Address.parseIp4("127.0.0.1", 0); + try os.setsockopt( + self.server.fd, + os.SOL.SOCKET, + os.SO.REUSEADDR, + &std.mem.toBytes(@as(c_int, 1)), + ); + + try os.bind(self.server.fd, &address.any, address.getOsSockLen()); + try os.listen(self.server.fd, 1); + + var client_address = std.net.Address.initIp4(undefined, undefined); + var client_address_len = client_address.getOsSockLen(); + try os.getsockname(self.server.fd, &client_address.any, &client_address_len); + + self.io.accept( + *Context, + &self, + on_accept, + &self.server.completion, + self.server.fd, + ); + + self.tx.socket.fd = try self.io.open_socket(os.AF.INET, os.SOCK.STREAM, os.IPPROTO.TCP); + defer os.closeSocket(self.tx.socket.fd); + + self.io.connect( + *Context, + &self, + on_connect, + &self.tx.socket.completion, + self.tx.socket.fd, + client_address, + ); + + var tick: usize = 0xdeadbeef; + while (self.rx.transferred != self.rx.buffer.len) : (tick +%= 1) { + if (tick % 61 == 0) { + const timeout_ns = tick % (10 * std.time.ns_per_ms); + try self.io.run_for_ns(@as(u63, @intCast(timeout_ns))); + } else { + try self.io.tick(); + } + } + + try testing.expect(self.server.fd != IO.INVALID_SOCKET); + try testing.expect(self.tx.socket.fd != IO.INVALID_SOCKET); + try testing.expect(self.rx.socket.fd != IO.INVALID_SOCKET); + os.closeSocket(self.rx.socket.fd); + + try testing.expectEqual(self.tx.transferred, buffer_size); + try testing.expectEqual(self.rx.transferred, buffer_size); + try testing.expect(std.mem.eql(u8, self.tx.buffer, self.rx.buffer)); + } + + fn on_accept( + self: *Context, + completion: *IO.Completion, + result: IO.AcceptError!os.socket_t, + ) void { + assert(self.rx.socket.fd == IO.INVALID_SOCKET); + assert(&self.server.completion == completion); + self.rx.socket.fd = result catch |err| std.debug.panic("accept error {}", .{err}); + + assert(self.rx.transferred == 0); + self.do_receiver(0); + } + + fn on_connect( + self: *Context, + completion: *IO.Completion, + result: IO.ConnectError!void, + ) void { + _ = result catch unreachable; + + assert(self.tx.socket.fd != IO.INVALID_SOCKET); + assert(&self.tx.socket.completion == completion); + + assert(self.tx.transferred == 0); + self.do_sender(0); + } + + fn do_sender(self: *Context, bytes: usize) void { + self.tx.transferred += bytes; + assert(self.tx.transferred <= self.tx.buffer.len); + + if (self.tx.transferred < self.tx.buffer.len) { + self.io.send( + *Context, + self, + on_send, + 
&self.tx.socket.completion,
+                    self.tx.socket.fd,
+                    self.tx.buffer[self.tx.transferred..],
+                );
+            }
+        }
+
+        fn on_send(
+            self: *Context,
+            completion: *IO.Completion,
+            result: IO.SendError!usize,
+        ) void {
+            const bytes = result catch |err| std.debug.panic("send error: {}", .{err});
+            assert(&self.tx.socket.completion == completion);
+            self.do_sender(bytes);
+        }
+
+        fn do_receiver(self: *Context, bytes: usize) void {
+            self.rx.transferred += bytes;
+            assert(self.rx.transferred <= self.rx.buffer.len);
+
+            if (self.rx.transferred < self.rx.buffer.len) {
+                self.io.recv(
+                    *Context,
+                    self,
+                    on_recv,
+                    &self.rx.socket.completion,
+                    self.rx.socket.fd,
+                    self.rx.buffer[self.rx.transferred..],
+                );
+            }
+        }
+
+        fn on_recv(
+            self: *Context,
+            completion: *IO.Completion,
+            result: IO.RecvError!usize,
+        ) void {
+            const bytes = result catch |err| std.debug.panic("recv error: {}", .{err});
+            assert(&self.rx.socket.completion == completion);
+            self.do_receiver(bytes);
+        }
+    }.run();
+}
diff --git a/src/time.zig b/src/time.zig
new file mode 100644
index 0000000..e894b6c
--- /dev/null
+++ b/src/time.zig
@@ -0,0 +1,112 @@
+const std = @import("std");
+const builtin = @import("builtin");
+
+const os = std.os;
+const assert = std.debug.assert;
+const is_darwin = builtin.target.os.tag.isDarwin();
+const is_windows = builtin.target.os.tag == .windows;
+
+pub const Time = struct {
+    const Self = @This();
+
+    /// Hardware and/or software bugs can mean that the monotonic clock may regress.
+    /// One example (of many): https://bugzilla.redhat.com/show_bug.cgi?id=448449
+    /// We crash the process for safety if this ever happens, to protect against infinite loops.
+    /// It's better to crash and come back with a valid monotonic clock than get stuck forever.
+    monotonic_guard: u64 = 0,
+
+    /// A timestamp to measure elapsed time, meaningful only on the same system, not across reboots.
+    /// Always use a monotonic timestamp if the goal is to measure elapsed time.
+    /// This clock is not affected by discontinuous jumps in the system time, for example if the
+    /// system administrator manually changes the clock.
+    pub fn monotonic(self: *Self) u64 {
+        const m = blk: {
+            // Uses QueryPerformanceCounter() on Windows, as it is the highest-precision timer
+            // available while also accounting for time spent suspended by default:
+            // https://docs.microsoft.com/en-us/windows/win32/api/realtimeapiset/nf-realtimeapiset-queryunbiasedinterrupttime#remarks
+            if (is_windows) {
+                // QPF need not be globally cached either, as it ends up being a load from read-only
+                // memory mapped into all processes by the kernel, called KUSER_SHARED_DATA (see "QpcFrequency")
+                // https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/ntddk/ns-ntddk-kuser_shared_data
+                // https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/ntexapi_x/kuser_shared_data/index.htm
+                const qpc = os.windows.QueryPerformanceCounter();
+                const qpf = os.windows.QueryPerformanceFrequency();
+
+                // 10MHz (1 qpc tick every 100ns) is a common QPF on modern systems.
+                // We can optimize for this by converting to ns via a single multiply.
+                // https://github.com/microsoft/STL/blob/785143a0c73f030238ef618890fd4d6ae2b3a3a0/stl/inc/chrono#L694-L701
+                const common_qpf = 10_000_000;
+                if (qpf == common_qpf) break :blk qpc * (std.time.ns_per_s / common_qpf);
+
+                // Convert qpc to nanos using fixed point to avoid expensive extra divs and overflow.
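+                // That is: elapsed_ns = qpc * (ns_per_s / qpf), computed in 32.32 fixed point:
+                // scale = (ns_per_s << 32) / qpf, then ns = (qpc * scale) >> 32, with the
+                // multiply widened to u96 so it cannot overflow in practice.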
+ const scale = (std.time.ns_per_s << 32) / qpf; + break :blk @as(u64, @truncate((@as(u96, qpc) * scale) >> 32)); + } + + // Uses mach_continuous_time() instead of mach_absolute_time() as it counts while suspended. + // https://developer.apple.com/documentation/kernel/1646199-mach_continuous_time + // https://opensource.apple.com/source/Libc/Libc-1158.1.2/gen/clock_gettime.c.auto.html + if (is_darwin) { + const darwin = struct { + const mach_timebase_info_t = os.darwin.mach_timebase_info_data; + extern "c" fn mach_timebase_info(info: *mach_timebase_info_t) os.darwin.kern_return_t; + extern "c" fn mach_continuous_time() u64; + }; + + // mach_timebase_info() called through libc already does global caching for us + // https://opensource.apple.com/source/xnu/xnu-7195.81.3/libsyscall/wrappers/mach_timebase_info.c.auto.html + var info: darwin.mach_timebase_info_t = undefined; + if (darwin.mach_timebase_info(&info) != 0) @panic("mach_timebase_info() failed"); + + const now = darwin.mach_continuous_time(); + return (now * info.numer) / info.denom; + } + + // The true monotonic clock on Linux is not in fact CLOCK_MONOTONIC: + // CLOCK_MONOTONIC excludes elapsed time while the system is suspended (e.g. VM migration). + // CLOCK_BOOTTIME is the same as CLOCK_MONOTONIC but includes elapsed time during a suspend. + // For more detail and why CLOCK_MONOTONIC_RAW is even worse than CLOCK_MONOTONIC, + // see https://github.com/ziglang/zig/pull/933#discussion_r656021295. + var ts: os.timespec = undefined; + os.clock_gettime(os.CLOCK.BOOTTIME, &ts) catch @panic("CLOCK_BOOTTIME required"); + break :blk @as(u64, @intCast(ts.tv_sec)) * std.time.ns_per_s + @as(u64, @intCast(ts.tv_nsec)); + }; + + // "Oops!...I Did It Again" + if (m < self.monotonic_guard) @panic("a hardware/kernel bug regressed the monotonic clock"); + self.monotonic_guard = m; + return m; + } + + /// A timestamp to measure real (i.e. wall clock) time, meaningful across systems, and reboots. + /// This clock is affected by discontinuous jumps in the system time. + pub fn realtime(_: *Self) i64 { + if (is_windows) { + const kernel32 = struct { + extern "kernel32" fn GetSystemTimePreciseAsFileTime( + lpFileTime: *os.windows.FILETIME, + ) callconv(os.windows.WINAPI) void; + }; + + var ft: os.windows.FILETIME = undefined; + kernel32.GetSystemTimePreciseAsFileTime(&ft); + const ft64 = (@as(u64, ft.dwHighDateTime) << 32) | ft.dwLowDateTime; + + // FileTime is in units of 100 nanoseconds + // and uses the NTFS/Windows epoch of 1601-01-01 instead of Unix Epoch 1970-01-01. + const epoch_adjust = std.time.epoch.windows * (std.time.ns_per_s / 100); + return (@as(i64, @bitCast(ft64)) + epoch_adjust) * 100; + } + + if (is_darwin) { + // macos has supported clock_gettime() since 10.12: + // https://opensource.apple.com/source/Libc/Libc-1158.1.2/gen/clock_gettime.3.auto.html + } + + var ts: os.timespec = undefined; + os.clock_gettime(os.CLOCK.REALTIME, &ts) catch unreachable; + return @as(i64, ts.tv_sec) * std.time.ns_per_s + ts.tv_nsec; + } + + pub fn tick(_: *Self) void {} +};
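+
+// A minimal usage sketch, added for illustration (not part of the upstream module):
+// monotonic timestamps are only ever compared against each other, and the
+// monotonic_guard above ensures a given `Time` instance never observes the
+// clock running backwards.
+test "time: monotonic timestamps never regress" {
+    var time = Time{};
+    const t0 = time.monotonic();
+    const t1 = time.monotonic();
+    try std.testing.expect(t1 >= t0);
+}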