Last active
March 6, 2026 18:33
-
-
Save ivanstepanovftw/f8223bd051bcd128351571385aa7772c to your computer and use it in GitHub Desktop.
IEEE-754 binary(any)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // TODO: normalfloat https://arxiv.org/pdf/2305.14314 | |
| // TODO: NVFP4 | |
| const std = @import("std"); | |
| const math = std.math; | |
| const pretty = @import("pretty.zig"); | |
| const print = pretty.print; | |
| const p = pretty.p; | |
| const comptimePrint = std.fmt.comptimePrint; | |
| comptime { | |
| @setEvalBranchQuota(100000); | |
| } | |
/// Builds a binary floating-point type with an IEEE 754-style layout of
/// arbitrary width: `sign_bits` sign bits, `exponent_bits` biased-exponent bits
/// and `fraction_bits` explicit fraction bits (the leading 1 of normal numbers
/// is implicit).
///
/// `subnormals` selects whether results below the smallest normal number are
/// stored as subnormals (`true`) or flushed to a signed zero (`false`).
///
/// All conversions and arithmetic in this type truncate toward zero; there is
/// no rounding step.
///
/// Note: the fields are declared sign-first and Zig lays packed-struct fields
/// out starting at the least significant bit, so `@bitCast` of this struct does
/// not produce the IEEE interchange bit order. Use `cast` to a builtin float
/// type for that.
///
/// https://en.wikipedia.org/wiki/IEEE_754
/// https://github.com/ziglang/zig/blob/f29bdd6746691d0a547140e435056a000419480f/lib/std/math/float.zig#L13
/// https://github.com/ziglang/zig/blob/f29bdd6746691d0a547140e435056a000419480f/lib/std/math.zig#L1725
pub fn Float(
    comptime sign_bits: comptime_int,
    comptime exponent_bits: comptime_int,
    comptime fraction_bits: comptime_int,
    comptime subnormals: bool,
) type {
    return packed struct {
        const Self = @This();

        pub const Sign = std.meta.Int(.unsigned, sign_bits);
        pub const BiasedExponent = std.meta.Int(.unsigned, exponent_bits);
        pub const Fraction = std.meta.Int(.unsigned, fraction_bits);
        pub const exponent_bias = (1 << (exponent_bits - 1)) - 1; // https://en.wikipedia.org/wiki/Exponent_bias
        /// Whether this format stores subnormal numbers. `cast` reads this from
        /// the *target* type to decide between a subnormal result and zero.
        pub const has_subnormals = subnormals;

        sign: Sign,
        biased_exponent: BiasedExponent,
        fraction: Fraction,

        /// Assembles a value from its three raw bit fields.
        pub fn init(sign: Sign, biased_exponent: BiasedExponent, fraction: Fraction) Self {
            return Self{ .sign = sign, .biased_exponent = biased_exponent, .fraction = fraction };
        }

        /// Converts `self` to `Other`, which is either a builtin float type
        /// (f16, f32, f64, f128) or another instantiation of `Float`.
        ///
        /// Infinity maps to infinity and NaN to NaN (the payload is shifted to
        /// the new fraction width and forced non-zero). Values too large for
        /// the target become infinity. Values too small become a subnormal or
        /// a signed zero, depending on the target's `has_subnormals`. Excess
        /// fraction bits are truncated.
        pub fn cast(self: Self, comptime Other: type) Other {
            switch (@typeInfo(Other)) {
                .float => {
                    const bits = @bitSizeOf(Other);
                    const OtherStruct = switch (bits) {
                        16 => Float(1, 5, 10, true),
                        32 => Float(1, 8, 23, true),
                        64 => Float(1, 11, 52, true),
                        // 80 => Float(1, 15, 64, true), // TODO: f80 is not supported yet because of implicit leading bit in the fraction
                        128 => Float(1, 15, 112, true),
                        else => @compileError("Float.cast: unsupported builtin float width"),
                    };
                    const OtherBinary = std.meta.Int(.unsigned, bits);
                    const other_struct = self.cast(OtherStruct);
                    // Interchange layout: sign in the top bit, then the
                    // exponent, then the fraction in the low bits.
                    const sign_shift = bits - 1;
                    const exponent_shift = @bitSizeOf(OtherStruct.Fraction);
                    const other_binary: OtherBinary =
                        (@as(OtherBinary, other_struct.sign) << sign_shift) |
                        (@as(OtherBinary, other_struct.biased_exponent) << exponent_shift) |
                        @as(OtherBinary, other_struct.fraction);
                    return @bitCast(other_binary);
                },
                .@"struct" => { // Assume the struct is a custom float
                    const sign: Other.Sign = @as(Other.Sign, self.sign);
                    const self_inf = (1 << @bitSizeOf(Self.BiasedExponent)) - 1;
                    const other_inf = (1 << @bitSizeOf(Other.BiasedExponent)) - 1;
                    // Flush-to-zero is a property of the destination format.
                    // Fall back to this format's flag for foreign struct types
                    // that do not declare one.
                    const target_subnormals = if (@hasDecl(Other, "has_subnormals")) Other.has_subnormals else subnormals;

                    // Infinity and NaN cases
                    if (self.biased_exponent == self_inf) {
                        if (self.fraction == 0) {
                            return Other.init(sign, @intCast(other_inf), 0); // Inf
                        }
                        // NaN: align the payload to the top of the new fraction.
                        const s_bits = @bitSizeOf(Self.Fraction);
                        const o_bits = @bitSizeOf(Other.Fraction);
                        var o_frac: Other.Fraction = 0;
                        if (s_bits > o_bits) {
                            o_frac = @truncate(self.fraction >> @intCast(s_bits - o_bits));
                        } else {
                            o_frac = @as(Other.Fraction, self.fraction) << @intCast(o_bits - s_bits);
                        }
                        // A zero fraction would read back as infinity.
                        if (o_frac == 0) o_frac = 1;
                        return Other.init(sign, @intCast(other_inf), o_frac);
                    }

                    // Zero case
                    if (self.biased_exponent == 0 and self.fraction == 0) {
                        return Other.init(sign, 0, 0);
                    }

                    // ------------------------------------------------------------------
                    // 1. Extract true exponent and integer significand (value = M * 2^E)
                    // ------------------------------------------------------------------
                    var M: u128 = self.fraction;
                    var E: i32 = 0;
                    const S_bias = @as(i32, @intCast(Self.exponent_bias));
                    const S_Fs = @as(i32, @intCast(@bitSizeOf(Self.Fraction)));
                    if (self.biased_exponent == 0) {
                        // Source is Subnormal (no implicit 1)
                        E = 1 - S_bias - S_Fs;
                    } else {
                        // Source is Normal (add implicit 1)
                        M |= (@as(u128, 1) << @intCast(S_Fs));
                        E = @as(i32, @intCast(self.biased_exponent)) - S_bias - S_Fs;
                    }
                    const O_bias = @as(i32, @intCast(Other.exponent_bias));
                    const O_Fs = @as(i32, @intCast(@bitSizeOf(Other.Fraction)));

                    // ------------------------------------------------------------------
                    // 2. Normalize M so its MSB is exactly positioned at O_Fs
                    // ------------------------------------------------------------------
                    // M is non-zero here: zero was handled above and normals
                    // carry the implicit bit.
                    const msb_idx = 127 - @as(i32, @intCast(@clz(M)));
                    var target_M: u128 = 0;
                    var target_E: i32 = E;
                    if (msb_idx < O_Fs) {
                        const shl = @as(u7, @intCast(O_Fs - msb_idx));
                        target_M = M << shl;
                        target_E -= @as(i32, @intCast(shl));
                    } else if (msb_idx > O_Fs) {
                        const shr = @as(u7, @intCast(msb_idx - O_Fs));
                        target_M = M >> shr;
                        target_E += @as(i32, @intCast(shr));
                    } else {
                        target_M = M;
                        target_E = E;
                    }

                    // ------------------------------------------------------------------
                    // 3. Pack into the Target format
                    // ------------------------------------------------------------------
                    // Calculate target biased exponent assuming it's a normal number
                    const final_O_exp = target_E + O_bias + O_Fs;
                    if (final_O_exp >= other_inf) {
                        // Overflow to Infinity
                        return Other.init(sign, @intCast(other_inf), 0);
                    } else if (final_O_exp > 0) {
                        // Target is Normal
                        const mask = (@as(u128, 1) << @intCast(O_Fs)) - 1;
                        const o_frac = @as(Other.Fraction, @truncate(target_M & mask));
                        return Other.init(sign, @intCast(final_O_exp), o_frac);
                    } else {
                        // Target is Subnormal or Underflow
                        if (!target_subnormals) {
                            // Subnormals are flushed to zero
                            return Other.init(sign, 0, 0);
                        }
                        // If exponent <= 0, we right-shift the mantissa
                        const shift_right = 1 - final_O_exp;
                        if (shift_right >= 128) {
                            return Other.init(sign, 0, 0); // Underflow to absolute zero
                        }
                        const shr = @as(u7, @intCast(shift_right));
                        const o_frac = @as(Other.Fraction, @truncate(target_M >> shr));
                        return Other.init(sign, 0, o_frac);
                    }
                },
                else => @compileError("Float.cast: target must be a builtin float or a Float(...) struct"),
            }
        }

        /// Internal representation for arithmetic.
        /// For finite values: value = mantissa * 2^(exp - fraction_bits).
        const Unpacked = struct {
            sign: Sign,
            exp: i32,
            mantissa: u128, // Includes implicit bit
            is_nan: bool = false,
            is_inf: bool = false,

            const implicit_bit = @as(u128, 1) << fraction_bits;
        };

        /// Splits `self` into sign, unbiased exponent and significand.
        fn unpack(self: Self) Unpacked {
            const max_exp = (1 << exponent_bits) - 1;
            if (self.biased_exponent == max_exp) {
                return .{
                    .sign = self.sign,
                    .exp = 0,
                    .mantissa = self.fraction,
                    .is_nan = self.fraction != 0,
                    .is_inf = self.fraction == 0,
                };
            }
            if (self.biased_exponent == 0) {
                if (self.fraction == 0) return .{ .sign = self.sign, .exp = -exponent_bias, .mantissa = 0 };
                // Subnormal: no implicit bit, fixed minimum exponent.
                return .{
                    .sign = self.sign,
                    .exp = 1 - exponent_bias,
                    .mantissa = self.fraction,
                };
            }
            // Normal
            return .{
                .sign = self.sign,
                .exp = @as(i32, @intCast(self.biased_exponent)) - exponent_bias,
                .mantissa = Unpacked.implicit_bit | self.fraction,
            };
        }

        /// Normalizes an `Unpacked` value and encodes it, truncating excess
        /// significand bits. Overflow yields infinity. Underflow yields a
        /// subnormal, or a signed zero when the value is too small or the
        /// format has no subnormals.
        fn pack(unpacked: Unpacked) Self {
            const max_biased = (1 << exponent_bits) - 1;
            if (unpacked.is_nan) return Self.init(unpacked.sign, max_biased, 1);
            if (unpacked.is_inf) return Self.init(unpacked.sign, max_biased, 0);
            if (unpacked.mantissa == 0) return Self.init(unpacked.sign, 0, 0);

            var m = unpacked.mantissa;
            var e = unpacked.exp;

            // 1. Normalize: Ensure MSB is at the implicit bit position
            const msb: i32 = 127 - @as(i32, @clz(m));
            if (msb > fraction_bits) {
                const shift: u7 = @intCast(msb - fraction_bits);
                m >>= shift;
                e += shift;
            } else if (msb < fraction_bits) {
                const shift: u7 = @intCast(fraction_bits - msb);
                m <<= shift;
                e -= shift;
            }

            // 2. Handle Exponent range
            const biased_e = e + exponent_bias;
            if (biased_e >= max_biased) {
                return Self.init(unpacked.sign, max_biased, 0); // Overflow to Inf
            }
            if (biased_e <= 0) {
                if (!subnormals) return Self.init(unpacked.sign, 0, 0);
                // Subnormal handling. Range-check in i32 *before* narrowing:
                // the shift can be far larger than a u7 for deep underflow.
                const shift = 1 - biased_e;
                // m has exactly fraction_bits + 1 significant bits, so any
                // larger shift leaves nothing.
                if (shift > fraction_bits) return Self.init(unpacked.sign, 0, 0);
                m >>= @as(u7, @intCast(shift));
                return Self.init(unpacked.sign, 0, @truncate(m));
            }
            return Self.init(unpacked.sign, @intCast(biased_e), @truncate(m ^ Unpacked.implicit_bit));
        }

        /// Returns `self + other`.
        ///
        /// NaN operands and `inf + -inf` give NaN; otherwise an infinite
        /// operand is returned unchanged. Exact cancellation of operands with
        /// opposite signs gives +0. The result is truncated toward zero.
        pub fn add(self: Self, other: Self) Self {
            const a = self.unpack();
            const b = other.unpack();

            // Handle Specials
            if (a.is_nan or b.is_nan) return pack(.{ .sign = 0, .exp = 0, .mantissa = 0, .is_nan = true });
            if (a.is_inf and b.is_inf and a.sign != b.sign) return pack(.{ .sign = 0, .exp = 0, .mantissa = 0, .is_nan = true });
            if (a.is_inf) return self;
            if (b.is_inf) return other;

            // Align exponents. Both significands are first moved to the top of
            // the u128 (leaving one bit for the carry) so that the bits of the
            // smaller operand survive the alignment shift instead of being
            // dropped before the subtraction.
            const guard_bits = 126 - fraction_bits;
            var m_a = a.mantissa << guard_bits;
            var m_b = b.mantissa << guard_bits;
            var res_exp = a.exp;
            if (a.exp > b.exp) {
                const diff = @as(u7, @intCast(@min(127, a.exp - b.exp)));
                m_b >>= diff;
            } else if (b.exp > a.exp) {
                const diff = @as(u7, @intCast(@min(127, b.exp - a.exp)));
                m_a >>= diff;
                res_exp = b.exp;
            }

            // Add/Sub significands
            var res_mant: u128 = 0;
            var res_sign: Sign = a.sign;
            if (a.sign == b.sign) {
                res_mant = m_a + m_b;
            } else if (m_a > m_b) {
                res_mant = m_a - m_b;
            } else if (m_b > m_a) {
                res_mant = m_b - m_a;
                res_sign = b.sign;
            } else {
                // x + (-x), including (+0) + (-0): the sum is +0.
                res_sign = 0;
            }
            // Undo the guard scaling through the exponent; `pack` renormalizes.
            return pack(.{ .sign = res_sign, .exp = res_exp - guard_bits, .mantissa = res_mant });
        }

        /// Returns `self * other`.
        ///
        /// NaN operands and `inf * 0` give NaN; otherwise an infinite operand
        /// gives infinity and a zero operand gives zero, each with the XOR of
        /// the operand signs. The result is truncated toward zero.
        pub fn mul(self: Self, other: Self) Self {
            const a = self.unpack();
            const b = other.unpack();
            const res_sign = a.sign ^ b.sign;

            // Handle Specials
            if (a.is_nan or b.is_nan) return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_nan = true });
            if ((a.is_inf and b.mantissa == 0 and b.exp == -exponent_bias) or
                (b.is_inf and a.mantissa == 0 and a.exp == -exponent_bias))
            {
                return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_nan = true });
            }
            if (a.is_inf or b.is_inf) return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_inf = true });
            if (a.mantissa == 0 or b.mantissa == 0) return Self.init(res_sign, 0, 0);

            // Multiply significands in an integer wide enough for the full
            // product (two (fraction_bits + 1)-bit factors); u128 is too small
            // for wide formats such as Float(1, 15, 112, ...).
            const Wide = std.meta.Int(.unsigned, 2 * (fraction_bits + 1));
            const product = @as(Wide, @intCast(a.mantissa)) * @as(Wide, @intCast(b.mantissa));

            // Only drop the bits above the significand width. Keeping the low
            // bits matters when an operand is subnormal and the product has
            // fewer significant bits than a normalized one.
            const top: i32 = @as(i32, @bitSizeOf(Wide) - 1) - @as(i32, @clz(product));
            const excess: i32 = @max(0, top - fraction_bits);
            const reduced: u128 = @intCast(product >> @as(std.math.Log2Int(Wide), @intCast(excess)));

            // product = ma * mb carries a scale of 2^(-2 * fraction_bits);
            // `pack` assumes a scale of 2^(-fraction_bits).
            const res_exp = a.exp + b.exp - fraction_bits + excess;
            return pack(.{ .sign = res_sign, .exp = res_exp, .mantissa = reduced });
        }

        // const Shift = std.meta.Int(.unsigned, @max(exponent_bits, fraction_bits));
        //
        // pub fn shiftRight(self: Self, shift: Shift) Self {
        //     if (shift == 0) return self;
        //     // return Self.init(self.sign, self.exponent -| shift, @shlExact(self.significand, shift));
        //     return Self.init(self.sign, @truncate(self.biased_exponent -| shift), self.fraction);
        // }

        /// Writes the value as three space-separated, zero-padded binary
        /// fields: sign, biased exponent, fraction.
        pub fn format(self: Self, writer: anytype) !void {
            const s_fmt = "{b:0>" ++ comptimePrint("{d}", .{sign_bits}) ++ "}";
            const e_fmt = "{b:0>" ++ comptimePrint("{d}", .{exponent_bits}) ++ "}";
            const m_fmt = "{b:0>" ++ comptimePrint("{d}", .{fraction_bits}) ++ "}";
            try writer.print(
                s_fmt ++ " " ++ e_fmt ++ " " ++ m_fmt,
                .{ self.sign, self.biased_exponent, self.fraction },
            );
        }
    };
}
// Concrete formats used by `main` and the tests below. The arguments are
// (sign bits, exponent bits, fraction bits, subnormals enabled).
// 4-bit minifloat (1.2.1).
| const F4 = Float(1, 2, 1, true); | |
// 8-bit minifloat (1.4.3).
| const F8 = Float(1, 4, 3, true); | |
// Same field widths that `Float.cast` uses for the builtin f16.
| const F16 = Float(1, 5, 10, true); | |
// 16-bit format with an 8-bit exponent and a 7-bit fraction.
| const BF16 = Float(1, 8, 7, true); | |
// Same field widths that `Float.cast` uses for the builtin f32.
| const F32 = Float(1, 8, 23, true); | |
// Same field widths that `Float.cast` uses for the builtin f64.
| const F64 = Float(1, 11, 52, true); | |
// Same field widths that `Float.cast` uses for the builtin f128.
| const F128 = Float(1, 15, 112, true); | |
/// Prints comparison tables for a handful of conversions so they can be checked
/// by eye: F4 -> F4, F4 -> F8, F4 -> native float, F8 -> native float and
/// F32 -> F16 (next to the compiler's own f32 -> f16 `@floatCast`).
///
/// Every table is driven by a list of rows so that the label, the input bits
/// and the expected value of a row cannot drift apart.
pub fn main() void {
    @setEvalBranchQuota(50000);
    {
        // Scratch experiment with very wide integers and `@clz`.
        const a: u3 = 1;
        const b: u10 = @as(u6969, @intCast(a)) << (@as(u6969, @bitSizeOf(@TypeOf(a))) + @clz(a));
        print(.{ "a: ", a, "\n" });
        print(.{ "b: ", b, "\n" });
    }
    {
        print(.{"F4 to F4\n"});
        print(.{"expected | casted\n"});
        print(.{"F4 | decimal | F4\n"});
        // Row: label, biased exponent, fraction.
        inline for (.{
            .{ "0 00 0 | 0.0 | ", 0b00, 0 },
            .{ "0 00 1 | 0.5 | ", 0b00, 1 },
            .{ "0 01 0 | 1.0 | ", 0b01, 0 },
            .{ "0 01 1 | 1.5 | ", 0b01, 1 },
            .{ "0 10 0 | 2.0 | ", 0b10, 0 },
            .{ "0 10 1 | 3.0 | ", 0b10, 1 },
        }) |row| {
            print(.{ row[0], p("f", F4.init(0, row[1], row[2]).cast(F4)), "\n" });
        }
    }
    {
        print(.{"-" ** 80 ++ "\n"});
        print(.{"F4 to F8\n"});
        print(.{"given | expected | casted\n"});
        print(.{"F4 | decimal | F8 | F8\n"});
        // Row: label (including the expected F8 bits), biased exponent, fraction.
        inline for (.{
            .{ "0 00 0 | 0.0 | 0 0000 000 | ", 0b00, 0 },
            .{ "0 00 1 | 0.5 | 0 0110 000 | ", 0b00, 1 },
            .{ "0 01 0 | 1.0 | 0 0111 000 | ", 0b01, 0 },
            .{ "0 01 1 | 1.5 | 0 0111 100 | ", 0b01, 1 },
            .{ "0 10 0 | 2.0 | 0 1000 000 | ", 0b10, 0 },
            .{ "0 10 1 | 3.0 | 0 1000 100 | ", 0b10, 1 },
        }) |row| {
            print(.{ row[0], p("f", F4.init(0, row[1], row[2]).cast(F8)), "\n" });
        }
    }
    inline for (.{16}) |bits| {
        const CustomFloat = switch (bits) {
            16 => F16,
            32 => F32,
            64 => F64,
            128 => F128,
            else => unreachable,
        };
        const MetaFloat = std.meta.Float(bits);
        const MetaUnsigned = std.meta.Int(.unsigned, bits);
        print(.{"-" ** 80 ++ "\n"});
        // This table converts F4 inputs (the title used to say "F8").
        print(.{ "F4 to f", bits, "\n" });
        print(.{"given | expected | casted\n"});
        print(.{ "F4 | decimal | f", bits, " binary | F", bits, " | f", bits, " decimal | ok\n" });
        print(.{" seeeeeffffffffff\n"});
        // Row: label, sign, biased exponent, fraction, expected native value.
        inline for (.{
            .{ "0 00 0 | 0.0 | ", 0, 0b00, 0, @as(MetaFloat, 0.0) },
            .{ "0 00 1 | 0.5 | ", 0, 0b00, 1, @as(MetaFloat, 0.5) },
            .{ "0 01 0 | 1.0 | ", 0, 0b01, 0, @as(MetaFloat, 1.0) },
            .{ "0 01 1 | 1.5 | ", 0, 0b01, 1, @as(MetaFloat, 1.5) },
            .{ "0 10 0 | 2.0 | ", 0, 0b10, 0, @as(MetaFloat, 2.0) },
            .{ "0 10 1 | 3.0 | ", 0, 0b10, 1, @as(MetaFloat, 3.0) },
            .{ "0 11 0 | inf | ", 0, 0b11, 0, std.math.inf(MetaFloat) },
            .{ "0 11 1 | nan | ", 0, 0b11, 1, std.math.nan(MetaFloat) },
            .{ "1 00 0 | -0.0 | ", 1, 0b00, 0, @as(MetaFloat, -0.0) },
            .{ "1 00 1 | -0.5 | ", 1, 0b00, 1, @as(MetaFloat, -0.5) },
            .{ "1 10 1 | -3.0 | ", 1, 0b10, 1, @as(MetaFloat, -3.0) },
            .{ "1 11 0 | -inf | ", 1, 0b11, 0, -std.math.inf(MetaFloat) },
            .{ "1 11 1 | nan | ", 1, 0b11, 1, -std.math.nan(MetaFloat) },
        }) |row| {
            const value = F4.init(row[1], row[2], row[3]);
            const expected: MetaFloat = row[4];
            const actual = value.cast(MetaFloat);
            // NaN never compares equal, so NaN rows only check for NaN-ness.
            const ok = if (std.math.isNan(expected)) std.math.isNan(actual) else actual == expected;
            print(.{ row[0], p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(expected))), " | ", p("f", value.cast(CustomFloat)), " | ", p("d:>11.1", actual), " | ", ok, "\n" });
        }
    }
    // https://en.wikipedia.org/wiki/Minifloat#Table_of_values
    // inline for (.{ 16, 32, 64, 128 }) |bits| {
    inline for (.{16}) |bits| {
        const CustomFloat = switch (bits) {
            16 => F16,
            32 => F32,
            64 => F64,
            128 => F128,
            else => unreachable,
        };
        const MetaFloat = std.meta.Float(bits);
        const MetaUnsigned = std.meta.Int(.unsigned, bits);
        print(.{"-" ** 80 ++ "\n"});
        print(.{ "F8 to f", bits, "\n" });
        print(.{"given | expected | casted\n"});
        print(.{ "F8 | decimal | f", bits, " binary | F", bits, " | f", bits, " decimal | ok\n" });
        print(.{" seeeeeffffffffff\n"});
        // Row: label, biased exponent, fraction, expected native value.
        inline for (.{
            .{ "0 0000 000 | 0.0 | ", 0b0000, 0b000, @as(MetaFloat, 0.0) },
            .{ "0 0000 001 | 0.001953125 | ", 0b0000, 0b001, @as(MetaFloat, 0.001953125) },
            .{ "0 0000 010 | 0.00390625 | ", 0b0000, 0b010, @as(MetaFloat, 0.00390625) },
            .{ "0 0000 011 | 0.005859375 | ", 0b0000, 0b011, @as(MetaFloat, 0.005859375) },
            .{ "0 0000 100 | 0.0078125 | ", 0b0000, 0b100, @as(MetaFloat, 0.0078125) },
            .{ "0 0000 111 | 0.013671875 | ", 0b0000, 0b111, @as(MetaFloat, 0.013671875) },
            .{ "0 0001 000 | 0.015625 | ", 0b0001, 0b000, @as(MetaFloat, 0.015625) },
            .{ "0 0001 001 | 0.017578125 | ", 0b0001, 0b001, @as(MetaFloat, 0.017578125) },
            .{ "0 0001 010 | 0.01953125 | ", 0b0001, 0b010, @as(MetaFloat, 0.01953125) },
            .{ "0 0001 100 | 0.0234375 | ", 0b0001, 0b100, @as(MetaFloat, 0.0234375) },
            .{ "0 0001 111 | 0.029296875 | ", 0b0001, 0b111, @as(MetaFloat, 0.029296875) },
            .{ "0 0010 000 | 0.03125 | ", 0b0010, 0b000, @as(MetaFloat, 0.03125) },
            .{ "0 0100 000 | 0.125 | ", 0b0100, 0b000, @as(MetaFloat, 0.125) },
            .{ "0 1000 000 | 2.0 | ", 0b1000, 0b000, @as(MetaFloat, 2.0) },
            .{ "0 1111 000 | inf | ", 0b1111, 0b000, std.math.inf(MetaFloat) },
        }) |row| {
            const value = F8.init(0, row[1], row[2]);
            const expected: MetaFloat = row[3];
            const actual = value.cast(MetaFloat);
            print(.{ row[0], p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(expected))), " | ", p("f", value.cast(CustomFloat)), " | ", p("d:11.9", actual), " | ", actual == expected, "\n" });
        }
    }
    // inline for (.{ 16, 32, 64, 128 }) |bits| {
    {
        print(.{"-" ** 80 ++ "\n"});
        print(.{"F32 to f16\n"});
        print(.{"given | expected | casted\n"});
        print(.{"F32 | f16 binary | F16 | ok\n"});
        print(.{"seeeeeeeefffffffffffffffffffffff seeeeeffffffffff s eeeee ffffffffff\n"});
        // Each entry is a raw binary32 pattern (sign bit clear in all of them).
        // The exponent and fraction handed to `F32.init` are sliced out of the
        // same pattern that is printed and fed to `@floatCast`.
        inline for ([_]u32{
            // seeeeeeeefffffffffffffffffffffff
            0b00000000000000000000000000000000,
            0b00000000000000000000000000000001,
            0b00000000000010000000000000000000,
            0b00000000000011100000000000000000,
            0b00000000000100000000000000000000,
            0b00000000001000000000000000000000,
            0b00000000010000000000000000000000,
            0b00000000100000000000000000000000,
            0b00000001000000000000000000000000,
            0b00000010000000000000000000000000,
            0b00000100000000000000000000000000,
            0b00001000000000000000000000000000,
            0b00111000000000000000000000000000,
        }) |pattern| {
            const reference: f16 = @floatCast(@as(f32, @bitCast(pattern)));
            const custom = F32.init(0, @truncate(pattern >> 23), @truncate(pattern));
            print(.{ p("b:0>32", pattern), " | ", p("b:0>16", @as(u16, @bitCast(reference))), " | ", p("f", custom.cast(F16)), "\n" });
        }
    }
}
// Casting a format to itself must reproduce every bit pattern: both signs,
// all exponents (including the inf/NaN exponent) and all fractions.
test "F4 to F4" {
    @setEvalBranchQuota(100000);
    // Values should produce the same value, if subnormal values are enabled
    print(.{"F4 (or 1.2.1-float, or 4-bit float) to F4\n"});
    // `0..2` so that the negative encodings are covered as well; `0..1` only
    // ever visits sign 0.
    inline for (0..2) |sign| {
        inline for (0..1 << 2) |exponent| {
            inline for (0..1 << 1) |mantissa| {
                try std.testing.expectFmt(comptimePrint("{b} {b:0>2} {b}", .{ sign, exponent, mantissa }), "{f}", .{F4.init(sign, exponent, mantissa).cast(F4)});
            }
        }
    }
}
// Casting a format to itself must reproduce every bit pattern: both signs,
// all exponents (including the inf/NaN exponent) and all fractions.
test "F8 to F8" {
    @setEvalBranchQuota(100000);
    // Values should produce the same value, if subnormal values are enabled
    print(.{"F8 (or 1.4.3-float, or 8-bit float, or Minifloat) to F8\n"});
    // `0..2` so that the negative encodings are covered as well; `0..1` only
    // ever visits sign 0.
    inline for (0..2) |sign| {
        inline for (0..1 << 4) |exponent| {
            inline for (0..1 << 3) |mantissa| {
                try std.testing.expectFmt(std.fmt.comptimePrint("{b} {b:0>4} {b:0>3}", .{ sign, exponent, mantissa }), "{f}", .{F8.init(sign, exponent, mantissa).cast(F8)});
            }
        }
    }
}
// Widening F4 -> F8 is exact for every F4 value; checked for both signs.
test "F4 to F8" {
    @setEvalBranchQuota(100000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // `0..2` so that the negative encodings are covered as well; `0..1` only
    // ever visits sign 0.
    inline for (0..2) |sign| {
        const s = comptimePrint("{d}", .{sign});
        try std.testing.expectFmt(s ++ " 0000 000", "{f}", .{F4.init(sign, 0b00, 0).cast(F8)}); // 0.0
        try std.testing.expectFmt(s ++ " 0110 000", "{f}", .{F4.init(sign, 0b00, 1).cast(F8)}); // 0.5
        try std.testing.expectFmt(s ++ " 0111 000", "{f}", .{F4.init(sign, 0b01, 0).cast(F8)}); // 1.0
        try std.testing.expectFmt(s ++ " 0111 100", "{f}", .{F4.init(sign, 0b01, 1).cast(F8)}); // 1.5
        try std.testing.expectFmt(s ++ " 1000 000", "{f}", .{F4.init(sign, 0b10, 0).cast(F8)}); // 2.0
        try std.testing.expectFmt(s ++ " 1000 100", "{f}", .{F4.init(sign, 0b10, 1).cast(F8)}); // 3.0
        try std.testing.expectFmt(s ++ " 1111 000", "{f}", .{F4.init(sign, 0b11, 0).cast(F8)}); // inf
        try std.testing.expectFmt(s ++ " 1111 100", "{f}", .{F4.init(sign, 0b11, 1).cast(F8)}); // nan
    }
}
// Narrowing F8 -> F4 for the F8 values that F4 can represent exactly; checked
// for both signs.
test "F8 to F4" {
    @setEvalBranchQuota(100000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // `0..2` so that the negative encodings are covered as well; `0..1` only
    // ever visits sign 0.
    inline for (0..2) |sign| {
        const s = comptimePrint("{d}", .{sign});
        try std.testing.expectFmt(s ++ " 00 0", "{f}", .{F8.init(sign, 0b0000, 0b000).cast(F4)}); // 0.0
        try std.testing.expectFmt(s ++ " 00 1", "{f}", .{F8.init(sign, 0b0110, 0b000).cast(F4)}); // 0.5 (normal in F8, subnormal in F4)
        try std.testing.expectFmt(s ++ " 01 0", "{f}", .{F8.init(sign, 0b0111, 0b000).cast(F4)}); // 1.0
        try std.testing.expectFmt(s ++ " 01 1", "{f}", .{F8.init(sign, 0b0111, 0b100).cast(F4)}); // 1.5
        try std.testing.expectFmt(s ++ " 10 0", "{f}", .{F8.init(sign, 0b1000, 0b000).cast(F4)}); // 2.0
        try std.testing.expectFmt(s ++ " 10 1", "{f}", .{F8.init(sign, 0b1000, 0b100).cast(F4)}); // 3.0
        try std.testing.expectFmt(s ++ " 11 0", "{f}", .{F8.init(sign, 0b1111, 0b000).cast(F4)}); // inf
        try std.testing.expectFmt(s ++ " 11 1", "{f}", .{F8.init(sign, 0b1111, 0b100).cast(F4)}); // nan
    }
}
/// Readability helper for the tests: forwards its two arguments to
/// `std.testing.expectEqual` in reverse order.
/// Call sites in this file pass the value under test first and the reference
/// value second, so the reference value lands in `expectEqual`'s first slot.
/// Returns whatever error `std.testing.expectEqual` reports on a mismatch.
inline fn expectEqualSwapped(expected: anytype, actual: anytype) !void {
    try std.testing.expectEqual(actual, expected);
}
// Widening cast from F4 (1.2.1-float) to each builtin float width, for both sign bits.
test "F4 to meta float" {
    @setEvalBranchQuota(100000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    inline for (.{ 16, 32, 64, 128 }) |bits| {
        print(.{ "F4 (or 1.2.1-float, or 4-bit float) to f", bits, " (meta float)\n" });
        const TestFloat = std.meta.Float(bits);
        // The range upper bound is exclusive: `0..2` yields sign = 0 and sign = 1,
        // so the negative branches of the expectations below are actually exercised.
        inline for (0..2) |sign| {
            try expectEqualSwapped(F4.init(sign, 0b00, 0).cast(TestFloat), (if (sign > 0) -0.0 else 0.0));
            try expectEqualSwapped(F4.init(sign, 0b00, 1).cast(TestFloat), (if (sign > 0) -0.5 else 0.5));
            try expectEqualSwapped(F4.init(sign, 0b01, 0).cast(TestFloat), (if (sign > 0) -1.0 else 1.0));
            try expectEqualSwapped(F4.init(sign, 0b01, 1).cast(TestFloat), (if (sign > 0) -1.5 else 1.5));
            try expectEqualSwapped(F4.init(sign, 0b10, 0).cast(TestFloat), (if (sign > 0) -2.0 else 2.0));
            try expectEqualSwapped(F4.init(sign, 0b10, 1).cast(TestFloat), (if (sign > 0) -3.0 else 3.0));
            try expectEqualSwapped(F4.init(sign, 0b11, 0).cast(TestFloat), (if (sign > 0) -std.math.inf(TestFloat) else std.math.inf(TestFloat)));
            // NaN never compares equal to itself, so it is checked with isNan instead of expectEqual.
            try std.testing.expect(std.math.isNan(F4.init(sign, 0b11, 1).cast(TestFloat)));
        }
    }
}
// Widening cast from positive F8 (1.4.3-float) values to each builtin float width.
test "F8 to meta float" {
    @setEvalBranchQuota(100000);
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // Each row is { biased exponent, fraction, expected value } for a positive F8.
    const finite_cases = .{
        .{ 0b0000, 0b000, 0.0 },
        .{ 0b0000, 0b001, 0.001953125 },
        .{ 0b0000, 0b010, 0.00390625 },
        .{ 0b0000, 0b100, 0.0078125 },
        .{ 0b0000, 0b111, 0.013671875 },
        .{ 0b0001, 0b000, 0.015625 },
        .{ 0b0001, 0b001, 0.017578125 },
        .{ 0b0001, 0b010, 0.01953125 },
        .{ 0b0001, 0b100, 0.0234375 },
        .{ 0b0001, 0b111, 0.029296875 },
        .{ 0b0010, 0b000, 0.03125 },
        .{ 0b0010, 0b001, 0.03515625 },
        .{ 0b0010, 0b010, 0.0390625 },
        .{ 0b0010, 0b100, 0.046875 },
        .{ 0b0010, 0b111, 0.05859375 },
        .{ 0b0100, 0b000, 0.125 },
        .{ 0b0100, 0b001, 0.140625 },
        .{ 0b0100, 0b010, 0.15625 },
        .{ 0b0100, 0b100, 0.1875 },
        .{ 0b0100, 0b111, 0.234375 },
        .{ 0b0111, 0b000, 1 },
        .{ 0b0111, 0b001, 1.125 },
        .{ 0b0111, 0b010, 1.25 },
        .{ 0b0111, 0b100, 1.5 },
        .{ 0b0111, 0b111, 1.875 },
        .{ 0b1000, 0b000, 2 },
        .{ 0b1000, 0b001, 2.25 },
        .{ 0b1000, 0b010, 2.5 },
        .{ 0b1000, 0b100, 3 },
        .{ 0b1000, 0b111, 3.75 },
        .{ 0b1110, 0b000, 128 },
        .{ 0b1110, 0b001, 144 },
        .{ 0b1110, 0b010, 160 },
        .{ 0b1110, 0b100, 192 },
        .{ 0b1110, 0b111, 240 },
    };
    // Fractions that, combined with the all-ones exponent, must cast to NaN.
    const nan_fractions = .{ 0b001, 0b010, 0b100, 0b111 };

    inline for (.{ 16, 32, 64, 128 }) |bits| {
        print(.{ "F8 (or 1.4.3-float, or 8-bit float, or Minifloat) to f", bits, " (meta float)\n" });
        const TestFloat = std.meta.Float(bits);

        inline for (finite_cases) |case| {
            try expectEqualSwapped(F8.init(0, case[0], case[1]).cast(TestFloat), case[2]);
        }

        // All-ones exponent: zero fraction is +inf, any other fraction is NaN.
        try std.testing.expect(std.math.isPositiveInf(F8.init(0, 0b1111, 0b000).cast(TestFloat)));
        inline for (nan_fractions) |fraction| {
            try std.testing.expect(std.math.isNan(F8.init(0, 0b1111, fraction).cast(TestFloat)));
        }
    }
}
// Checks `add` on F4 values whose exact sum is representable in F4.
test "F4 Addition" {
    // Operands, written as (sign, biased exponent, fraction).
    const zero = F4.init(0, 0, 0);
    const half = F4.init(0, 0b00, 1); // 0.5 (subnormal in F4)
    const one = F4.init(0, 0b01, 0); // 1.0
    const one_five = F4.init(0, 0b01, 1); // 1.5
    const two = F4.init(0, 0b10, 0); // 2.0
    const three = F4.init(0, 0b10, 1); // 3.0
    const neg_one = F4.init(1, 0b01, 0); // -1.0

    const Case = struct { lhs: F4, rhs: F4, sum: F4 };
    const cases = [_]Case{
        .{ .lhs = one, .rhs = one, .sum = two }, // 1.0 + 1.0 = 2.0
        .{ .lhs = one, .rhs = half, .sum = one_five }, // 1.0 + 0.5 = 1.5
        .{ .lhs = one_five, .rhs = one_five, .sum = three }, // 1.5 + 1.5 = 3.0
        .{ .lhs = one, .rhs = zero, .sum = one }, // x + 0 = x
        .{ .lhs = two, .rhs = neg_one, .sum = one }, // 2.0 + (-1.0) = 1.0
    };
    for (cases) |case| {
        try std.testing.expectEqual(case.sum, case.lhs.add(case.rhs));
    }
}
// Checks `mul` on F4 values, including a negative operand and a product that does not fit exactly.
test "F4 Multiplication" {
    // Operands, written as (sign, biased exponent, fraction).
    const one = F4.init(0, 0b01, 0); // 1.0
    const one_five = F4.init(0, 0b01, 1); // 1.5
    const two = F4.init(0, 0b10, 0); // 2.0
    const three = F4.init(0, 0b10, 1); // 3.0
    const neg_one = F4.init(1, 0b01, 0); // -1.0
    const neg_two = F4.init(1, 0b10, 0); // -2.0

    const Case = struct { lhs: F4, rhs: F4, product: F4 };
    const cases = [_]Case{
        .{ .lhs = one, .rhs = two, .product = two }, // 1.0 * 2.0 = 2.0
        .{ .lhs = one_five, .rhs = two, .product = three }, // 1.5 * 2.0 = 3.0
        .{ .lhs = two, .rhs = one_five, .product = three }, // 2.0 * 1.5 = 3.0 (Commutative)
        .{ .lhs = neg_one, .rhs = two, .product = neg_two }, // -1.0 * 2.0 = -2.0
        .{ .lhs = one_five, .rhs = one_five, .product = two }, // Truncation: 1.5 * 1.5 = 2.25 -> 2.0 in F4
    };
    for (cases) |case| {
        try std.testing.expectEqual(case.product, case.lhs.mul(case.rhs));
    }
}
// Checks how `add` and `mul` treat the F4 special encodings.
// In F4 the all-ones exponent (0b11) with a zero fraction is an infinity and with a
// non-zero fraction is NaN, so every NaN check below tests both fields.
test "Special Cases: Infinity and NaN" {
    const inf = F4.init(0, 0b11, 0);
    const neg_inf = F4.init(1, 0b11, 0);
    const nan = F4.init(0, 0b11, 1);
    const one = F4.init(0, 0b01, 0);
    const zero = F4.init(0, 0, 0);

    // Inf + x = Inf
    try std.testing.expectEqual(inf, inf.add(one));

    // Inf - Inf = NaN
    const res = inf.add(neg_inf);
    try std.testing.expect(res.biased_exponent == 0b11 and res.fraction != 0);

    // NaN propagation. The fraction must be checked as well: an all-ones exponent
    // alone would also accept an infinity.
    const nan_sum = one.add(nan);
    try std.testing.expect(nan_sum.biased_exponent == 0b11 and nan_sum.fraction != 0);
    const nan_product = one.mul(nan);
    try std.testing.expect(nan_product.biased_exponent == 0b11 and nan_product.fraction != 0);

    // 0 * Inf = NaN
    const zero_inf = zero.mul(inf);
    try std.testing.expect(zero_inf.biased_exponent == 0b11 and zero_inf.fraction != 0);

    // Inf * Inf = Inf
    try std.testing.expectEqual(inf, inf.mul(inf));

    // Inf * -1 = -Inf
    const neg_one = F4.init(1, 0b01, 0);
    try std.testing.expectEqual(neg_inf, inf.mul(neg_one));
}
// Checks that F4 results outside the representable range become infinity or zero.
test "F4 Overflow and Underflow" {
    // Operands, written as (sign, biased exponent, fraction).
    const zero = F4.init(0, 0, 0);
    const half = F4.init(0, 0b00, 1); // Smallest subnormal in F4 is 0.5
    const one = F4.init(0, 0b01, 0);
    const two = F4.init(0, 0b10, 0);
    const three = F4.init(0, 0b10, 1); // Max finite value in F4 is 3.0
    const inf = F4.init(0, 0b11, 0);

    // Overflow: both exact results exceed the F4 range.
    try std.testing.expectEqual(inf, three.add(one)); // 3.0 + 1.0 = 4.0
    try std.testing.expectEqual(inf, three.mul(two)); // 3.0 * 2.0 = 6.0

    // Underflow to zero (if subnormals can't represent it).
    try std.testing.expectEqual(zero, half.mul(half)); // 0.5 * 0.5 = 0.25
}
// Smoke test for `add` and `mul` on F8.
test "F8 Arithmetic (Minifloat)" {
    // F8: 1 sign, 4 exponent (bias 7), 3 fraction
    const f8_one = F8.init(0, 7, 0); // 1.0
    const f8_two = F8.init(0, 8, 0); // 2.0

    // 1.0 + 2.0 = 3.0 (Exponent 8, mantissa 0.5 -> 1.5 * 2^(8-7) = 3)
    const expected_three = F8.init(0, 8, 0b100);
    try std.testing.expectEqual(expected_three, f8_one.add(f8_two));

    // 2.0 * 2.0 = 4.0 (Exponent 9, mantissa 0 -> 1.0 * 2^(9-7) = 4)
    const expected_four = F8.init(0, 9, 0);
    try std.testing.expectEqual(expected_four, f8_two.mul(f8_two));

    // Report success only after every assertion above has passed; a failing `try`
    // returns early, so this line is not reached on failure.
    print(.{"Arithm ok"});
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment