Skip to content

Instantly share code, notes, and snippets.

@ivanstepanovftw
Last active March 6, 2026 18:33
Show Gist options
  • Select an option

  • Save ivanstepanovftw/f8223bd051bcd128351571385aa7772c to your computer and use it in GitHub Desktop.

Select an option

Save ivanstepanovftw/f8223bd051bcd128351571385aa7772c to your computer and use it in GitHub Desktop.
IEEE-754 binary(any)
// TODO: normalfloat https://arxiv.org/pdf/2305.14314
// TODO: NVFP4
const std = @import("std");
const math = std.math;
const pretty = @import("pretty.zig");
const print = pretty.print;
const p = pretty.p;
const comptimePrint = std.fmt.comptimePrint;
// Raises the compile-time backwards-branch limit to 100000 for this file-scope comptime block.
// NOTE(review): presumably the quota applies only to the comptime evaluation it is issued in,
// so this block likely does not affect `main` or the tests below (they set their own) — confirm.
comptime {
@setEvalBranchQuota(100000);
}
/// Returns a packed-struct type describing an IEEE 754-style binary floating point format
/// with `sign_bits` sign bits, `exponent_bits` biased-exponent bits and `fraction_bits`
/// fraction bits. `subnormals` selects whether results too small for a normal encoding are
/// stored as subnormals (`true`) or flushed to signed zero (`false`).
///
/// Encoding rules follow IEEE 754: an all-ones exponent is infinity (fraction == 0) or NaN
/// (fraction != 0); a zero exponent is zero or a subnormal; anything else is a normal number
/// with an implicit leading 1.
///
/// Notes:
/// * Fields are declared sign, exponent, fraction. Packed struct fields are placed starting
///   at the least significant bit, so the in-memory layout is NOT the IEEE interchange
///   layout; use `cast` to obtain native floats instead of `@bitCast`.
/// * All conversions and arithmetic truncate (round toward zero); no rounding mode is applied.
/// * The working significand is a `u128`, so formats are limited to roughly 126 fraction bits.
///
/// https://en.wikipedia.org/wiki/IEEE_754
/// https://github.com/ziglang/zig/blob/f29bdd6746691d0a547140e435056a000419480f/lib/std/math/float.zig#L13
/// https://github.com/ziglang/zig/blob/f29bdd6746691d0a547140e435056a000419480f/lib/std/math.zig#L1725
pub fn Float(
    comptime sign_bits: comptime_int,
    comptime exponent_bits: comptime_int,
    comptime fraction_bits: comptime_int,
    comptime subnormals: bool,
) type {
    return packed struct {
        const Self = @This();

        pub const Sign = std.meta.Int(.unsigned, sign_bits);
        pub const BiasedExponent = std.meta.Int(.unsigned, exponent_bits);
        pub const Fraction = std.meta.Int(.unsigned, fraction_bits);
        pub const exponent_bias = (1 << (exponent_bits - 1)) - 1; // https://en.wikipedia.org/wiki/Exponent_bias
        /// Whether this format stores subnormal values; read by `cast` when this type is the target.
        pub const has_subnormals = subnormals;
        /// All-ones biased exponent, reserved for infinity and NaN.
        const max_biased_exponent = (1 << exponent_bits) - 1;

        sign: Sign,
        biased_exponent: BiasedExponent,
        fraction: Fraction,

        /// Builds a value directly from its three raw bit fields.
        pub fn init(sign: Sign, biased_exponent: BiasedExponent, fraction: Fraction) Self {
            return Self{ .sign = sign, .biased_exponent = biased_exponent, .fraction = fraction };
        }

        /// Converts `self` to `Other`, which is either a native float type (f16/f32/f64/f128)
        /// or another type produced by `Float`.
        ///
        /// Infinity, NaN and signed zero are preserved. A NaN payload is aligned at its most
        /// significant bit and forced non-zero so the result stays a NaN. Finite values that
        /// exceed the target range become infinity; values below the target's normal range
        /// become subnormals, or signed zero when the *target* format has no subnormals.
        /// Bits that do not fit in the target fraction are truncated.
        pub fn cast(self: Self, comptime Other: type) Other {
            switch (@typeInfo(Other)) {
                .float => {
                    const bits = @bitSizeOf(Other);
                    const OtherStruct = switch (bits) {
                        16 => Float(1, 5, 10, true),
                        32 => Float(1, 8, 23, true),
                        64 => Float(1, 11, 52, true),
                        // 80 => Float(1, 15, 64, true), // TODO: f80 is not supported yet because of implicit leading bit in the fraction
                        128 => Float(1, 15, 112, true),
                        else => @compileError("cast: unsupported native float type " ++ @typeName(Other)),
                    };
                    const OtherBinary = std.meta.Int(.unsigned, bits);
                    // Convert to the matching custom layout first, then assemble the IEEE
                    // interchange bit pattern by hand (sign in the top bit, fraction lowest).
                    const other_struct = self.cast(OtherStruct);
                    const exponent_shift = bits - 1 - @bitSizeOf(OtherStruct.BiasedExponent);
                    const fraction_shift = exponent_shift - @bitSizeOf(OtherStruct.Fraction);
                    var other_binary: OtherBinary = 0;
                    other_binary |= @as(OtherBinary, other_struct.sign) << (bits - 1);
                    other_binary |= @as(OtherBinary, other_struct.biased_exponent) << exponent_shift;
                    other_binary |= @as(OtherBinary, other_struct.fraction) << fraction_shift;
                    return @bitCast(other_binary);
                },
                .@"struct" => { // Assume the struct is a custom float
                    const sign: Other.Sign = @as(Other.Sign, self.sign);
                    const other_inf = (1 << @bitSizeOf(Other.BiasedExponent)) - 1;
                    // Subnormal support is a property of the TARGET format. Fall back to our
                    // own flag only for duck-typed structs that do not expose the declaration.
                    const target_subnormals = if (@hasDecl(Other, "has_subnormals")) Other.has_subnormals else subnormals;

                    // Infinity and NaN cases
                    if (self.biased_exponent == max_biased_exponent) {
                        if (self.fraction == 0) {
                            return Other.init(sign, @intCast(other_inf), 0); // Inf
                        }
                        // NaN: keep the payload aligned at its most significant bit.
                        const s_bits = @bitSizeOf(Self.Fraction);
                        const o_bits = @bitSizeOf(Other.Fraction);
                        var o_frac: Other.Fraction = 0;
                        if (s_bits > o_bits) {
                            o_frac = @truncate(self.fraction >> @intCast(s_bits - o_bits));
                        } else {
                            o_frac = @as(Other.Fraction, self.fraction) << @intCast(o_bits - s_bits);
                        }
                        if (o_frac == 0) o_frac = 1; // A zero fraction would turn the NaN into infinity
                        return Other.init(sign, @intCast(other_inf), o_frac);
                    }

                    // Zero case
                    if (self.biased_exponent == 0 and self.fraction == 0) {
                        return Other.init(sign, 0, 0);
                    }

                    // ------------------------------------------------------------------
                    // 1. Extract true exponent and integer significand (value = M * 2^E)
                    // ------------------------------------------------------------------
                    var M: u128 = self.fraction;
                    var E: i32 = 0;
                    const S_bias = @as(i32, @intCast(Self.exponent_bias));
                    const S_Fs = @as(i32, @intCast(@bitSizeOf(Self.Fraction)));
                    if (self.biased_exponent == 0) {
                        // Source is Subnormal (no implicit 1)
                        E = 1 - S_bias - S_Fs;
                    } else {
                        // Source is Normal (add implicit 1)
                        M |= (@as(u128, 1) << @intCast(S_Fs));
                        E = @as(i32, @intCast(self.biased_exponent)) - S_bias - S_Fs;
                    }
                    const O_bias = @as(i32, @intCast(Other.exponent_bias));
                    const O_Fs = @as(i32, @intCast(@bitSizeOf(Other.Fraction)));

                    // ------------------------------------------------------------------
                    // 2. Normalize M so its MSB is exactly positioned at O_Fs
                    // ------------------------------------------------------------------
                    // M is non-zero here (zero was handled above), so @clz(M) < 128.
                    const msb_idx = 127 - @as(i32, @intCast(@clz(M)));
                    var target_M: u128 = M;
                    var target_E: i32 = E;
                    if (msb_idx < O_Fs) {
                        const shl = @as(u7, @intCast(O_Fs - msb_idx));
                        target_M = M << shl;
                        target_E -= @as(i32, @intCast(shl));
                    } else if (msb_idx > O_Fs) {
                        const shr = @as(u7, @intCast(msb_idx - O_Fs));
                        target_M = M >> shr; // Low bits are dropped (truncation)
                        target_E += @as(i32, @intCast(shr));
                    }

                    // ------------------------------------------------------------------
                    // 3. Pack into the Target format
                    // ------------------------------------------------------------------
                    // Calculate target biased exponent assuming it's a normal number
                    const final_O_exp = target_E + O_bias + O_Fs;
                    if (final_O_exp >= other_inf) {
                        // Overflow to Infinity
                        return Other.init(sign, @intCast(other_inf), 0);
                    } else if (final_O_exp > 0) {
                        // Target is Normal: drop the implicit leading 1
                        const mask = (@as(u128, 1) << @intCast(O_Fs)) - 1;
                        const o_frac = @as(Other.Fraction, @truncate(target_M & mask));
                        return Other.init(sign, @intCast(final_O_exp), o_frac);
                    } else {
                        // Target is Subnormal or Underflow
                        if (!target_subnormals) {
                            // Subnormals are flushed to zero
                            return Other.init(sign, 0, 0);
                        }
                        // If exponent <= 0, we right-shift the mantissa
                        const shift_right = 1 - final_O_exp;
                        if (shift_right >= 128) {
                            return Other.init(sign, 0, 0); // Underflow to absolute zero
                        }
                        const shr = @as(u7, @intCast(shift_right));
                        const o_frac = @as(Other.Fraction, @truncate(target_M >> shr));
                        return Other.init(sign, 0, o_frac);
                    }
                },
                else => @compileError("cast: unsupported target type " ++ @typeName(Other)),
            }
        }

        /// Internal representation for arithmetic.
        /// For finite values: value = mantissa * 2^(exp - fraction_bits).
        const Unpacked = struct {
            sign: Sign,
            exp: i32,
            mantissa: u128, // Includes implicit bit
            is_nan: bool = false,
            is_inf: bool = false,
            const implicit_bit = @as(u128, 1) << fraction_bits;
        };

        /// Splits `self` into sign, unbiased exponent and significand, flagging Inf/NaN.
        fn unpack(self: Self) Unpacked {
            if (self.biased_exponent == max_biased_exponent) {
                return .{
                    .sign = self.sign,
                    .exp = 0,
                    .mantissa = self.fraction,
                    .is_nan = self.fraction != 0,
                    .is_inf = self.fraction == 0,
                };
            }
            if (self.biased_exponent == 0) {
                if (self.fraction == 0) return .{ .sign = self.sign, .exp = -exponent_bias, .mantissa = 0 };
                // Subnormal: no implicit bit, fixed minimum exponent
                return .{
                    .sign = self.sign,
                    .exp = 1 - exponent_bias,
                    .mantissa = self.fraction,
                };
            }
            // Normal
            return .{
                .sign = self.sign,
                .exp = @as(i32, @intCast(self.biased_exponent)) - exponent_bias,
                .mantissa = Unpacked.implicit_bit | self.fraction,
            };
        }

        /// Re-encodes an `Unpacked` value: normalizes the significand, then emits a normal,
        /// subnormal, zero or infinity depending on the resulting exponent. Truncates.
        fn pack(unpacked: Unpacked) Self {
            if (unpacked.is_nan) return Self.init(unpacked.sign, max_biased_exponent, 1);
            if (unpacked.is_inf) return Self.init(unpacked.sign, max_biased_exponent, 0);
            if (unpacked.mantissa == 0) return Self.init(unpacked.sign, 0, 0);

            var m = unpacked.mantissa;
            var e = unpacked.exp;

            // 1. Normalize: Ensure MSB is at the implicit bit position
            const msb = 127 - @clz(m);
            const target_bit = fraction_bits;
            if (msb > target_bit) {
                const shift = @as(u7, @intCast(msb - target_bit));
                m >>= shift;
                e += shift;
            } else if (msb < target_bit) {
                const shift = @as(u7, @intCast(target_bit - msb));
                m <<= shift;
                e -= shift;
            }

            // 2. Handle Exponent range
            const biased_e = e + exponent_bias;
            if (biased_e >= max_biased_exponent) {
                return Self.init(unpacked.sign, max_biased_exponent, 0); // Overflow to Inf
            }
            if (biased_e <= 0) {
                if (!subnormals) return Self.init(unpacked.sign, 0, 0);
                // Subnormal handling. Compare in i32 BEFORE narrowing: for wide exponent
                // ranges the required shift can exceed what a u7 can hold.
                const shift = 1 - biased_e;
                if (shift > fraction_bits + 1) return Self.init(unpacked.sign, 0, 0);
                m >>= @as(u7, @intCast(shift));
                return Self.init(unpacked.sign, 0, @truncate(m));
            }
            return Self.init(unpacked.sign, @intCast(biased_e), @truncate(m ^ Unpacked.implicit_bit));
        }

        /// Returns `self + other`.
        /// NaN operands and (+inf) + (-inf) give NaN; an infinite operand otherwise wins.
        /// The operand with the smaller exponent is shifted right before the significands
        /// are combined, so its low bits are truncated. Exact cancellation yields +0.
        pub fn add(self: Self, other: Self) Self {
            const a = self.unpack();
            const b = other.unpack();

            // Handle Specials
            if (a.is_nan or b.is_nan) return pack(.{ .sign = 0, .exp = 0, .mantissa = 0, .is_nan = true });
            if (a.is_inf and b.is_inf and a.sign != b.sign) return pack(.{ .sign = 0, .exp = 0, .mantissa = 0, .is_nan = true });
            if (a.is_inf) return self;
            if (b.is_inf) return other;

            // Align exponents
            var m_a = a.mantissa;
            var m_b = b.mantissa;
            var res_exp = a.exp;
            if (a.exp > b.exp) {
                const diff = @as(u7, @intCast(@min(127, a.exp - b.exp)));
                m_b >>= diff;
            } else if (b.exp > a.exp) {
                const diff = @as(u7, @intCast(@min(127, b.exp - a.exp)));
                m_a >>= diff;
                res_exp = b.exp;
            }

            // Add/Sub significands
            var res_mant: u128 = 0;
            var res_sign: Sign = a.sign;
            if (a.sign == b.sign) {
                res_mant = m_a + m_b;
            } else if (m_a > m_b) {
                res_mant = m_a - m_b;
            } else if (m_b > m_a) {
                res_mant = m_b - m_a;
                res_sign = b.sign;
            } else {
                // Opposite signs, equal magnitude: the exact sum is +0 under round-to-nearest.
                res_sign = 0;
            }
            return pack(.{ .sign = res_sign, .exp = res_exp, .mantissa = res_mant });
        }

        /// Returns `self * other`.
        /// NaN operands and inf * 0 give NaN; otherwise an infinite operand gives a signed
        /// infinity and a zero operand a signed zero. The product is truncated.
        pub fn mul(self: Self, other: Self) Self {
            const a = self.unpack();
            const b = other.unpack();
            const res_sign = a.sign ^ b.sign;

            // Handle Specials
            if (a.is_nan or b.is_nan) return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_nan = true });
            // After the NaN check, a non-infinite operand with a zero significand is a zero.
            const a_is_zero = !a.is_inf and a.mantissa == 0;
            const b_is_zero = !b.is_inf and b.mantissa == 0;
            if ((a.is_inf and b_is_zero) or (b.is_inf and a_is_zero)) {
                return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_nan = true });
            }
            if (a.is_inf or b.is_inf) return pack(.{ .sign = res_sign, .exp = 0, .mantissa = 0, .is_inf = true });
            if (a_is_zero or b_is_zero) return Self.init(res_sign, 0, 0);

            // Multiply significands
            // Result is in range [1, 4) if both are normal.
            // The full product needs 2 * (fraction_bits + 1) bits, which does not fit in a
            // u128 for wide formats, so multiply in a type that is guaranteed to hold it.
            const Wide = std.meta.Int(.unsigned, @max(128, 2 * (fraction_bits + 1)));
            const product = @as(Wide, a.mantissa) * @as(Wide, b.mantissa);
            const res_mant: u128 = @intCast(product >> fraction_bits);
            const res_exp = a.exp + b.exp;
            return pack(.{ .sign = res_sign, .exp = res_exp, .mantissa = res_mant });
        }

        // const Shift = std.meta.Int(.unsigned, @max(exponent_bits, fraction_bits));
        //
        // pub fn shiftRight(self: Self, shift: Shift) Self {
        // if (shift == 0) return self;
        // // return Self.init(self.sign, self.exponent -| shift, @shlExact(self.significand, shift));
        // return Self.init(self.sign, @truncate(self.biased_exponent -| shift), self.fraction);
        // }

        /// Writes the three fields in binary, zero-padded to their widths and separated by
        /// single spaces: "<sign> <biased exponent> <fraction>".
        pub fn format(self: Self, writer: anytype) !void {
            const s_fmt = "{b:0>" ++ std.fmt.comptimePrint("{d}", .{sign_bits}) ++ "}";
            const e_fmt = "{b:0>" ++ std.fmt.comptimePrint("{d}", .{exponent_bits}) ++ "}";
            const m_fmt = "{b:0>" ++ std.fmt.comptimePrint("{d}", .{fraction_bits}) ++ "}";
            try writer.print(
                s_fmt ++ " " ++ e_fmt ++ " " ++ m_fmt,
                .{ self.sign, self.biased_exponent, self.fraction },
            );
        }
    };
}
// Concrete formats used by `main` and the tests. Arguments are
// (sign bits, exponent bits, fraction bits, subnormals enabled).
/// 4-bit minifloat: 1 sign, 2 exponent, 1 fraction bit.
const F4 = Float(1, 2, 1, true);
/// 8-bit minifloat: 1 sign, 4 exponent, 3 fraction bits.
const F8 = Float(1, 4, 3, true);
/// Same field widths as IEEE 754 binary16.
const F16 = Float(1, 5, 10, true);
/// bfloat16-style layout: 8 exponent bits, 7 fraction bits.
const BF16 = Float(1, 8, 7, true);
/// Same field widths as IEEE 754 binary32.
const F32 = Float(1, 8, 23, true);
/// Same field widths as IEEE 754 binary64.
const F64 = Float(1, 11, 52, true);
/// Same field widths as IEEE 754 binary128.
const F128 = Float(1, 15, 112, true);
/// Demo driver: prints comparison tables of custom-float casts next to the expected
/// encodings/values. Nothing is asserted here; the `test` blocks do the checking.
pub fn main() void {
    @setEvalBranchQuota(50000);
    {
        // Scratch experiment with wide integer shifts; unrelated to the float tables below.
        const a: u3 = 1;
        const b: u10 = @as(u6969, @intCast(a)) << (@as(u6969, @bitSizeOf(@TypeOf(a))) + @clz(a));
        print(.{ "a: ", a, "\n" });
        print(.{ "b: ", b, "\n" });
    }
    {
        // Identity cast: every F4 value should come back unchanged.
        print(.{"F4 to F4\n"});
        print(.{"expected | casted\n"});
        print(.{"F4 | decimal | F4\n"});
        print(.{ "0 00 0 | 0.0 | ", p("f", F4.init(0, 0b00, 0).cast(F4)), "\n" });
        print(.{ "0 00 1 | 0.5 | ", p("f", F4.init(0, 0b00, 1).cast(F4)), "\n" });
        print(.{ "0 01 0 | 1.0 | ", p("f", F4.init(0, 0b01, 0).cast(F4)), "\n" });
        print(.{ "0 01 1 | 1.5 | ", p("f", F4.init(0, 0b01, 1).cast(F4)), "\n" });
        print(.{ "0 10 0 | 2.0 | ", p("f", F4.init(0, 0b10, 0).cast(F4)), "\n" });
        print(.{ "0 10 1 | 3.0 | ", p("f", F4.init(0, 0b10, 1).cast(F4)), "\n" });
    }
    {
        // Widening cast between two custom formats.
        print(.{"-" ** 80 ++ "\n"});
        print(.{"F4 to F8\n"});
        print(.{"given | expected | casted\n"});
        print(.{"F4 | decimal | F8 | F8\n"});
        print(.{ "0 00 0 | 0.0 | 0 0000 000 | ", p("f", F4.init(0, 0b00, 0).cast(F8)), "\n" });
        print(.{ "0 00 1 | 0.5 | 0 0110 000 | ", p("f", F4.init(0, 0b00, 1).cast(F8)), "\n" });
        print(.{ "0 01 0 | 1.0 | 0 0111 000 | ", p("f", F4.init(0, 0b01, 0).cast(F8)), "\n" });
        print(.{ "0 01 1 | 1.5 | 0 0111 100 | ", p("f", F4.init(0, 0b01, 1).cast(F8)), "\n" });
        print(.{ "0 10 0 | 2.0 | 0 1000 000 | ", p("f", F4.init(0, 0b10, 0).cast(F8)), "\n" });
        print(.{ "0 10 1 | 3.0 | 0 1000 100 | ", p("f", F4.init(0, 0b10, 1).cast(F8)), "\n" });
    }
    // F4 values cast to the custom layout and to the native float of the same width.
    inline for (.{16}) |bits| {
        const CustomFloat = switch (bits) {
            16 => F16,
            32 => F32,
            64 => F64,
            128 => F128,
            else => unreachable,
        };
        const MetaFloat = std.meta.Float(bits);
        const MetaUnsigned = std.meta.Int(.unsigned, bits);
        print(.{"-" ** 80 ++ "\n"});
        // This table casts F4 values (the title previously said "F8").
        print(.{ "F4 to f", bits, "\n" });
        print(.{"given | expected | casted\n"});
        print(.{ "F4 | decimal | f", bits, " binary | F", bits, " | f", bits, " decimal | ok\n" });
        print(.{" seeeeeffffffffff\n"});
        print(.{ "0 00 0 | 0.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.0)))), " | ", p("f", F4.init(0, 0b00, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b00, 0).cast(MetaFloat)), " | ", F4.init(0, 0b00, 0).cast(MetaFloat) == 0.0, "\n" });
        print(.{ "0 00 1 | 0.5 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.5)))), " | ", p("f", F4.init(0, 0b00, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b00, 1).cast(MetaFloat)), " | ", F4.init(0, 0b00, 1).cast(MetaFloat) == 0.5, "\n" });
        print(.{ "0 01 0 | 1.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 1.0)))), " | ", p("f", F4.init(0, 0b01, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b01, 0).cast(MetaFloat)), " | ", F4.init(0, 0b01, 0).cast(MetaFloat) == 1.0, "\n" });
        print(.{ "0 01 1 | 1.5 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 1.5)))), " | ", p("f", F4.init(0, 0b01, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b01, 1).cast(MetaFloat)), " | ", F4.init(0, 0b01, 1).cast(MetaFloat) == 1.5, "\n" });
        print(.{ "0 10 0 | 2.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 2.0)))), " | ", p("f", F4.init(0, 0b10, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b10, 0).cast(MetaFloat)), " | ", F4.init(0, 0b10, 0).cast(MetaFloat) == 2.0, "\n" });
        print(.{ "0 10 1 | 3.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 3.0)))), " | ", p("f", F4.init(0, 0b10, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b10, 1).cast(MetaFloat)), " | ", F4.init(0, 0b10, 1).cast(MetaFloat) == 3.0, "\n" });
        print(.{ "0 11 0 | inf | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(std.math.inf(MetaFloat)))), " | ", p("f", F4.init(0, 0b11, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b11, 0).cast(MetaFloat)), " | ", std.math.isPositiveInf(F4.init(0, 0b11, 0).cast(MetaFloat)), "\n" });
        print(.{ "0 11 1 | nan | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(std.math.nan(MetaFloat)))), " | ", p("f", F4.init(0, 0b11, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(0, 0b11, 1).cast(MetaFloat)), " | ", std.math.isNan(F4.init(0, 0b11, 1).cast(MetaFloat)), "\n" });
        print(.{ "1 00 0 | -0.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, -0.0)))), " | ", p("f", F4.init(1, 0b00, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(1, 0b00, 0).cast(MetaFloat)), " | ", F4.init(1, 0b00, 0).cast(MetaFloat) == -0.0, "\n" });
        print(.{ "1 00 1 | -0.5 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, -0.5)))), " | ", p("f", F4.init(1, 0b00, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(1, 0b00, 1).cast(MetaFloat)), " | ", F4.init(1, 0b00, 1).cast(MetaFloat) == -0.5, "\n" });
        print(.{ "1 10 1 | -3.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, -3.0)))), " | ", p("f", F4.init(1, 0b10, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(1, 0b10, 1).cast(MetaFloat)), " | ", F4.init(1, 0b10, 1).cast(MetaFloat) == -3.0, "\n" });
        print(.{ "1 11 0 | -inf | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(-std.math.inf(MetaFloat)))), " | ", p("f", F4.init(1, 0b11, 0).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(1, 0b11, 0).cast(MetaFloat)), " | ", std.math.isNegativeInf(F4.init(1, 0b11, 0).cast(MetaFloat)), "\n" });
        print(.{ "1 11 1 | nan | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(-std.math.nan(MetaFloat)))), " | ", p("f", F4.init(1, 0b11, 1).cast(CustomFloat)), " | ", p("d:>11.1", F4.init(1, 0b11, 1).cast(MetaFloat)), " | ", std.math.isNan(F4.init(1, 0b11, 1).cast(MetaFloat)), "\n" });
    }
    // https://en.wikipedia.org/wiki/Minifloat#Table_of_values
    // inline for (.{ 16, 32, 64, 128 }) |bits| {
    inline for (.{16}) |bits| {
        const CustomFloat = switch (bits) {
            16 => F16,
            32 => F32,
            64 => F64,
            128 => F128,
            else => unreachable,
        };
        const MetaFloat = std.meta.Float(bits);
        const MetaUnsigned = std.meta.Int(.unsigned, bits);
        print(.{"-" ** 80 ++ "\n"});
        print(.{ "F8 to f", bits, "\n" });
        print(.{"given | expected | casted\n"});
        print(.{ "F8 | decimal | f", bits, " binary | F", bits, " | f", bits, " decimal | ok\n" });
        print(.{" seeeeeffffffffff\n"});
        print(.{ "0 0000 000 | 0.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.000000000)))), " | ", p("f", F8.init(0, 0b0000, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b000).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b000).cast(MetaFloat) == 0.0, "\n" });
        print(.{ "0 0000 001 | 0.001953125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.001953125)))), " | ", p("f", F8.init(0, 0b0000, 0b001).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b001).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b001).cast(MetaFloat) == 0.001953125, "\n" });
        print(.{ "0 0000 010 | 0.00390625 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.003906250)))), " | ", p("f", F8.init(0, 0b0000, 0b010).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b010).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b010).cast(MetaFloat) == 0.00390625, "\n" });
        print(.{ "0 0000 011 | 0.005859375 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.005859375)))), " | ", p("f", F8.init(0, 0b0000, 0b011).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b011).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b011).cast(MetaFloat) == 0.005859375, "\n" });
        print(.{ "0 0000 100 | 0.0078125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.007812500)))), " | ", p("f", F8.init(0, 0b0000, 0b100).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b100).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b100).cast(MetaFloat) == 0.0078125, "\n" });
        print(.{ "0 0000 111 | 0.013671875 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.013671875)))), " | ", p("f", F8.init(0, 0b0000, 0b111).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0000, 0b111).cast(MetaFloat)), " | ", F8.init(0, 0b0000, 0b111).cast(MetaFloat) == 0.013671875, "\n" });
        print(.{ "0 0001 000 | 0.015625 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.015625000)))), " | ", p("f", F8.init(0, 0b0001, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0001, 0b000).cast(MetaFloat)), " | ", F8.init(0, 0b0001, 0b000).cast(MetaFloat) == 0.015625, "\n" });
        print(.{ "0 0001 001 | 0.017578125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.017578125)))), " | ", p("f", F8.init(0, 0b0001, 0b001).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0001, 0b001).cast(MetaFloat)), " | ", F8.init(0, 0b0001, 0b001).cast(MetaFloat) == 0.017578125, "\n" });
        print(.{ "0 0001 010 | 0.01953125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.019531250)))), " | ", p("f", F8.init(0, 0b0001, 0b010).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0001, 0b010).cast(MetaFloat)), " | ", F8.init(0, 0b0001, 0b010).cast(MetaFloat) == 0.01953125, "\n" });
        print(.{ "0 0001 100 | 0.0234375 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.023437500)))), " | ", p("f", F8.init(0, 0b0001, 0b100).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0001, 0b100).cast(MetaFloat)), " | ", F8.init(0, 0b0001, 0b100).cast(MetaFloat) == 0.0234375, "\n" });
        print(.{ "0 0001 111 | 0.029296875 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.029296875)))), " | ", p("f", F8.init(0, 0b0001, 0b111).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0001, 0b111).cast(MetaFloat)), " | ", F8.init(0, 0b0001, 0b111).cast(MetaFloat) == 0.029296875, "\n" });
        print(.{ "0 0010 000 | 0.03125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.031250000)))), " | ", p("f", F8.init(0, 0b0010, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0010, 0b000).cast(MetaFloat)), " | ", F8.init(0, 0b0010, 0b000).cast(MetaFloat) == 0.03125, "\n" });
        print(.{ "0 0100 000 | 0.125 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 0.125000000)))), " | ", p("f", F8.init(0, 0b0100, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b0100, 0b000).cast(MetaFloat)), " | ", F8.init(0, 0b0100, 0b000).cast(MetaFloat) == 0.125, "\n" });
        print(.{ "0 1000 000 | 2.0 | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(@as(MetaFloat, 2.000000000)))), " | ", p("f", F8.init(0, 0b1000, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b1000, 0b000).cast(MetaFloat)), " | ", F8.init(0, 0b1000, 0b000).cast(MetaFloat) == 2.0, "\n" });
        print(.{ "0 1111 000 | inf | ", p(comptimePrint("b:0>{d}", .{bits}), @as(MetaUnsigned, @bitCast(std.math.inf(MetaFloat)))), " | ", p("f", F8.init(0, 0b1111, 0b000).cast(CustomFloat)), " | ", p("d:11.9", F8.init(0, 0b1111, 0b000).cast(MetaFloat)), " | ", std.math.isPositiveInf(F8.init(0, 0b1111, 0b000).cast(MetaFloat)), "\n" });
    }
    // inline for (.{ 16, 32, 64, 128 }) |bits| {
    {
        // Narrowing cast: the custom F32 -> F16 result is shown next to the native
        // f32 -> f16 `@floatCast` bit pattern for the same input bits.
        print(.{"-" ** 80 ++ "\n"});
        print(.{"F32 to f16\n"});
        print(.{"given | expected | casted\n"});
        print(.{"F32 | f16 binary | F16 | ok\n"});
        print(.{"seeeeeeeefffffffffffffffffffffff seeeeeffffffffff s eeeee ffffffffff\n"});
        // seeeeeeeefffffffffffffffffffffff
        print(.{ p("b:0>32", @as(u32, 0b00000000000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000000000000000000000000001)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000000000000000000000000001)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b00000000000000000000001).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000000010000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000000010000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b00010000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000000011100000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000000011100000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b00011100000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000000100000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000000100000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b00100000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000001000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000001000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b01000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000010000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000010000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000000, 0b10000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000000100000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000000100000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000001, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000001000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000001000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000010, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000010000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000010000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00000100, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00000100000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00000100000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00001000, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00001000000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00001000000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b00010000, 0b00000000000000000000000).cast(F16)), "\n" });
        print(.{ p("b:0>32", @as(u32, 0b00111000000000000000000000000000)), " | ", p("b:0>16", @as(u16, @bitCast(@as(f16, @floatCast(@as(f32, @bitCast(@as(u32, 0b00111000000000000000000000000000)))))))), " | ", p("f", F32.init(0, 0b01110000, 0b00000000000000000000000).cast(F16)), "\n" });
    }
}
// Round-trip check: casting an F4 to F4 must reproduce every bit pattern, for both signs.
test "F4 to F4" {
    // Twice the iterations of the single-sign version, so the quota is doubled as well.
    @setEvalBranchQuota(200000);
    // Values should produce the same value, if subnormal values are enabled
    print(.{"F4 (or 1.2.1-float, or 4-bit float) to F4\n"});
    // `0..2` visits sign 0 and sign 1; the previous `0..1` only ever tested positive values.
    inline for (0..2) |sign| {
        inline for (0..1 << 2) |exponent| {
            inline for (0..1 << 1) |mantissa| {
                try std.testing.expectFmt(comptimePrint("{b} {b:0>2} {b}", .{ sign, exponent, mantissa }), "{f}", .{F4.init(sign, exponent, mantissa).cast(F4)});
            }
        }
    }
}
// Round-trip check: casting an F8 to F8 must reproduce every bit pattern, for both signs.
test "F8 to F8" {
    // Twice the iterations of the single-sign version, so the quota is doubled as well.
    @setEvalBranchQuota(200000);
    // Values should produce the same value, if subnormal values are enabled
    print(.{"F8 (or 1.4.3-float, or 8-bit float, or Minifloat) to F8\n"});
    // `0..2` visits sign 0 and sign 1; the previous `0..1` only ever tested positive values.
    inline for (0..2) |sign| {
        inline for (0..1 << 4) |exponent| {
            inline for (0..1 << 3) |mantissa| {
                try std.testing.expectFmt(std.fmt.comptimePrint("{b} {b:0>4} {b:0>3}", .{ sign, exponent, mantissa }), "{f}", .{F8.init(sign, exponent, mantissa).cast(F8)});
            }
        }
    }
}
// Widening cast: every F4 value has an exact F8 encoding, checked for both signs.
test "F4 to F8" {
    @setEvalBranchQuota(200000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // `0..2` visits sign 0 and sign 1; the previous `0..1` only ever tested positive values.
    inline for (0..2) |sign| {
        const sign_text = comptimePrint("{d}", .{sign});
        try std.testing.expectFmt(sign_text ++ " 0000 000", "{f}", .{F4.init(sign, 0b00, 0).cast(F8)}); // 0.0
        try std.testing.expectFmt(sign_text ++ " 0110 000", "{f}", .{F4.init(sign, 0b00, 1).cast(F8)}); // 0.5
        try std.testing.expectFmt(sign_text ++ " 0111 000", "{f}", .{F4.init(sign, 0b01, 0).cast(F8)}); // 1.0
        try std.testing.expectFmt(sign_text ++ " 0111 100", "{f}", .{F4.init(sign, 0b01, 1).cast(F8)}); // 1.5
        try std.testing.expectFmt(sign_text ++ " 1000 000", "{f}", .{F4.init(sign, 0b10, 0).cast(F8)}); // 2.0
        try std.testing.expectFmt(sign_text ++ " 1000 100", "{f}", .{F4.init(sign, 0b10, 1).cast(F8)}); // 3.0
        try std.testing.expectFmt(sign_text ++ " 1111 000", "{f}", .{F4.init(sign, 0b11, 0).cast(F8)}); // inf
        try std.testing.expectFmt(sign_text ++ " 1111 100", "{f}", .{F4.init(sign, 0b11, 1).cast(F8)}); // nan
    }
}
// Narrowing cast: F8 values that are exactly representable in F4, checked for both signs.
test "F8 to F4" {
    @setEvalBranchQuota(200000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // `0..2` visits sign 0 and sign 1; the previous `0..1` only ever tested positive values.
    inline for (0..2) |sign| {
        const sign_text = comptimePrint("{d}", .{sign});
        try std.testing.expectFmt(sign_text ++ " 00 0", "{f}", .{F8.init(sign, 0b0000, 0b000).cast(F4)}); // 0.0
        try std.testing.expectFmt(sign_text ++ " 00 1", "{f}", .{F8.init(sign, 0b0110, 0b000).cast(F4)}); // 0.5 // FIXME: subnormal handling
        try std.testing.expectFmt(sign_text ++ " 01 0", "{f}", .{F8.init(sign, 0b0111, 0b000).cast(F4)}); // 1.0
        try std.testing.expectFmt(sign_text ++ " 01 1", "{f}", .{F8.init(sign, 0b0111, 0b100).cast(F4)}); // 1.5
        try std.testing.expectFmt(sign_text ++ " 10 0", "{f}", .{F8.init(sign, 0b1000, 0b000).cast(F4)}); // 2.0
        try std.testing.expectFmt(sign_text ++ " 10 1", "{f}", .{F8.init(sign, 0b1000, 0b100).cast(F4)}); // 3.0
        try std.testing.expectFmt(sign_text ++ " 11 0", "{f}", .{F8.init(sign, 0b1111, 0b000).cast(F4)}); // inf
        try std.testing.expectFmt(sign_text ++ " 11 1", "{f}", .{F8.init(sign, 0b1111, 0b100).cast(F4)}); // nan
    }
}
/// Test helper that takes the value under test FIRST and the expected value second, then
/// forwards them to `std.testing.expectEqual` in its (expected, actual) order. This lets
/// call sites read as `expectEqualSwapped(computed, literal)`.
/// Returns whatever error `std.testing.expectEqual` reports on mismatch.
inline fn expectEqualSwapped(actual: anytype, expected: anytype) !void {
    return std.testing.expectEqual(expected, actual);
}
// Casts every F4 value to each native float width and compares against the exact decimal
// value, for both signs.
test "F4 to meta float" {
    @setEvalBranchQuota(200000);
    // 1.2.1-float values: https://en.wikipedia.org/wiki/Minifloat#4_bits_and_fewer, accessed on 2024-09-07
    inline for (.{ 16, 32, 64, 128 }) |bits| {
        print(.{ "F4 (or 1.2.1-float, or 4-bit float) to f", bits, " (meta float)\n" });
        const TestFloat = std.meta.Float(bits);
        // `0..2` visits sign 0 and sign 1; with the previous `0..1` the negative branches
        // of the expectations below were never exercised.
        inline for (0..2) |sign| {
            try expectEqualSwapped(F4.init(sign, 0b00, 0).cast(TestFloat), (if (sign > 0) -0.0 else 0.0));
            try expectEqualSwapped(F4.init(sign, 0b00, 1).cast(TestFloat), (if (sign > 0) -0.5 else 0.5));
            try expectEqualSwapped(F4.init(sign, 0b01, 0).cast(TestFloat), (if (sign > 0) -1.0 else 1.0));
            try expectEqualSwapped(F4.init(sign, 0b01, 1).cast(TestFloat), (if (sign > 0) -1.5 else 1.5));
            try expectEqualSwapped(F4.init(sign, 0b10, 0).cast(TestFloat), (if (sign > 0) -2.0 else 2.0));
            try expectEqualSwapped(F4.init(sign, 0b10, 1).cast(TestFloat), (if (sign > 0) -3.0 else 3.0));
            try expectEqualSwapped(F4.init(sign, 0b11, 0).cast(TestFloat), (if (sign > 0) -std.math.inf(TestFloat) else std.math.inf(TestFloat)));
            // NaN never compares equal, so it is checked with isNan instead.
            try std.testing.expect(std.math.isNan(F4.init(sign, 0b11, 1).cast(TestFloat)));
        }
    }
}
// Casts a sample of positive F8 bit patterns to f16/f32/f64/f128 and compares
// each result against the reference value from the Minifloat table.
test "F8 to meta float" {
    @setEvalBranchQuota(100000);
    // 1.4.3-float values: https://en.wikipedia.org/wiki/Minifloat#Table_of_values, accessed on 2024-09-07
    // Each entry is { biased exponent, fraction, expected value }.
    const finite_cases = .{
        .{ 0b0000, 0b000, 0.0 },
        .{ 0b0000, 0b001, 0.001953125 },
        .{ 0b0000, 0b010, 0.00390625 },
        .{ 0b0000, 0b100, 0.0078125 },
        .{ 0b0000, 0b111, 0.013671875 },
        .{ 0b0001, 0b000, 0.015625 },
        .{ 0b0001, 0b001, 0.017578125 },
        .{ 0b0001, 0b010, 0.01953125 },
        .{ 0b0001, 0b100, 0.0234375 },
        .{ 0b0001, 0b111, 0.029296875 },
        .{ 0b0010, 0b000, 0.03125 },
        .{ 0b0010, 0b001, 0.03515625 },
        .{ 0b0010, 0b010, 0.0390625 },
        .{ 0b0010, 0b100, 0.046875 },
        .{ 0b0010, 0b111, 0.05859375 },
        .{ 0b0100, 0b000, 0.125 },
        .{ 0b0100, 0b001, 0.140625 },
        .{ 0b0100, 0b010, 0.15625 },
        .{ 0b0100, 0b100, 0.1875 },
        .{ 0b0100, 0b111, 0.234375 },
        .{ 0b0111, 0b000, 1 },
        .{ 0b0111, 0b001, 1.125 },
        .{ 0b0111, 0b010, 1.25 },
        .{ 0b0111, 0b100, 1.5 },
        .{ 0b0111, 0b111, 1.875 },
        .{ 0b1000, 0b000, 2 },
        .{ 0b1000, 0b001, 2.25 },
        .{ 0b1000, 0b010, 2.5 },
        .{ 0b1000, 0b100, 3 },
        .{ 0b1000, 0b111, 3.75 },
        .{ 0b1110, 0b000, 128 },
        .{ 0b1110, 0b001, 144 },
        .{ 0b1110, 0b010, 160 },
        .{ 0b1110, 0b100, 192 },
        .{ 0b1110, 0b111, 240 },
    };
    // Fractions checked with the all-ones exponent; each must cast to NaN.
    const nan_fractions = .{ 0b001, 0b010, 0b100, 0b111 };

    inline for (.{ 16, 32, 64, 128 }) |bits| {
        print(.{ "F8 (or 1.4.3-float, or 8-bit float, or Minifloat) to f", bits, " (meta float)\n" });
        const TestFloat = std.meta.Float(bits);

        inline for (finite_cases) |case| {
            const converted = F8.init(0, case[0], case[1]).cast(TestFloat);
            try expectEqualSwapped(converted, case[2]);
        }

        // All-ones exponent with a zero fraction is +infinity.
        const infinity = F8.init(0, 0b1111, 0b000).cast(TestFloat);
        try std.testing.expect(std.math.isPositiveInf(infinity));

        inline for (nan_fractions) |fraction| {
            const not_a_number = F8.init(0, 0b1111, fraction).cast(TestFloat);
            try std.testing.expect(std.math.isNan(not_a_number));
        }
    }
}
// Exercises `add` on a handful of F4 operand pairs whose exact sum is
// representable in F4.
test "F4 Addition" {
    const zero = F4.init(0, 0, 0);
    const half = F4.init(0, 0b00, 1); // 0.5 (subnormal in F4)
    const one = F4.init(0, 0b01, 0); // 1.0
    const one_and_half = F4.init(0, 0b01, 1); // 1.5
    const two = F4.init(0, 0b10, 0); // 2.0
    const three = F4.init(0, 0b10, 1); // 3.0
    const minus_one = F4.init(1, 0b01, 0); // -1.0

    const Case = struct { lhs: F4, rhs: F4, sum: F4 };
    const cases = [_]Case{
        .{ .lhs = one, .rhs = one, .sum = two }, // 1.0 + 1.0 = 2.0
        .{ .lhs = one, .rhs = half, .sum = one_and_half }, // 1.0 + 0.5 = 1.5
        .{ .lhs = one_and_half, .rhs = one_and_half, .sum = three }, // 1.5 + 1.5 = 3.0
        .{ .lhs = one, .rhs = zero, .sum = one }, // x + 0 = x
        .{ .lhs = two, .rhs = minus_one, .sum = one }, // 2.0 + (-1.0) = 1.0
    };

    for (cases) |case| {
        try std.testing.expectEqual(case.sum, case.lhs.add(case.rhs));
    }
}
// Exercises `mul` on F4 operands: exact products, commutativity, sign handling,
// and one product (2.25) that is not representable and is expected to become 2.0.
test "F4 Multiplication" {
    const one = F4.init(0, 0b01, 0); // 1.0
    const one_and_half = F4.init(0, 0b01, 1); // 1.5
    const two = F4.init(0, 0b10, 0); // 2.0
    const three = F4.init(0, 0b10, 1); // 3.0
    const minus_one = F4.init(1, 0b01, 0); // -1.0
    const minus_two = F4.init(1, 0b10, 0); // -2.0

    const Case = struct { lhs: F4, rhs: F4, product: F4 };
    const cases = [_]Case{
        .{ .lhs = one, .rhs = two, .product = two }, // 1.0 * 2.0 = 2.0
        .{ .lhs = one_and_half, .rhs = two, .product = three }, // 1.5 * 2.0 = 3.0
        .{ .lhs = two, .rhs = one_and_half, .product = three }, // 2.0 * 1.5 = 3.0 (commutative)
        .{ .lhs = minus_one, .rhs = two, .product = minus_two }, // -1.0 * 2.0 = -2.0
        .{ .lhs = one_and_half, .rhs = one_and_half, .product = two }, // 1.5 * 1.5 = 2.25 -> 2.0 in F4
    };

    for (cases) |case| {
        try std.testing.expectEqual(case.product, case.lhs.mul(case.rhs));
    }
}
// Checks `add`/`mul` on the F4 special encodings. In F4 the all-ones exponent
// (0b11) encodes infinity when the fraction is zero and NaN when it is nonzero,
// so every NaN check below requires BOTH conditions.
test "Special Cases: Infinity and NaN" {
    const inf = F4.init(0, 0b11, 0);
    const neg_inf = F4.init(1, 0b11, 0);
    const nan = F4.init(0, 0b11, 1);
    const one = F4.init(0, 0b01, 0);
    const zero = F4.init(0, 0, 0);
    // Inf + x = Inf
    try std.testing.expectEqual(inf, inf.add(one));
    // Inf - Inf = NaN
    const res = inf.add(neg_inf);
    try std.testing.expect(res.biased_exponent == 0b11 and res.fraction != 0);
    // NaN propagation. Checking the exponent alone would also accept infinity,
    // so the fraction must be nonzero as well.
    const nan_sum = one.add(nan);
    try std.testing.expect(nan_sum.biased_exponent == 0b11 and nan_sum.fraction != 0);
    const nan_product = one.mul(nan);
    try std.testing.expect(nan_product.biased_exponent == 0b11 and nan_product.fraction != 0);
    // 0 * Inf = NaN
    const zero_inf = zero.mul(inf);
    try std.testing.expect(zero_inf.biased_exponent == 0b11 and zero_inf.fraction != 0);
    // Inf * Inf = Inf
    try std.testing.expectEqual(inf, inf.mul(inf));
    // Inf * -1 = -Inf
    const neg_one = F4.init(1, 0b01, 0);
    try std.testing.expectEqual(neg_inf, inf.mul(neg_one));
}
// Checks that F4 results beyond the largest finite value (3.0) become +infinity
// and that a product below the smallest subnormal (0.5) becomes +0.
test "F4 Overflow and Underflow" {
    const expectEqual = std.testing.expectEqual;

    const positive_zero = F4.init(0, 0, 0);
    const smallest_subnormal = F4.init(0, 0b00, 1); // 0.5
    const unit = F4.init(0, 0b01, 0); // 1.0
    const double = F4.init(0, 0b10, 0); // 2.0
    const largest_finite = F4.init(0, 0b10, 1); // 3.0
    const positive_inf = F4.init(0, 0b11, 0);

    // Overflow: 3.0 + 1.0 = 4.0 is outside the F4 range.
    try expectEqual(positive_inf, largest_finite.add(unit));
    // Overflow: 3.0 * 2.0 = 6.0 is outside the F4 range.
    try expectEqual(positive_inf, largest_finite.mul(double));
    // Underflow: 0.5 * 0.5 = 0.25 is expected to become zero.
    try expectEqual(positive_zero, smallest_subnormal.mul(smallest_subnormal));
}
// Smoke test for `add` and `mul` on F8 (1 sign bit, 4 exponent bits with bias 7,
// 3 fraction bits).
test "F8 Arithmetic (Minifloat)" {
    print(.{"Arithm ok"});

    const unity = F8.init(0, 7, 0); // 1.0
    const pair = F8.init(0, 8, 0); // 2.0

    // 1.0 + 2.0 = 3.0, encoded as exponent 8 with fraction 0b100: 1.5 * 2^(8-7).
    const sum = unity.add(pair);
    try std.testing.expectEqual(F8.init(0, 8, 0b100), sum);

    // 2.0 * 2.0 = 4.0, encoded as exponent 9 with fraction 0: 1.0 * 2^(9-7).
    const product = pair.mul(pair);
    try std.testing.expectEqual(F8.init(0, 9, 0), product);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment