Skip to content

Instantly share code, notes, and snippets.

@wolfspider
Created August 2, 2024 14:33
Show Gist options
  • Select an option

  • Save wolfspider/28e940647eed1385d38bb972f54e8b2e to your computer and use it in GitHub Desktop.

Select an option

Save wolfspider/28e940647eed1385d38bb972f54e8b2e to your computer and use it in GitHub Desktop.
Low* GCTypes exploration
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
// Define the list structure
typedef struct Prims_list__uint32_t {
enum { Prims_Cons, Prims_Nil, Prims_Null } tag;
union {
struct {
struct Prims_list__uint32_t *tl;
uint32_t hd[1];
};
struct {
struct Prims_list__uint32_t *ntl;
uint32_t nd[];
};
};
} Prims_list__uint32_t;
// Function to create a new Cons node
Prims_list__uint32_t *cons(uint32_t hd, Prims_list__uint32_t *tl) {
Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
node->tag = Prims_Cons;
node->hd[1] = hd;
node->tl = tl;
return node;
}
Prims_list__uint32_t *ncons(Prims_list__uint32_t *tl) {
Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
node->tag = Prims_Null;
node->ntl = tl;
return node;
}
// Function to create a new Nil node
Prims_list__uint32_t *nil() {
Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
node->tag = Prims_Nil;
node->tl = NULL;
return node;
}
// Deep copy function
Prims_list__uint32_t *deep_copy_list(Prims_list__uint32_t *list) {
if (list == NULL) {
return NULL;
}
if (list->tag == Prims_Nil) {
return nil();
}
if (list->tag == Prims_Null) {
Prims_list__uint32_t *new_list = ncons(NULL);
new_list->tl = deep_copy_list(list->ntl);
return new_list;
}
// Create a new node and copy the head value
Prims_list__uint32_t *new_list = cons(list->hd[1], NULL);
// Recursively copy the rest of the list
new_list->tl = deep_copy_list(list->tl);
return new_list;
}
// Free list function for cleanup
void free_list(Prims_list__uint32_t *list) {
if(list->tag == Prims_Null) {
while (list != NULL) {
Prims_list__uint32_t *next = list->ntl;
free(list);
list = next;
}
}
else {
while (list != NULL) {
Prims_list__uint32_t *next = list->tl;
free(list);
list = next;
}
}
}
// Print list function
void print_list(Prims_list__uint32_t *list) {
printf("[");
while (list != NULL) {
if (list->tag == Prims_Cons) {
printf("%u", list->hd[1]);
list = list->tl;
if (list != NULL) {
printf(" -> ");
}
} else if (list->tag == Prims_Nil) {
printf("Nil");
list = list->tl;
if (list != NULL) {
printf(" -> ");
}
} else if (list->tag == Prims_Null) {
printf("Null");
list = list->ntl;
if (list != NULL) {
printf(" -> ");
}
}
}
printf("]\n");
}
// Test the deep copy function
int main() {
// Create an example list: 0 -> 1 -> Null -> 1 -> 0 -> Nil
Prims_list__uint32_t *original_list = cons(0, cons(1, ncons(cons(1, cons(0, nil())))));
// Print the original list
printf("Original list: ");
print_list(original_list);
// Deep copy the list
Prims_list__uint32_t *copied_list = deep_copy_list(original_list);
// Print the copied list
printf("Copied list: ");
print_list(copied_list);
// Free the original list
free_list(original_list);
// Free the copied list
free_list(copied_list);
return 0;
}
@wolfspider
Copy link
Author

And I think one last thing here: How can we be sure that this transpiled C++ version is anywhere close to the performance of the unmodified C version? We can do the same test as in the benchmark against the original C Code and the Benchmark code with a specific number of iterations like so:

int main() {

  for(int I = 0; I < 50000000; ++I)
  {
    llist();
  }

  return 0;

}

The results are as follows:

➜  low_list git:(master) ✗ clang++ -O3 GCTypes-bench.cc -o GCTypesBench
➜  low_list git:(master) ✗ clang -O3 GCTypes-bench.c -o GCTypesBenchC
➜  low_list git:(master) ✗ time ./GCTypesBench
./GCTypesBench  7.32s user 0.00s system 99% cpu 7.322 total
➜  low_list git:(master) ✗ time ./GCTypesBenchC
./GCTypesBenchC  7.28s user 0.00s system 99% cpu 7.278 total

I would say that is fairly close as well. But how about unoptimized C?

➜  low_list git:(master) ✗ clang -O0 GCTypes-bench.c -o GCTypesBenchC
➜  low_list git:(master) ✗ time ./GCTypesBenchC
./GCTypesBenchC  9.89s user 0.00s system 99% cpu 9.894 total

Pretty solid results so far. Looks like the pseudo-ASM C++ is doing it's job. Unoptimized must surely show something is wrong though right? That code is just really not approachable. These are the results:

➜  low_list git:(master) ✗ clang++ -O0 GCTypes-bench.cc -o GCTypesBench
➜  low_list git:(master) ✗ time ./GCTypesBench
./GCTypesBench  10.79s user 0.00s system 99% cpu 10.792 total

It really could be worse apparently. The only real outlier here is "std" which seems to go from much less efficient to efficient starting at O1.

@wolfspider
Copy link
Author

What exactly is the Rust code doing? Giving it the same treatment and lowering to C you find many more runtime checks and of course Rust does a lot of DCE along the way as well. Here is a section of Rust code transpiled to C from the sample provided before:

static void _ZN10playground4main17he86dcf1ea0a15fc1E(void) {
  __PREFIXALIGN__(8) struct l_array_8_uint8_t llvm_cbe_self_2e_dbg_2e_spill_2e_i __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_8_uint8_t llvm_cbe_f_2e_dbg_2e_spill_2e_i __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_8_uint8_t llvm_cbe_x_2e_dbg_2e_spill_2e_i __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_16_uint8_t llvm_cbe__3_2e_i __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_16_uint8_t _56 __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(4) struct l_array_4_uint8_t llvm_cbe_y_2e_dbg_2e_spill __POSTFIXALIGN__(4);    /* Address-exposed local */
  __PREFIXALIGN__(4) struct l_array_4_uint8_t llvm_cbe_x_2e_dbg_2e_spill __POSTFIXALIGN__(4);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_48_uint8_t llvm_cbe__19 __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_16_uint8_t llvm_cbe__16 __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_16_uint8_t llvm_cbe__15 __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_48_uint8_t llvm_cbe__12 __POSTFIXALIGN__(8);    /* Address-exposed local */
  __PREFIXALIGN__(4) struct l_array_4_uint8_t llvm_cbe_result __POSTFIXALIGN__(4);    /* Address-exposed local */
  __PREFIXALIGN__(8) struct l_array_16_uint8_t llvm_cbe__1 __POSTFIXALIGN__(8);    /* Address-exposed local */
  uint32_t _57;
  void* llvm_cbe__21;
  uint32_t _58;
  void* llvm_cbe__22;
  void* llvm_cbe__23;
  uint32_t _59;
  uint32_t llvm_cbe_x;
  void* llvm_cbe__24;
  uint64_t llvm_cbe__30;
  uint32_t llvm_cbe_y;
  struct l_unnamed_2 _60;
  uint32_t llvm_cbe__9_2e_0;
  uint32_t llvm_cbe__10_2e_0;
  void* _61;
  void* _62;

#line 13 "/playground/src/main.rs"
  llvm_cbe__1 = _ZN10playground4test17h706ef2a73c95369dE();
  _57 = *((uint32_t*)&llvm_cbe__1);
  if (((((uint64_t)(uint32_t)_57)) == UINT64_C(1))) {
    goto llvm_cbe_bb3;
  } else {
    goto llvm_cbe_bb2;
  }

llvm_cbe_bb3:
  llvm_cbe__21 = *(void**)(((&((uint8_t*)(&llvm_cbe__1))[((int64_t)8)])));
  _58 = *(uint32_t*)llvm_cbe__21;
  if (((((uint64_t)(uint32_t)_58)) == UINT64_C(1))) {
    goto llvm_cbe_bb4;
  } else {
    goto llvm_cbe_bb2;
  }

llvm_cbe_bb2:
#line 18 "/playground/src/main.rs"
  llvm_cbe__19 = _ZN4core3fmt9Arguments9new_const17hc6a2e34e6c9027c8E((&alloc_f044d3424154ec6b3e934af0d1eab91c));
  goto llvm_cbe_bb10;

llvm_cbe_bb4:
#line 13 "/playground/src/main.rs"
  llvm_cbe__22 = *(void**)(((&((uint8_t*)(&llvm_cbe__1))[((int64_t)8)])));
  llvm_cbe__23 = *(void**)(((&((uint8_t*)llvm_cbe__22)[((int64_t)8)])));
  _59 = *(uint32_t*)llvm_cbe__23;
  if (((((uint64_t)(uint32_t)_59)) == UINT64_C(0))) {
    goto llvm_cbe_bb5;
  } else {
    goto llvm_cbe_bb2;
  }

llvm_cbe_bb5:
#line 14 "/playground/src/main.rs"
  llvm_cbe_x = *(uint32_t*)(((&((uint8_t*)(&llvm_cbe__1))[((int64_t)4)])));
  *((uint32_t*)&llvm_cbe_x_2e_dbg_2e_spill) = llvm_cbe_x;
  llvm_cbe__24 = *(void**)(((&((uint8_t*)(&llvm_cbe__1))[((int64_t)8)])));
  llvm_cbe__30 = ((uint64_t)(uintptr_t)llvm_cbe__24);
  if (((llvm_cbe__30 & 7) == UINT64_C(0))) {
    goto llvm_cbe_bb15;
  } else {
    goto llvm_cbe_panic;
  }

llvm_cbe_bb15:
  llvm_cbe_y = *(uint32_t*)(((&((uint8_t*)llvm_cbe__24)[((int64_t)4)])));
  *((uint32_t*)&llvm_cbe_y_2e_dbg_2e_spill) = llvm_cbe_y;
#line 15 "/playground/src/main.rs"
  _60 = llvm_OC_uadd_OC_with_OC_overflow_OC_i32(llvm_cbe_x, llvm_cbe_y);
  llvm_cbe__9_2e_0 = (_60.field0);
  if (((_60.field1))) {
    goto llvm_cbe_panic1;
  } else {
    goto llvm_cbe_bb6;
  }

llvm_cbe_panic:
#line 14 "/playground/src/main.rs"
  _ZN4core9panicking36panic_misaligned_pointer_dereference17hc1c33e077e1bc82cE(8, llvm_cbe__30, (&alloc_4f86935d518ff736ec845fdded627245));
  __builtin_unreachable();

llvm_cbe_bb6:
#line 15 "/playground/src/main.rs"
  llvm_cbe__10_2e_0 = llvm_sub_u32(llvm_cbe__9_2e_0, 1);
  if ((((uint32_t)llvm_cbe__9_2e_0) < ((uint32_t)1u))) {
    goto llvm_cbe_panic2;
  } else {
    goto llvm_cbe_bb7;
  }

llvm_cbe_panic1:
  _ZN4core9panicking11panic_const24panic_const_add_overflow17h08d3a954a3902361E((&alloc_6689ebc1ea65111c28ba615705391f7a));
  goto llvm_cbe_unreachable;

llvm_cbe_unreachable:
  __builtin_unreachable();

llvm_cbe_bb7:
  *((uint32_t*)&llvm_cbe_result) = llvm_cbe__10_2e_0;
  *((void**)&llvm_cbe_x_2e_dbg_2e_spill_2e_i) = (&llvm_cbe_result);
#line 114 "/rustc/fd8d6fbe505ecf913f5e2ca590c69a7da2789879/library/core/src/fmt/rt.rs"
  *((void**)&llvm_cbe_f_2e_dbg_2e_spill_2e_i) = ((void*)&_ZN4core3fmt3num3imp52__EC_LT_EC_impl_EC_u20_EC_core_OC__OC_fmt_OC__OC_Display_EC_u20_EC_for_EC_u20_EC_u32_EC_GT_EC_3fmt17h65de1fcd5382b9caE);
#line 1807 "/rustc/fd8d6fbe505ecf913f5e2ca590c69a7da2789879/library/core/src/ptr/non_null.rs"
  *((void**)&llvm_cbe_self_2e_dbg_2e_spill_2e_i) = (&llvm_cbe_result);
#line 103 "/rustc/fd8d6fbe505ecf913f5e2ca590c69a7da2789879/library/core/src/fmt/rt.rs"
  *((void**)&llvm_cbe__3_2e_i) = (&llvm_cbe_result);
  *(void**)(((&((uint8_t*)(&llvm_cbe__3_2e_i))[((int64_t)8)]))) = ((void*)&_ZN4core3fmt3num3imp52__EC_LT_EC_impl_EC_u20_EC_core_OC__OC_fmt_OC__OC_Display_EC_u20_EC_for_EC_u20_EC_u32_EC_GT_EC_3fmt17h65de1fcd5382b9caE);
#line 100 "/rustc/fd8d6fbe505ecf913f5e2ca590c69a7da2789879/library/core/src/fmt/rt.rs"
  _61 = memcpy((&llvm_cbe__16), (&llvm_cbe__3_2e_i), 16);
  goto llvm_cbe_bb8;

llvm_cbe_panic2:
#line 15 "/playground/src/main.rs"
  _ZN4core9panicking11panic_const24panic_const_sub_overflow17h293a45d95f19d811E((&alloc_6689ebc1ea65111c28ba615705391f7a));
  goto llvm_cbe_unreachable;

llvm_cbe_bb8:
#line 16 "/playground/src/main.rs"
  _62 = memcpy((((&(&llvm_cbe__15)->array[((int64_t)0)]))), (&llvm_cbe__16), 16);
  llvm_cbe__12 = _ZN4core3fmt9Arguments6new_v117hf04d341004d6cb1bE((&alloc_9bce62b4958e9ae9fca1ec2ed4203a0b), (&llvm_cbe__15));
  goto llvm_cbe_bb9;

llvm_cbe_bb9:
  _ZN3std2io5stdio6_print17ha75bd8669125f6d8E((&llvm_cbe__12));
  goto llvm_cbe_bb17;

llvm_cbe_bb17:
  goto llvm_cbe_bb11;

llvm_cbe_bb11:
#line 20 "/playground/src/main.rs"
  _ZN4core3ptr37drop_in_place_EC_LT_EC_playground_OC__OC_List_EC_GT_EC_17ha1813a0cd7217cffE((&llvm_cbe__1));
  return;
llvm_cbe_bb10:
#line 18 "/playground/src/main.rs"
  _ZN3std2io5stdio6_print17ha75bd8669125f6d8E((&llvm_cbe__19));
  goto llvm_cbe_bb18;

llvm_cbe_bb18:
  goto llvm_cbe_bb11;

}

@wolfspider
Copy link
Author

Rust embeds a lot of safety checks within the binary which is presumably what you would want if the program is modified outside of the author's control possibly. Formal verification can do those checks as well and it doesn't have to be embedded within the application itself. They both represent two different approaches and some of the newest advancements incorporate both by formally verifying Rust within the context of a provable methodology. All of these insights were gathered via the following Low* code:

module GcTypes

open FStar.HyperStack.ST

module U32 = FStar.UInt32

let test (): Stack (list U32.t) (fun _ -> true) (fun _ _ _ -> true) =
  0ul :: 1ul :: []

let main (): Stack FStar.Int32.t (fun _ -> true) (fun _ _ _ -> true) =
  match test () with
  | x :: y :: [] ->
      FStar.Int.Cast.uint32_to_int32 FStar.UInt32.(x +%^ y -%^ 1ul)
  | _ ->
      FStar.Int.Cast.uint32_to_int32 1ul

@wolfspider
Copy link
Author

wolfspider commented Oct 2, 2024

After switching to a more modern machine let's put this all together for the final results. We should know what to expect here:

───────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       │ File: gcTypesCpp.cpp
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1   │ #include <optional>
   2   │ #include <list>
   3   │ #include <array>
   4   │ #include <iostream>
   56using namespace std;
   78int main()
   9   │ {
  1011for(int I; I < 50000000; ++I)
  12   │     {
  1314   │         list<optional<array<int, 1>>> sourceList {
  15   │         array<int, 1>{0},
  16   │         array<int, 1>{1},
  17   │         {},
  18   │         array<int, 1>{1},
  19   │         array<int, 1>{0}
  20   │         };
  212223   │         list<optional<array<int, 1>>> copiedList(sourceList);
  24   │     };
  2526return 0;
  27   │ }
  28   │
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
❯ time ./gcTypesCpp
./gcTypesCpp  25.21s user 0.02s system 99% cpu 25.230 total
❯ time ./gcTypesCpp01
./gcTypesCpp01  0.00s user 0.00s system 85% cpu 0.004 total

C++ using std does this operation in 25 seconds but O1 optimizes it out completely and only increments the counter.

(edit: I've decided to follow up and measure this on the same machine by forcing C++ not to optimize. At that point it is doing something different but interesting to look at the performance at any rate.)

───────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       │ File: gcTypesCpp-NonOpt.cpp
───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1   │ #include <optional>
   2   │ #include <list>
   3   │ #include <array>
   4   │ #include <iostream>
   56using namespace std;
   78volatile int sink = 0; // Prevent optimization by introducing a volatile sink
   910int main() {
  11for (int I = 0; I < 50000000; ++I) {
  12   │         list<optional<array<int, 1>>> sourceList{
  13   │             array<int, 1>{0},
  14   │             array<int, 1>{1},
  15   │             {},
  16   │             array<int, 1>{1},
  17   │             array<int, 1>{0}
  18   │         };
  1920   │         list<optional<array<int, 1>>> copiedList(sourceList);
  2122// Introduce a side effect using a volatile variable
  23   │         sink += copiedList.size();
  24   │     };
  2526return 0;
  27   │ }
───────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
clang++ -std=c++17 gcTypesCpp-NonOpt.cpp -O3 -o gcTypesCpp-NonOpttime ./gcTypesCpp-NonOpt
./gcTypesCpp-NonOpt  5.91s user 0.00s system 99% cpu 5.912 total
───────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       │ File: src/main.rs
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1#![feature(box_patterns)]
   23#[allow(dead_code)]
   4#[derive(Clone)] // Deriving Clone for the enum
   5enum List {
   6Nil,
   7Cons(u32, Box<List>),
   8}
   910fn test() -> List {
  11List::Cons(0, Box::new(List::Cons(1, Box::new(List::Nil))))
  12}
  1314fn main() {
  15for _i in 1..50000000 {
  16let _list = test();
  1718let _copieidlist = test().clone();
  19}
  20}
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
❯ time cargo run src/main.rs
   Compiling rust-list-test v0.1.0 (/home/jessebennett/rust-list-test)
    Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.07s
     Running `target/debug/rust-list-test src/main.rs`
cargo run src/main.rs  10.12s user 0.07s system 100% cpu 10.152 total
───────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       │ File: src/main.rs
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1#![feature(box_patterns)]
   23//#[allow(dead_code)]
   4#[derive(Clone)] // Deriving Clone for the enum
   5enum List {
   6Nil,
   7Cons(u32, Box<List>),
   8}
   910fn test() -> List {
  11List::Cons(0, Box::new(List::Cons(1, Box::new(List::Nil))))
  12}
  1314fn main() {
  15for _i in 1..50000000 {
  16let _list = test();
  1718let _copieidlist = test().clone();
  19}
  20}
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
❯ time cargo run --release src/main.rs
   Compiling rust-list-test v0.1.0 (/home/jessebennett/rust-list-test)
warning: fields `0` and `1` are never read
 --> src/main.rs:7:10
  |
7 |     Cons(u32, Box<List>),
  |     ---- ^^^  ^^^^^^^^^
  |     |
  |     fields in this variant
  |
  = note: `List` has a derived impl for the trait `Clone`, but this is intentionally ignored during dead code analysis
  = note: `#[warn(dead_code)]` on by default
help: consider changing the fields to be of unit type to suppress this warning while preserving the field numbering, or remove the fields
  |
7 |     Cons((), ()),
  |          ~~  ~~

warning: `rust-list-test` (bin "rust-list-test") generated 1 warning
    Finished `release` profile [optimized] target(s) in 0.08s
     Running `target/release/rust-list-test src/main.rs`
cargo run --release src/main.rs  3.12s user 0.05s system 100% cpu 3.151 total

Testing Rust opts shows that the execution time is cut down by ~70% but yet still clocks in at 3 seconds and still does not optimize the loop out entirely. It is initially faster than the unoptimized C++ by ~250%.

───────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       │ File: gcTypes.c
───────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1#include <stdlib.h>
   2#include <stdint.h>
   3#include <stdio.h>
   45// Define the list structure
   6typedef struct Prims_list__uint32_t {
   7enum { Prims_Cons, Prims_Nil, Prims_Null } tag;
   8union {
   9struct {
  10struct Prims_list__uint32_t *tl;
  11uint32_t hd[1];
  12   │         };
  13struct {
  14struct Prims_list__uint32_t *ntl;
  15uint32_t nd[];
  16   │         };
  17   │     };
  18   │ } Prims_list__uint32_t;
  1920// Function to create a new Cons node
  21Prims_list__uint32_t *cons(uint32_t hd, Prims_list__uint32_t *tl) {
  22Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
  23node->tag = Prims_Cons;
  24node->hd[1] = hd;
  25node->tl = tl;
  26return node;
  27   │ }
  2829Prims_list__uint32_t *ncons(Prims_list__uint32_t *tl) {
  30Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
  31node->tag = Prims_Null;
  32node->ntl = tl;
  33return node;
  34   │ }
  3536// Function to create a new Nil node
  37Prims_list__uint32_t *nil() {
  38Prims_list__uint32_t *node = (Prims_list__uint32_t *)malloc(sizeof(Prims_list__uint32_t));
  39node->tag = Prims_Nil;
  40node->tl = NULL;
  41return node;
  42   │ }
  4344// Deep copy function
  45Prims_list__uint32_t *deep_copy_list(Prims_list__uint32_t *list) {
  46if (list == NULL) {
  47return NULL;
  48   │     }
  4950if (list->tag == Prims_Nil) {
  51return nil();
  52   │     }
  5354if (list->tag == Prims_Null) {
  5556Prims_list__uint32_t *new_list = ncons(NULL);
  5758new_list->tl = deep_copy_list(list->ntl);
  5960return new_list;
  61   │     }
  6263// Create a new node and copy the head value
  64Prims_list__uint32_t *new_list = cons(list->hd[1], NULL);
  6566// Recursively copy the rest of the list
  67new_list->tl = deep_copy_list(list->tl);
  6869return new_list;
  70   │ }
  7172// Free list function for cleanup
  73void free_list(Prims_list__uint32_t *list) {
  7475if(list->tag == Prims_Null) {
  76while (list != NULL) {
  77Prims_list__uint32_t *next = list->ntl;
  78free(list);
  79list = next;
  80   │        }
  81   │     }
  82else {
  83while (list != NULL) {
  84Prims_list__uint32_t *next = list->tl;
  85free(list);
  86list = next;
  87   │         }
  88   │     }
  89   │ }
  9091// Print list function
  92void print_list(Prims_list__uint32_t *list) {
  93printf("[");
  94while (list != NULL) {
  95if (list->tag == Prims_Cons) {
  96printf("%u", list->hd[1]);
  97list = list->tl;
  98if (list != NULL) {
  99printf(" -> ");
 100   │             }
 101   │         } else if (list->tag == Prims_Nil) {
 102printf("Nil");
 103list = list->tl;
 104if (list != NULL) {
 105printf(" -> ");
 106   │             }
 107   │         } else if (list->tag == Prims_Null) {
 108printf("Null");
 109list = list->ntl;
 110if (list != NULL) {
 111printf(" -> ");
 112   │             }
 113   │         }
 114   │     }
 115printf("]\n");
 116   │ }
 117118// Test the deep copy function
 119int main() {
 120121for(int I = 0; I < 50000000; ++I)
 122   │     {
 123// Create an example list: 0 -> 1 -> Null -> 1 -> 0 -> Nil
 124Prims_list__uint32_t *original_list = cons(0, cons(1, ncons(cons(1, cons(0, nil())))));
 125126Prims_list__uint32_t *copied_list = deep_copy_list(original_list);
 127128free_list(original_list);
 129130free_list(copied_list);
 131   │     }
 132133return 0;
 134   │ }
───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
❯ time ./gcTypes
./gcTypes  6.68s user 0.00s system 99% cpu 6.686 total
❯ time ./gcTypes01
./gcTypes01  5.19s user 0.00s system 99% cpu 5.191 total

And finally we have C with light assistance from Low*. At 6.68s this is the fastest bar-none. I have even left in the code for the full implementation just as these other languages are pulling in the entire libraries to accomplish the same thing. Optimization shaves 1 second off and then it becomes the slowest but no effort has been made to actually do anything about that. This is just one test and many languages are multi-faceted when it comes to performance but these consistent results are showing the value of formal verification. I will leave you with wise words from one of the luminaries of my time who went by the name "Biggie Smalls":

You can be as good as the best of them. But as bad as the worst, so don't test me (Get money) You better move over (Get money)

Notorious B.I.G, Young M.A.F.I.A., Gettin' Money (Get Money Remix)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment