Cranelift

#idempotent

Cranelift IR:

Functions are compiled individually
The instructions in the function body use and produce values in SSA form i.e. Single Static Assignment
- SSA construction pushed to the frontend - done with the cranelift_frontend crate
- The cranelift_frontend crate contains utilities for translating from programs containing multiple assignments to the same variables into SSA form for Cranelift [IR].
Cranelift’s IR is typed i.e. All SSA values have a type which determines the size and shape of the value.
- Integer types:
- Floating point types:
- SIMD vector types:
Memory instructions:
- has fully general load and store instructions for accessing memory.
Uses traditional CFG (control-flow graphs) using basic blocks.

rustc’s cg_clif compiles the following:

../rustc_codegen_cranelift/dist/bin/rustc-clif --emit=llvm-ir src/main.rs

#[no_mangle]
fn test(a: u8, b: u8) -> u8 {
    if a > b {
        let c = a - b;
        c
    } else {
        let c = a + b;
        c
    }
}

to this

; compiler options
set opt_level=none
set tls_model=macho
set libcall_call_conv=isa_default
set probestack_size_log2=12
set probestack_strategy=inline
set bb_padding_log2_minus_one=0
set regalloc_checker=0
set regalloc_verbose_logs=0
set enable_alias_analysis=1
set enable_verifier=0
set enable_pcc=0
set is_pic=1
set use_colocated_libcalls=0
set enable_float=1
set enable_nan_canonicalization=0
set enable_pinned_reg=0
set enable_atomics=1
set enable_safepoints=0
set enable_llvm_abi_extensions=1
set unwind_info=1
set preserve_frame_pointers=1
set machine_code_cfg_info=0
set enable_probestack=1
set enable_jump_tables=1
set enable_heap_access_spectre_mitigation=1
set enable_table_access_spectre_mitigation=1
set enable_incremental_compilation_cache_checks=0
target aarch64 has_lse=0 has_pauth=0 sign_return_address_all=0 sign_return_address=0 sign_return_address_with_bkey=0 use_bti=0


function u0:9(i8, i8) -> i8 apple_aarch64 {
; symbol test
; instance Instance { def: Item(DefId(0:4 ~ main[1152]::test)), args: [] }
; abi FnAbi { args: [ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }], ret: ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, c_variadic: false, fixed_count: 2, conv: Rust, can_unwind: false }

; kind  loc.idx   param    pass mode                            ty
; ssa   _0    u8                                1b 1, 1              var=0
; ret   _0      -          Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg   _1      = v0       Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg   _2      = v1       Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8

; kind  local ty                              size align (abi,pref)
; ssa   _1    u8                                1b 1, 1              var=1
; ssa   _2    u8                                1b 1, 1              var=2
; ssa   _3    bool                              1b 1, 1              var=3
; ssa   _4    u8                                1b 1, 1              var=4
; ssa   _5    (u8, bool)                        2b 1, 8              var=(5, 6)
; ssa   _6    u8                                1b 1, 1              var=7
; ssa   _7    (u8, bool)                        2b 1, 8              var=(8, 9)

    gv0 = symbol colocated userextname0 ; alloc10
    gv1 = symbol colocated userextname2 ; alloc11
    sig0 = (i64) apple_aarch64
    sig1 = (i64) apple_aarch64
    fn0 = u0:11 sig0 ; "_ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E"
    fn1 = u0:12 sig1 ; "_ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E"

block0(v0: i8, v1: i8):
    v2 -> v0
    v5 -> v0
    v11 -> v0
    v3 -> v1
    v6 -> v1
    v12 -> v1
    nop 
; write_cvalue: Var(_1, var1): u8 <- ByVal(v0): u8
; write_cvalue: Var(_2, var2): u8 <- ByVal(v1): u8
    jump block1

block1:
    nop 
; _3 = Gt(copy _1, copy _2)
    v4 = icmp.i8 ugt v2, v3
; write_cvalue: Var(_3, var3): bool <- ByVal(v4): bool
; 
; switchInt(move _3)
    brif v4, block2, block4

block2:
    nop 
; _5 = SubWithOverflow(copy _1, copy _2)
    v7 = isub.i8 v5, v6
    v10 -> v7
    v8 = icmp ugt v7, v5
; write_cvalue: VarPair(_5, var5, var6): (u8, bool) <- ByValPair(v7, v8): (u8, bool)
; 
; assert(!move (_5.1: bool), "attempt to compute `{} - {}`, which would overflow", copy _1, copy _2)
    brif v8, block7, block3

block7 cold:
    nop 
    v9 = global_value.i64 gv0
    call fn0(v9)
; lib_call _ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E
    trap unreachable

block3:
    nop 
; _4 = move (_5.0: u8)
; write_cvalue: Var(_4, var4): u8 <- ByVal(v10): u8
; _0 = copy _4
; write_cvalue: Var(_0, var0): u8 <- ByVal(v10): u8
; 
; goto
    return v10

block4:
    nop 
; _7 = AddWithOverflow(copy _1, copy _2)
    v13 = iadd.i8 v11, v12
    v16 -> v13
    v14 = icmp ult v13, v11
; write_cvalue: VarPair(_7, var8, var9): (u8, bool) <- ByValPair(v13, v14): (u8, bool)
; 
; assert(!move (_7.1: bool), "attempt to compute `{} + {}`, which would overflow", copy _1, copy _2)
    brif v14, block8, block5

block8 cold:
    nop 
    v15 = global_value.i64 gv1
    call fn1(v15)
; lib_call _ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E
    trap unreachable

block5:
    nop 
; _6 = move (_7.0: u8)
; write_cvalue: Var(_6, var7): u8 <- ByVal(v16): u8
; _0 = copy _6
; write_cvalue: Var(_0, var0): u8 <- ByVal(v16): u8
; 
; goto
    return v16

block6:
    v18 = iconst.i8 0
    v17 -> v18
    nop 
; 
; return
    return v17  ; v17 = 0
}

Notes:

cranelift-module uses u0:X to refer to function X within the current module (can be an import) and u1:Y to refer to data object Y.
The v2 -> v0 are aliases. So in this case all references to v2 are implicitly replaced with v0 during compilation. They mostly exist to make the SSA lowering in cranelift-frontend easier.

struct FnAbi {
    args: [
        ArgAbi {
            layout: TyAndLayout {
                ty: u8,
                layout: Layout {
                    size: Size(1 bytes),
                    align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                    abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                    fields: Primitive,
                    largest_niche: None,
                    variants: Single { index: 0 },
                    max_repr_align: None,
                    unadjusted_abi_align: Align(1 bytes),
                }
            },
            mode: Direct(ArgAttributes { 
                regular: NoUndef, 
                arg_ext: None, 
                pointee_size: Size(0 bytes), 
                pointee_align: None 
            })
        },
        ArgAbi {
            layout: TyAndLayout {
                ty: u8,
                layout: Layout {
                    size: Size(1 bytes),
                    align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                    abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                    fields: Primitive,
                    largest_niche: None,
                    variants: Single { index: 0 },
                    max_repr_align: None,
                    unadjusted_abi_align: Align(1 bytes),
                }
            },
            mode: Direct(ArgAttributes { 
                regular: NoUndef, 
                arg_ext: None, 
                pointee_size: Size(0 bytes), 
                pointee_align: None 
            })
        }
    ],
    ret: ArgAbi {
        layout: TyAndLayout {
            ty: u8,
            layout: Layout {
                size: Size(1 bytes),
                align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                fields: Primitive,
                largest_niche: None,
                variants: Single { index: 0 },
                max_repr_align: None,
                unadjusted_abi_align: Align(1 bytes),
            }
        },
        mode: Direct(ArgAttributes { 
            regular: NoUndef, 
            arg_ext: None, 
            pointee_size: Size(0 bytes), 
            pointee_align: None 
        })
    },
    c_variadic: false,
    fixed_count: 2,
    conv: Rust,
    can_unwind: false
}

A High-Level Overview of Cranelift’s Pipeline:

CLIF -> legalization -> mid-end egraph rewrites if enabled (rules in ISLE) -> lowering to backend-specific VCode (rules in ISLE) -> regalloc -> binary emission

Cranelift Code-generator:

translates Cranelift IR to machine level IR (with machine op codes)
- we are actually mapping the Cranelift IR to a particular target architecture
  - For example: Cranelift IR’s iadd instruction takes two parameters and produces a parameter of the same type.
  - we can map this to an equivalent instruction in say x86-64 and call it an encoding.
  - Legalization - the goal is to turn IR instructions into instructions that are legal for a target machine or architecture.
Code-generators focus on optimizations such as -
- GVN (Global value numbering)
- LICM (Loop invariant code motion)
- DCE (Dead code elimination)

For each codegend function three files are emitted.

The .unopt.clif file contains what cg_clif emits.
The .opt.clif file contains the result of optimizing by Cranelift and
The .vcode file contains the ir right before cranelift emits machine code bytes.

Testing Cranelift

The Cranelift project includes clif-util, a Cranelift code generator utility that can be used for various tasks such as testing, execution, and interpretation.

    Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.37s
     Running `/Users/nihal.pasham/devspace/compiler/wasmtime/target/debug/clif-util -h`
Cranelift code generator utility

Usage: clif-util <COMMAND>

Commands:
  test            Run Cranelift tests
  run             Execute clif code and verify with test expressions
  interpret       Interpret clif code
  cat             Outputs .clif file
  print-cfg       Prints out cfg in GraphViz Dot format
  compile         Compiles Cranelift IR into target language
  pass            Run specified pass(es) on an input file
  bugpoint        Reduce size of clif file causing panic during compilation
  souper-harvest  Harvest candidates for superoptimization from a Wasm or Clif file
  help            Print this message or the help of the given subcommand(s)

Options:
  -h, --help  Print help

Example:

cargo run -- test filetests/filetests/isa/aarch64/arithmetic.clif

Craneflit Codegen (post mid-end optimization)

Terminologies:

CLIF - Cranelift IR
VCode - Virtual Register Code
regalloc - Register Allocation
binemit - Binary Emission

Flow: Translating (high-level IR) CLIF to (low-level IR) VCode is called lowering.

As VCode is target-dependent, the translation process is also target-specific.
This is where we consider which machine instructions will eventually be used for a given CLIF opcode.
There are many ways to achieve the same machine state results for given semantics, but some of these ways are faster than others and/or require fewer code bytes to achieve.
The problem can be summed up like this:
- given some CLIF,
- which VCode can we create
- to generate the fastest
- and/or smallest machine code
- that carries out the desired semantics?
This is called instruction selection, because we're selecting the VCode instructions from a set of different possible instructions.
A given CLIF node may be lowered into
- 1 to N VCode instructions.
A given VCode instruction may lead to the generation of
- 1 to M machine instructions.
There are no rules governing the maximum number of entities mapped.
For instance, the integer addition CLIF opcode iadd on 64-bit inputs maps to a single VCode instruction on AArch64.
The VCode instruction then causes a single machine instruction to be generated.

ISLE:

- ISLE is a domain specific language (DSL) - stands for instruction selection or lowering expressions. - ISLE is a statically-typed term-rewriting language. - You define rewriting rules that map input terms (`clif` instructions) into output terms (`MachInsts`). - These rules get compiled down into Rust source that uses a tree of match expressions that is as good or better than what you would have written by hand. - [The Context trait:](https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/isle-integration.md#gluing-isles-generated-code-into-cranelift) Each ISA-specific, ISLE-generated file is generic over a `Context` trait that has a trait method for each extern helper defined in ISLE. - `ISLE build process:` The ISLE compiler is set up as a build dependency, and the build script uses it to compile ISLE source into generated Rust code. - A normal build typically generates an `out` folder within the `target` directory, containing all ISLE-to-Rust source translations.

The ISLE README.md has a pretty good example.

Describes ISLE constructors, external constructors, extractors

A term can be declared as both a constructor and an extractor because:

A constructor allows you to create the type. If it’s used in an expression (RHS) that creates a new value, it’s a constructor.

An extractor allows you to pattern-match and de-structure the type. If it’s used in a pattern (LHS) that de-structures or matches an existing value, it’s an extractor. This versatility lets ISLE handle different scenarios more concisely and efficiently, using the same name in contexts where each role is clear.

Register Allocation:

The register allocation problem involves mapping intermediate representation (IR) values, often referred to as registers (or Reg in VCode), to machine-specific "containers" such as physical registers or memory locations. This process is known as register allocation (regalloc).

Regalloc2 implementation:

The implementation uses an abstraction that includes the following types:

Operand: virtual registers (or VRegs) defined or used in an instruction, along with their constraints.
Function: A trait implemented by the client that provides the allocator with information about function blocks, block instructions, instruction operands, block predecessors, successors, and additional details.
MachinEnv: A struct that holds information about resources on the target machine, including physical registers for each register class, scratch registers, and registers designated as fixed stack slots.
Allocation: An ‘Allocation’ represents the end result of regalloc for an Operand.
Output: The output of the register allocator.
- among other things, holds a Vec<Allocation> ’s - the main output that contains the final allocations for all operands across all instructions in the function.
- edits - new instructions to be inserted at specific program points (before or after instructions, mostly stackspills).

Key Points:

Virtual Registers: Inputs to register allocation are numerous and map to virtual values, known as virtual registers. The V in VCode stands for these virtual registers. VCode instructions reference values that are virtual registers before register allocation, hence the code is in virtualized register form.
Output of Register Allocation: The output is a set of instructions where virtual registers have been replaced by physical registers or stack slot references, constrained by the limited number of physical registers available on the machine. Additional metadata may also be produced to assist with this mapping.
Purpose and Challenges: The main challenge in register allocation is efficiently utilizing the limited physical registers while minimizing the need to spill registers to memory, which can degrade performance. This involves deciding which variables should be kept in registers and which should be moved to memory, a process that typically includes techniques like graph coloring and live range splitting.

Notes:

Register allocation may create an arbitrary number of spills, reloads, and register moves[^Regalloc2 impl] around VCode instructions to ensure that their register allocation constraints are met.
This is why the output of register allocation is a new list of instructions that includes not only the initial instructions filled with the actual registers but also additional spill, reload, and move (VCode) instructions added by regalloc.

E-graphs:

A data structure that represents an equivalence relationship over terms
It’s composed of equivalence classes or e-classes which are sets of e-nodes
These e-nodes are basically operators in a given language
- Like /or * operators
- Or childless operators like a and 2

- an operator takes as its children not another operator but a whole family of equivalent operators (i.e. children are e-classes themselves) - We can grow our e-graph by adding equivalence rewrites to it - like so

- Final rewrite doesn’t add any more e-nodes but it does decrease the number of e-classes - So, we shrank the e-graph but it contains more equivalences

- if you kept adding rewrites, you may get to a point where you are not adding any more information to the e-graph (i.e. we are not adding new e-classes or e-nodes) - This is called saturation or equality saturation

- the last step is to extract - extraction is a procedure which allows us to pick out the best represented term (i.e. the best possible equivalent term) according to some cost function from a specific e-class (i.e. the initial term) - There’s a lot of ways to extract

Idempotent means that performing the same operation multiple times has the same effect as performing it once.

Cranelift Impl Details

Essential Cranelift Data Structures (i.e. cranelift types):

Cranelift Entities:

Context: A persistent data structure used to hold the state of a compilation pipeline.
- Function: Represents the function currently being compiled.
- Entity References: References that are used to identify various entities within a function, such as blocks, instructions, and stack slots.
  - Note: they are not your typical rust references. Entity references are structs wrapping a u32 index into a table in the Function main data structure.
PrimaryMap: A map that tracks a vector of entities, assigning a unique reference to each. It's used to allocate new entities that can be indexed via unique entity references.
SecondaryMap: A simple vector-based map used to associate additional information with entities. Unlike PrimaryMap, it doesn’t allocate new entity references and maps unknown entities to a default value.
Layout: Defines the order and number of blocks and instructions in a function.
DataFlowGraph: Represents all instructions and basic blocks within a function, along with their data flow dependencies.
ControlFlowGraph: Maintains the mapping of blocks to their predecessors and successors, where both predecessors and successors are basic blocks.

Cranelift Instruction Formats and Data:

Every Cranelift Opcode has a corresponding instruction format represented by two enums: InstructionFormat and InstructionData. These enums determine the number of operands and their types.

InstructionFormat: An enum containing all Cranelift instruction formats as variants
InstructionData: An enum with the same variants as InstructionFormat, but it also includes additional data for each variant, such as the opcode and its operands (immediates and values).

Cranelift ISLE types:

Examples are Riscv64IsleContext

We get to implement a Context trait for every target (or backend) - see [[Cranelift/ISLE:]]
This trait implementation contains common methods for all targets. Their implementation is in prelude.rs

Cranelift Backends:

Examples are Aarch64Backend or Riscv64Backend etc.
The isa::lookup() function is the main entry point which returns an isa::Builder appropriate for the requested ISA.
- The IsaBuilder wraps a backend.
- This function returns an IsaBuilder, allowing you to configure a backend with both isa-specific and shared (i.e. architecturally common) settings.
- Cranelift filetests use a custom parser to handle .clif files. When parsing the target command line in a .clif file, the parser looks up a suitable backend using the isa::lookup.

How do we read Cranelift’s (ISLE) lowering rules:

;; Defined in cranelift/codegen/src/isa/riscv64/lower.isle
;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(decl partial lower (Inst) InstOutput)

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Base case: simply adding things in registers.
(rule -1 (lower (has_type (fits_in_32 (ty_int ty)) (iadd x y)))
  (rv_addw x y))

;; Defined in target/debug/build/cranelift-codegen-aa52b046ac4ee4de/out/clif_lower.isle
;; Iadd is a Cranelift instruction (Opcode with operands) that needs to be lowered to the `rv_addw` expression.
;; The first step is using the `iadd` extractor to extract the `InstructionData` for an `Inst`, which maps to the external extractor `inst_data`.
;; `inst_data` produces the first match for `&InstructionData::Binary` and, subsequently, matches the rest of the constraints: `fits_in_32`, `ty_int`, and `has_type`.
(decl iadd (Value Value) Inst)
(extractor
    (iadd x y)
    (inst_data (InstructionData.Binary (Opcode.Iadd) (value_array_2 x y)))
)

;; Defined in cranelift/codegen/src/prelude_lower.isle
;; Extract the `InstructionData` for an `Inst`.
(decl inst_data (InstructionData) Inst)
(extern extractor infallible inst_data inst_data)


;;;; Value Arrays ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Defined in target/debug/build/cranelift-codegen-aa52b046ac4ee4de/out/clif_lower.isle
;; ISLE representation of `[Value; 2]`.
(type ValueArray2 extern (enum))

(decl value_array_2 (Value Value) ValueArray2)
(extern constructor value_array_2 pack_value_array_2)
(extern extractor infallible value_array_2 unpack_value_array_2)


;; Defined in cranelift/codegen/src/prelude_lower.isle
;; Internal extractors (e.g., has_type) are inlined, i.e., they are simply mapped to an external extractor.

;; Extract the type of the instruction's first result and pass along the
;; instruction as well.
(decl has_type (Type Inst) Inst)
(extractor (has_type ty inst)
           (and (result_type ty)
                inst))

;; Extract the type of the instruction's first result.
(decl result_type (Type) Inst)
(extractor (result_type ty)
           (first_result (value_type ty)))

;; Extract the first result value of the given instruction.
(decl first_result (Value) Inst)
(extern extractor first_result first_result)

;; Extract the type of a `Value`.
(decl value_type (Type) Value)
(extern extractor infallible value_type value_type)

// Corresponding Rust code for external extractors
//
// Note: `Inst` here is a Cranelift instruction entity reference, and `MInst` represents an
// actual machine instruction used to emit the final corresponding RISC-V instruction.

// These three are defined in cranelift/codegen/src/machinst/isle.rs.
// They look up the `DataFlowGraph` for the function and retrieve the relevant information.
#[inline]
fn value_type(&mut self, val: Value) -> Type {
	self.lower_ctx.dfg().value_type(val)
}

#[inline]
fn first_result(&mut self, inst: Inst) -> Option<Value> {
	self.lower_ctx.dfg().inst_results(inst).first().copied()
}

#[inline]
fn inst_data(&mut self, inst: Inst) -> InstructionData {
	self.lower_ctx.dfg().insts[inst]
}

// These two are defined in cranelift/codegen/src/isle_prelude.rs.
#[inline]
fn fits_in_32(&mut self, ty: Type) -> Option<Type> {
	if ty.bits() <= 32 && !ty.is_dynamic_vector() {
		Some(ty)
    } else {
        None
    }
}

#[inline]
fn ty_int(&mut self, ty: Type) -> Option<Type> {
	ty.is_int().then(|| ty)
}

When reading a lowering rule, all terms on the LHS (or pattern to be matched) are extractors. For example:

has_type is an internal extractor. Internal extractors simply expand to (or map to) an external extractor (see above)
fits_in_32 is an external extractor that only matches types that can fit in 32 bits.
ty_int is also an external extractor that checks if the type of the result of an iadd operation is an integer.

and all terms on the RHS are constructors.

Extractors can also appear in rules defined for ISLE declarations. In this context, extractors may look different from their usage elsewhere. For example:

;; Here, we declare an ISLE function to load immediates.
(decl imm (Type u64) Reg)

;; The following rule defines how to handle certain immediate values.
;; Notice the `if-let` construct, which is followed by an external extractor `i64_shift_for_lui`.
;; This is how to interpret the rule:
;; - The function `i64_shift_for_lui` takes the result of `(i64_sextend_u64 ty c)` (an `i64`)
;;   and produces a tuple `(u64, Imm12)` — representing the base and shift values.
;; - The `base` value is then passed to another external extractor, `imm20_from_u64`.
;; - Only if both extractions succeed will the rule's left-hand side (LHS) be replaced with
;;   the right-hand side (RHS): `(rv_slli (rv_lui base) shift)`.

;; In essence, if the non-zero bits of the immediate fit within 20 bits, 
;; the immediate can be encoded using `LUI` (Load Upper Immediate) and a subsequent shift.
(rule 1 (imm (ty_int ty) c)
  (if-let (i64_shift_for_lui (imm20_from_u64 base) shift) (i64_sextend_u64 ty c))
  (rv_slli (rv_lui base) shift))

Assuming all the above constraints are met, we replace the pattern (LHS) with the following expression: RHS

(rv_addw x y)

;; whose definition is as follows

;; Defined in cranelift/codegen/src/isa/riscv64/inst.isle
;; Helper for emitting the `addw` ("Add Word") instruction.
;; rd ← sext32(rs1) + sext32(rs2)
(decl rv_addw (XReg XReg) XReg)
(rule (rv_addw rs1 rs2)
  (alu_rrr (AluOPRRR.Addw) rs1 rs2))

;; which, in turn, is translated into:

;; Defined in cranelift/codegen/src/isa/riscv64/inst.isle
;; Helper for emitting `MInst.AluRRR` instructions.
(decl alu_rrr (AluOPRRR Reg Reg) Reg)
(rule (alu_rrr op src1 src2)
      (let ((dst WritableXReg (temp_writable_xreg))
            (_ Unit (emit (MInst.AluRRR op dst src1 src2))))
        dst))

For x and y, we declare an external constructor to place a value into a register using constructor_put_in_xreg.
For rv_addw: In Cranelift, terms like rv_addw are abstractions representing architecture-specific instructions. This constructor invokes an alu_rrr operation.
- alu_rrr: This emits the actual RISC-V equivalent instruction (as shown below).

// The generated file is located in the target directory within the out folder for each specified
// target.
//
// Generated as an internal constructor for term alu_rrr.
pub fn constructor_alu_rrr<C: Context>(ctx: &mut C, arg0: &AluOPRRR, arg1: Reg, arg2: Reg) -> Reg {
    let v3 = constructor_temp_writable_xreg(ctx);
    let v4 = C::writable_xreg_to_writable_reg(ctx, v3);
    let v5 = MInst::AluRRR {
        alu_op: arg0.clone(),
        rd: v4,
        rs1: arg1,
        rs2: arg2,
    };
    let v6 = C::emit(ctx, &v5);
    let v7 = constructor_writable_xreg_to_reg(ctx, v3);
    return v7;
}

// Generated as an internal constructor for term rv_addw.
pub fn constructor_rv_addw<C: Context>(ctx: &mut C, arg0: XReg, arg1: XReg) -> XReg {
    let v3 = C::xreg_to_reg(ctx, arg0);
    let v4 = C::xreg_to_reg(ctx, arg1);
    let v5 = constructor_alu_rrr(ctx, &AluOPRRR::Addw, v3, v4);
    let v6 = C::xreg_new(ctx, v5);
    return v6;
}

// Generated as an internal constructor for term lower.
// PS: The `Lower` constructor contains `10k` lines of code.
pub fn constructor_lower<C: Context>(ctx: &mut C, arg0: Inst) -> Option<InstOutput> {
    let v4 = &C::inst_data(ctx, arg0);
    match v4 {
        ...
        &InstructionData::Binary { opcode: ref v46, args: ref v47 } => {
            match v46 {
                ...
                &Opcode::Iadd => {
                    let v1 = C::first_result(ctx, arg0);
                    if let Some(v2) = v1 {
                        let v3 = C::value_type(ctx, v2);
                        ...
                        let v42 = C::fits_in_32(ctx, v3);
                        if let Some(v43) = v42 {
                            let v44 = C::ty_int(ctx, v43);
                            if let Some(v45) = v44 {       
                                let v48 = C::unpack_value_array_2(ctx, v47);
                                let v51 = constructor_put_in_xreg(ctx, v48.0);
                                let v52 = constructor_put_in_xreg(ctx, v48.1);
                                let v53 = constructor_rv_addw(ctx, v51, v52);
                                let v54 = constructor_output_xreg(ctx, v53);
                                let v55 = Some(v54);
                                return v55;
                            }
                        }
                    }
                }
            }
        }
    }
}

Cranelift Lowering & Binary Emission:

Cranelift Lowering & Binary Emission

    ir::Function                (SSA IR, machine-independent opcodes)
        |
        |  [lower]
        |
    VCode<arch_backend::Inst>   (machine instructions:
        |                        - mostly virtual registers.
        |                        - cond branches in two-target form.
        |                        - branch targets are block indices.
        |                        - in-memory constants held by insns,
        |                          with unknown offsets.
        |                        - critical edges (actually all edges)
        |                          are split.)
        |
        | [regalloc --> `regalloc2::Output`; VCode is unchanged]
        |
        | [binary emission via MachBuffer]
        |
    Vec<u8>                     (machine code:
        |                        - two-dest branches resolved via
        |                          streaming branch resolution/simplification.
        |                        - regalloc `Allocation` results used directly
        |                          by instruction emission code.
        |                        - prologue and epilogue(s) built and emitted
        |                          directly during emission.
        |                        - SP-relative offsets resolved by tracking
        |                          EmitState.)

Types used in Binary Emission:

Lower: The type that handles the lowering process from CLIF IR to VCode<MInst>.
- The VCode type serves as an abstraction—a container for target-specific MInst instances. In Cranelift, a sequence of machine instructions organized into basic blocks is referred to as vcode, short for virtual-register code.
- Cranelift introduces two closely related types: EntityList and ListPool
  - EntityList: Instead of using a structure like Vec<T>, the memory for an EntityList is managed by a ListPool<T>. An EntityList wraps a u32 index, which points to the starting position of the actual list in the ListPool’s data vector. If the index is 0, the list is considered empty.
  - ListPool: Acts as the shared storage for multiple lists.
  - By delegating memory ownership and management to the ListPool, EntityList<T> reduces its size to the minimum necessary for locating its data—a 4-byte index. This design is ideal for scenarios that demand compact, memory-efficient data structures, such as compiler implementations like Cranelift.

A side note: ~~Global value proof-carrying-code facts:~~ We can associate GlobalValue’s in the function with optional facts. Facts describe properties or constraints that the GlobalValue must satisfy. These constraints are used for verification or optimization during code generation. ~~Memory types for proof-carrying code:~~ A MemoryType is an abstract representation of structured memory (like a struct) or unstructured memory (like a blob). memory_types enables reasoning about memory layouts during compilation.

MInst: This type represents target-specific instruction formats and is implemented as an enum.
- Note: This is a generated type. Target instruction formats are defined in ISLE, which generates the corresponding Rust type.
- MInst does not directly represent the raw bytes of a target instruction—it is a Cranelift abstraction.
Vcode: A struct that serves as a container for machine instructions at various stages of construction.
MachBuffer:
- Raw code bytes (the actual machine instructions).
- Metadata like where variables live, where jumps or calls go, or where debug information maps to source code.
- Uses SmallVec for storage
- It is generic over a type that implements MachInst and MachInstEmit traits.

MachBuffer Key Fields and Their Purpose:

data: Stores the machine code bytes.
relocs: Tracks external references (e.g., a call to another function or a jump to another module).
traps: Stores information about trap points (e.g., for error handling).
user_stack_maps: Metadata for garbage collection, describing where pointers are in the stack.
label_offsets and label_aliases: Track where labels (like jump targets) are in the code.
pending_constants: Keeps track of constants that need to be inserted into the code.
fixup_records: Stores information about jumps or calls that can only be resolved after all code is emitted.

Relocation tables:

Imagine a binary where a function call foo() is made, but the address of foo() is resolved at runtime.

Source Code:

void call_function() {
    foo();  // Call to an external function.
}

Generated Machine Code:

Initially, the function address for foo() is not known.
The instruction might contain a placeholder for the address to be resolved later:
Here, auipc (add upper immediate to PC) loads the upper 20 bits of the offset between the target address and PC, and jalr (jump and link register) completes the jump using the lower bits.

auipc t0, %pcrel_hi(foo@plt)
jalr t1, %pcrel_lo(foo@plt)(t0)

Step-by-Step Calculation:

1. Current PC:
 • Assume PC = 0x1000 (the address of the auipc instruction).

2. Function Address:
 • Target function address = 0x3004.

3. Offset:
	offset = function_address - current_pc
       = 0x3004 - 0x1000
       = 0x2004

4. Split Offset into High and Low Parts:
 • For auipc, we only care about the high 20 bits of the offset.
 • High 20 bits of offset:
	high_bits = (offset + 0x800) >> 12
         = (0x2004 + 0x800) >> 12
         = (0x2804) >> 12
         = 0x2
* The addition of 0x800 ensures proper rounding when truncating lower bits during the shift (to handle sign-extension and alignment).

5. AUIPC Encodes the High 20 Bits:
 • The auipc instruction takes 0x2 as the immediate value:

auipc t0, 0x2

 • Internally, this computes:
	t0 = PC + (0x2 << 12)
   		= 0x1000 + 0x2000
   		= 0x3000

JALR Resolves the Low Bits:

1. Remaining Offset (Low 12 Bits):
 • The auipc instruction only handled the high part of the offset. The low 12 bits are resolved by the immediate value in the jalr instruction.
 • From the original offset 0x2004:
	low_bits = offset & 0xFFF
         = 0x2004 & 0xFFF
         = 0x004

2. JALR Instruction:
 • The jalr instruction uses t0 (from auipc) and adds the low_bits to compute the final target address:
jalr ra, t0, 4
 • This computes:
	PC = t0 + imm
   		= 0x3000 + 0x004
   		= 0x3004

Summary:
 • AUIPC Immediate (0x2):
 	• Derived from the high 20 bits of the PC-relative offset (0x2004).
 	• Computed as: (offset + 0x800) >> 12.
 • JALR Immediate (0x4):
 	• Derived from the low 12 bits of the PC-relative offset (0x2004).
 	• Computed as: offset & 0xFFF.

Relocation Table:

A relocation entry is created for foo in the ELF binary:

Offset:  0x004
Type:    R_RISCV_CALL_PLT
Symbol:  foo

At Runtime:

The linker resolves the address of foo() and patches the instructions so that they jump to the resolved address.

+------------------------+
| Instruction Memory     |
|  auipc t0, %pcrel_hi   | -> Placeholder for foo@plt
|  jalr t1, %pcrel_lo    | -> Placeholder for foo@plt
+------------------------+

       Relocation Table (contains a single entry)
+-------------------------------------+
| Offset  | Type             | Symbol |
| 0x004   | R_RISCV_CALL_PLT | foo    |
+-------------------------------------+

At Runtime:
+------------------------+
| Instruction Patched to |
|  auipc t0, hi(foo)     | -> Resolved foo's address
|  jalr t1, lo(foo)(t0)  |
+------------------------+

Why Only One Relocation Entry Is Needed:

The RISC-V relocation model assumes the linker will:

Identify the auipc and jalr pair based on the relocation type (R_RISCV_CALL_PLT).
Use the symbol’s resolved address to compute:
- The high 20 bits for auipc (patched first).
- The low 12 bits for jalr (patched next).

Thus, a second relocation entry isn’t necessary because the relocation type and context provide all the needed information.

Advantages of This Approach

Compact Relocation Table:
- By storing only one relocation entry, the relocation table remains smaller and simpler, especially for large programs with many such call sequences.
Reduced Complexity for Assemblers and Compilers:
- Assemblers don’t need to emit separate relocation entries for auipc and jalr, simplifying code generation.
Linker Handles the Association:
- The linker, which is already responsible for patching instructions, manages the pairing implicitly without needing redundant entries

What is a Global Offset Table (GOT)?

The Global Offset Table (GOT) is a structure used in Position-Independent Code (PIC) to facilitate the relocation of global symbols at runtime. It holds the absolute addresses of global variables and functions, which may be relocated by the operating system’s dynamic linker.

In simpler terms, the GOT is like a lookup table that stores the runtime addresses of symbols so that code can access them indirectly.

Steps to Add a New Backend:

Create a Folder for the Backend
- Place the new backend directory under /cranelift/codegen/src/isa, where each backend resides.
Define Backend and Implement Required Traits
- Ensure the backend implements these essential traits:
  - TargetIsa: Specifies the target architecture’s interface.
  - LowerBackend: Manages instruction lowering for the architecture.
Add Core ISLE Files to the Backend Folder
- inst.isle: Define ISA-specific instructions, encodings, registers, and settings for the target architecture using ISLE (Instruction Selection or Lowering Expressions).
- lower.isle: Specify instruction selection and CLIF (Cranelift Intermediate Representation) to MachInst (Machine Instructions) lowering rules.
Implement an ABI for the Target Architecture
- Place the ABI implementation in the appropriate folder within the backend directory.
Add ISA-Specific Settings
- Define any required ISA-specific settings in cranelift-codegen-meta, located at:
  - /cranelift/codegen/meta/src/isa/<arch>
Include Filetests for Basic Functionality
- Add initial filetests to verify the backend’s basic functionality in:
  - /cranelift/filetests/filetests/isa/<arch>

[[Cranelift/ISLE:]] we define rewriting rules that map input terms (clif instructions) into output terms (MachInsts, a type that represents an actual machine instruction).
[[Cranelift/E-graphs:]] We can perform an E-graph pass over a function, identifying similar values with multiple representations and selecting the optimal representation for each used value. This approach, known as non-destructive term rewriting, enables Cranelift's E-graphs to select the best value among alternatives, which can then be passed to an ISLE constructor.

Example Function and Its Stack Frame in Cranelift for RV32I:

fn example(a: i32, b: i32) -> i32 {
    let c = a + b;       // Local variable stored in stack slot.
    let d = helper(c);   // Tail call (outgoing args considered).
    d
}

    High Memory
    +---------------------+
    | Incoming Args (a, b)| <-- Stack grows downward
    +---------------------+
    | Return Address      | <-- Stored by `jal` instruction
    +---------------------+
    | Saved Frame Pointer | <-- Saved `fp` register
    +---------------------+
    | Callee-Saved Regs   | <-- Save `s0, s1, s2, s3`
    +---------------------+
    | Local Var (c)       | <-- Space for local `c`
    +---------------------+
    | Outgoing Args       | <-- Argument for `helper`
    +---------------------+
    Low Memory

Key Points for RV32:

1.	**Word Size**: 4 bytes.
2.	**Stack Alignment**: 4 bytes for RV32I ABIs.
3.	**Callee-Saved Registers**: ABI defines which registers (s0 to s11) must be preserved across function calls.
4.	**Tail Calls**: Ensure outgoing arguments fit within allocated stack space.

This layout ensures correctness and adherence to the RV32 ABI while efficiently utilizing stack memory.

nihalpasham/crane lift_gist.md

Select an option

No results found