#idempotent
- Functions are compiled individually
- The instructions in the function body use and produce values in SSA form i.e.
Single Static Assignment- SSA construction pushed to the frontend - done with the
cranelift_frontendcrate - The
cranelift_frontend cratecontains utilities for translating from programs containing multiple assignments to the same variables into SSA form for Cranelift [IR].
- SSA construction pushed to the frontend - done with the
- Cranelift’s IR is typed i.e. All SSA values have a type which determines the size and shape of the value.
- Integer types:
- Floating point types:
- SIMD vector types:
- Memory instructions:
- has fully general
loadandstoreinstructions for accessing memory.
- has fully general
- Uses traditional CFG (control-flow graphs) using basic blocks.
rustc’s cg_clif compiles the following:
../rustc_codegen_cranelift/dist/bin/rustc-clif --emit=llvm-ir src/main.rs#[no_mangle]
fn test(a: u8, b: u8) -> u8 {
if a > b {
let c = a - b;
c
} else {
let c = a + b;
c
}
}to this
; compiler options
set opt_level=none
set tls_model=macho
set libcall_call_conv=isa_default
set probestack_size_log2=12
set probestack_strategy=inline
set bb_padding_log2_minus_one=0
set regalloc_checker=0
set regalloc_verbose_logs=0
set enable_alias_analysis=1
set enable_verifier=0
set enable_pcc=0
set is_pic=1
set use_colocated_libcalls=0
set enable_float=1
set enable_nan_canonicalization=0
set enable_pinned_reg=0
set enable_atomics=1
set enable_safepoints=0
set enable_llvm_abi_extensions=1
set unwind_info=1
set preserve_frame_pointers=1
set machine_code_cfg_info=0
set enable_probestack=1
set enable_jump_tables=1
set enable_heap_access_spectre_mitigation=1
set enable_table_access_spectre_mitigation=1
set enable_incremental_compilation_cache_checks=0
target aarch64 has_lse=0 has_pauth=0 sign_return_address_all=0 sign_return_address=0 sign_return_address_with_bkey=0 use_bti=0
function u0:9(i8, i8) -> i8 apple_aarch64 {
; symbol test
; instance Instance { def: Item(DefId(0:4 ~ main[1152]::test)), args: [] }
; abi FnAbi { args: [ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }], ret: ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, c_variadic: false, fixed_count: 2, conv: Rust, can_unwind: false }
; kind loc.idx param pass mode ty
; ssa _0 u8 1b 1, 1 var=0
; ret _0 - Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg _1 = v0 Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg _2 = v1 Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; kind local ty size align (abi,pref)
; ssa _1 u8 1b 1, 1 var=1
; ssa _2 u8 1b 1, 1 var=2
; ssa _3 bool 1b 1, 1 var=3
; ssa _4 u8 1b 1, 1 var=4
; ssa _5 (u8, bool) 2b 1, 8 var=(5, 6)
; ssa _6 u8 1b 1, 1 var=7
; ssa _7 (u8, bool) 2b 1, 8 var=(8, 9)
gv0 = symbol colocated userextname0 ; alloc10
gv1 = symbol colocated userextname2 ; alloc11
sig0 = (i64) apple_aarch64
sig1 = (i64) apple_aarch64
fn0 = u0:11 sig0 ; "_ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E"
fn1 = u0:12 sig1 ; "_ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E"
block0(v0: i8, v1: i8):
v2 -> v0
v5 -> v0
v11 -> v0
v3 -> v1
v6 -> v1
v12 -> v1
nop
; write_cvalue: Var(_1, var1): u8 <- ByVal(v0): u8
; write_cvalue: Var(_2, var2): u8 <- ByVal(v1): u8
jump block1
block1:
nop
; _3 = Gt(copy _1, copy _2)
v4 = icmp.i8 ugt v2, v3
; write_cvalue: Var(_3, var3): bool <- ByVal(v4): bool
;
; switchInt(move _3)
brif v4, block2, block4
block2:
nop
; _5 = SubWithOverflow(copy _1, copy _2)
v7 = isub.i8 v5, v6
v10 -> v7
v8 = icmp ugt v7, v5
; write_cvalue: VarPair(_5, var5, var6): (u8, bool) <- ByValPair(v7, v8): (u8, bool)
;
; assert(!move (_5.1: bool), "attempt to compute `{} - {}`, which would overflow", copy _1, copy _2)
brif v8, block7, block3
block7 cold:
nop
v9 = global_value.i64 gv0
call fn0(v9)
; lib_call _ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E
trap unreachable
block3:
nop
; _4 = move (_5.0: u8)
; write_cvalue: Var(_4, var4): u8 <- ByVal(v10): u8
; _0 = copy _4
; write_cvalue: Var(_0, var0): u8 <- ByVal(v10): u8
;
; goto
return v10
block4:
nop
; _7 = AddWithOverflow(copy _1, copy _2)
v13 = iadd.i8 v11, v12
v16 -> v13
v14 = icmp ult v13, v11
; write_cvalue: VarPair(_7, var8, var9): (u8, bool) <- ByValPair(v13, v14): (u8, bool)
;
; assert(!move (_7.1: bool), "attempt to compute `{} + {}`, which would overflow", copy _1, copy _2)
brif v14, block8, block5
block8 cold:
nop
v15 = global_value.i64 gv1
call fn1(v15)
; lib_call _ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E
trap unreachable
block5:
nop
; _6 = move (_7.0: u8)
; write_cvalue: Var(_6, var7): u8 <- ByVal(v16): u8
; _0 = copy _6
; write_cvalue: Var(_0, var0): u8 <- ByVal(v16): u8
;
; goto
return v16
block6:
v18 = iconst.i8 0
v17 -> v18
nop
;
; return
return v17 ; v17 = 0
}Notes:
- cranelift-module uses
u0:Xto refer to functionXwithin the current module (can be an import) andu1:Yto refer to data objectY. - The
v2 -> v0are aliases. So in this case all references to v2 are implicitly replaced with v0 during compilation. They mostly exist to make the SSA lowering in cranelift-frontend easier.
struct FnAbi {
args: [
ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
},
ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
}
],
ret: ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
},
c_variadic: false,
fixed_count: 2,
conv: Rust,
can_unwind: false
}CLIF -> legalization -> mid-end egraph rewrites if enabled (rules in ISLE) -> lowering to backend-specific VCode (rules in ISLE) -> regalloc -> binary emission- translates Cranelift IR to machine level IR (with machine op codes)
- we are actually mapping the Cranelift IR to a particular target architecture
- For example: Cranelift IR’s
iaddinstruction takes two parameters and produces a parameter of the same type. - we can map this to an equivalent instruction in say
x86-64and call it an encoding. - Legalization - the goal is to turn IR instructions into instructions that are legal for a target machine or architecture.
- For example: Cranelift IR’s
- we are actually mapping the Cranelift IR to a particular target architecture
- Code-generators focus on optimizations such as -
- GVN (Global value numbering)
- LICM (Loop invariant code motion)
- DCE (Dead code elimination)
For each codegend function three files are emitted.
- The
.unopt.cliffile contains whatcg_clifemits. - The
.opt.cliffile contains the result of optimizing by Cranelift and - The
.vcodefile contains theirright before cranelift emits machine code bytes.
- The Cranelift project includes
clif-util, a Cranelift code generator utility that can be used for various tasks such as testing, execution, and interpretation.
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.37s
Running `/Users/nihal.pasham/devspace/compiler/wasmtime/target/debug/clif-util -h`
Cranelift code generator utility
Usage: clif-util <COMMAND>
Commands:
test Run Cranelift tests
run Execute clif code and verify with test expressions
interpret Interpret clif code
cat Outputs .clif file
print-cfg Prints out cfg in GraphViz Dot format
compile Compiles Cranelift IR into target language
pass Run specified pass(es) on an input file
bugpoint Reduce size of clif file causing panic during compilation
souper-harvest Harvest candidates for superoptimization from a Wasm or Clif file
help Print this message or the help of the given subcommand(s)
Options:
-h, --help Print helpExample:
cargo run -- test filetests/filetests/isa/aarch64/arithmetic.clifTerminologies:
CLIF- Cranelift IRVCode- Virtual Register Coderegalloc- Register Allocationbinemit- Binary Emission
Flow: Translating (high-level IR) CLIF to (low-level IR) VCode is called lowering.
- As VCode is target-dependent, the translation process is also target-specific.
- This is where we consider which machine instructions will eventually be used for a given
CLIF opcode. - There are many ways to achieve the same machine state results for given semantics, but some of these ways are faster than others and/or require fewer code bytes to achieve.
- The problem can be summed up like this:
- given
some CLIF, which VCodecan we create- to generate the fastest
- and/or smallest machine code
- that carries out the desired semantics?
- given
- This is called instruction selection, because we're selecting the VCode instructions from a set of different possible instructions.
-
- A given CLIF node may be lowered into
- 1 to N VCode instructions.
- A given VCode instruction may lead to the generation of
- 1 to M machine instructions.
- There are no rules governing the maximum number of entities mapped.
- For instance, the integer addition CLIF opcode
iaddon 64-bit inputs maps to a single VCode instruction on AArch64. - The VCode instruction then causes a single machine instruction to be generated.
- ISLE is a domain specific language (DSL)
- stands for instruction selection or lowering expressions.
- ISLE is a statically-typed term-rewriting language.
- You define rewriting rules that map input terms (`clif` instructions) into output terms (`MachInsts`).
- These rules get compiled down into Rust source that uses a tree of match expressions that is as good or better than what you would have written by hand.
- [The Context trait:](https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/isle-integration.md#gluing-isles-generated-code-into-cranelift) Each ISA-specific, ISLE-generated file is generic over a `Context` trait that has a trait method for each extern helper defined in ISLE.
- `ISLE build process:` The ISLE compiler is set up as a build dependency, and the build script uses it to compile ISLE source into generated Rust code.
- A normal build typically generates an `out` folder within the `target` directory, containing all ISLE-to-Rust source translations.
The ISLE README.md has a pretty good example.
Describes ISLE constructors, external constructors, extractors
A term can be declared as both a constructor and an extractor because:
- A constructor allows you to create the type. If it’s used in an expression (RHS) that creates a new value, it’s a constructor.
- An extractor allows you to pattern-match and de-structure the type. If it’s used in a pattern (LHS) that de-structures or matches an existing value, it’s an extractor. This versatility lets ISLE handle different scenarios more concisely and efficiently, using the same name in contexts where each role is clear.
- The register allocation problem involves mapping intermediate representation (IR) values, often referred to as registers (or Reg in VCode), to machine-specific "containers" such as physical registers or memory locations. This process is known as register allocation (regalloc).
The implementation uses an abstraction that includes the following types:
Operand:virtual registers (orVRegs) defined or used in an instruction, along with their constraints.Function:A trait implemented by the client that provides the allocator with information about function blocks, block instructions, instruction operands, block predecessors, successors, and additional details.MachinEnv:A struct that holds information about resources on the target machine, including physical registers for each register class, scratch registers, and registers designated as fixed stack slots.Allocation:An ‘Allocation’ represents the end result of regalloc for anOperand.Output:The output of the register allocator.- among other things, holds a
Vec<Allocation>’s - the main output that contains the final allocations for all operands across all instructions in the function. edits- new instructions to be inserted at specific program points (before or after instructions, mostly stackspills).
- among other things, holds a
- Virtual Registers: Inputs to register allocation are numerous and map to
virtualvalues, known as virtual registers. TheVin VCode stands for these virtual registers. VCode instructions reference values that are virtual registers before register allocation, hence the code is in virtualized register form. - Output of Register Allocation: The output is a set of instructions where virtual registers have been replaced by physical registers or stack slot references, constrained by the limited number of physical registers available on the machine. Additional metadata may also be produced to assist with this mapping.
- Purpose and Challenges: The main challenge in register allocation is efficiently utilizing the limited physical registers while minimizing the need to spill registers to memory, which can degrade performance. This involves deciding which variables should be kept in registers and which should be moved to memory, a process that typically includes techniques like graph coloring and live range splitting.
- Register allocation may create an arbitrary number of spills, reloads, and register moves[^Regalloc2 impl] around VCode instructions to ensure that their register allocation constraints are met.
- This is why the output of register allocation is a new list of instructions that includes not only the initial instructions filled with the actual registers but also additional spill, reload, and move (VCode) instructions added by regalloc.
- A data structure that represents an equivalence relationship over terms
- It’s composed of equivalence classes or
e-classeswhich are sets ofe-nodes - These e-nodes are basically operators in a given language
- Like
/or*operators - Or childless operators like
aand2
- Like
- an operator takes as its children not another operator but a whole family of equivalent operators (i.e. children are e-classes themselves)
- We can grow our e-graph by adding equivalence rewrites to it - like so
- Final rewrite doesn’t add any more e-nodes but it does decrease the number of e-classes
- So, we shrank the e-graph but it contains more equivalences
- if you kept adding rewrites, you may get to a point where you are not adding any more information to the e-graph (i.e. we are not adding new e-classes or e-nodes)
- This is called saturation or equality saturation
- the last step is to extract
- extraction is a procedure which allows us to pick out the best represented term (i.e. the best possible equivalent term) according to some cost function from a specific e-class (i.e. the initial term)
- There’s a lot of ways to extract
Idempotent means that performing the same operation multiple times has the same effect as performing it once.
Context:A persistent data structure used to hold the state of a compilation pipeline.Function:Represents the function currently being compiled.Entity References:References that are used to identify various entities within a function, such as blocks, instructions, and stack slots.- Note: they are not your typical rust references. Entity references are structs wrapping a
u32index into a table in theFunctionmain data structure.
- Note: they are not your typical rust references. Entity references are structs wrapping a
PrimaryMap:A map that tracks a vector of entities, assigning a unique reference to each. It's used to allocate new entities that can be indexed via unique entity references.SecondaryMap:A simple vector-based map used to associate additional information with entities. UnlikePrimaryMap, it doesn’t allocate new entity references and maps unknown entities to a default value.Layout:Defines the order and number of blocks and instructions in a function.DataFlowGraph:Represents all instructions and basic blocks within a function, along with their data flow dependencies.ControlFlowGraph:Maintains the mapping of blocks to their predecessors and successors, where both predecessors and successors are basic blocks.
Every Cranelift Opcode has a corresponding instruction format represented by two enums: InstructionFormat and InstructionData. These enums determine the number of operands and their types.
InstructionFormat:An enum containing all Cranelift instruction formats as variantsInstructionData:An enum with the same variants as InstructionFormat, but it also includes additional data for each variant, such as the opcode and its operands (immediates and values).
Examples are Riscv64IsleContext
- We get to implement a 
Context trait for every target (or backend) - see [[Cranelift/ISLE:]] - This trait implementation contains common methods for all targets. Their implementation is in 
prelude.rs
- Examples are
Aarch64BackendorRiscv64Backendetc. - The
isa::lookup()function is the main entry point which returns anisa::Builderappropriate for the requested ISA.- The IsaBuilder wraps a backend.
- This function returns an
IsaBuilder, allowing you to configure a backend with bothisa-specificandshared(i.e. architecturally common) settings. - Cranelift filetests use a custom parser to handle
.cliffiles. When parsing thetargetcommand line in a.cliffile, the parser looks up a suitable backend using theisa::lookup.
;; Defined in cranelift/codegen/src/isa/riscv64/lower.isle
;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(decl partial lower (Inst) InstOutput)
;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Base case: simply adding things in registers.
(rule -1 (lower (has_type (fits_in_32 (ty_int ty)) (iadd x y)))
(rv_addw x y))
;; Defined in target/debug/build/cranelift-codegen-aa52b046ac4ee4de/out/clif_lower.isle
;; Iadd is a Cranelift instruction (Opcode with operands) that needs to be lowered to the `rv_addw` expression.
;; The first step is using the `iadd` extractor to extract the `InstructionData` for an `Inst`, which maps to the external extractor `inst_data`.
;; `inst_data` produces the first match for `&InstructionData::Binary` and, subsequently, matches the rest of the constraints: `fits_in_32`, `ty_int`, and `has_type`.
(decl iadd (Value Value) Inst)
(extractor
(iadd x y)
(inst_data (InstructionData.Binary (Opcode.Iadd) (value_array_2 x y)))
)
;; Defined in cranelift/codegen/src/prelude_lower.isle
;; Extract the `InstructionData` for an `Inst`.
(decl inst_data (InstructionData) Inst)
(extern extractor infallible inst_data inst_data)
;;;; Value Arrays ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Defined in target/debug/build/cranelift-codegen-aa52b046ac4ee4de/out/clif_lower.isle
;; ISLE representation of `[Value; 2]`.
(type ValueArray2 extern (enum))
(decl value_array_2 (Value Value) ValueArray2)
(extern constructor value_array_2 pack_value_array_2)
(extern extractor infallible value_array_2 unpack_value_array_2)
;; Defined in cranelift/codegen/src/prelude_lower.isle
;; Internal extractors (e.g., has_type) are inlined, i.e., they are simply mapped to an external extractor.
;; Extract the type of the instruction's first result and pass along the
;; instruction as well.
(decl has_type (Type Inst) Inst)
(extractor (has_type ty inst)
(and (result_type ty)
inst))
;; Extract the type of the instruction's first result.
(decl result_type (Type) Inst)
(extractor (result_type ty)
(first_result (value_type ty)))
;; Extract the first result value of the given instruction.
(decl first_result (Value) Inst)
(extern extractor first_result first_result)
;; Extract the type of a `Value`.
(decl value_type (Type) Value)
(extern extractor infallible value_type value_type)// Corresponding Rust code for external extractors
//
// Note: `Inst` here is a Cranelift instruction entity reference, and `MInst` represents an
// actual machine instruction used to emit the final corresponding RISC-V instruction.
// These three are defined in cranelift/codegen/src/machinst/isle.rs.
// They look up the `DataFlowGraph` for the function and retrieve the relevant information.
#[inline]
fn value_type(&mut self, val: Value) -> Type {
self.lower_ctx.dfg().value_type(val)
}
#[inline]
fn first_result(&mut self, inst: Inst) -> Option<Value> {
self.lower_ctx.dfg().inst_results(inst).first().copied()
}
#[inline]
fn inst_data(&mut self, inst: Inst) -> InstructionData {
self.lower_ctx.dfg().insts[inst]
}
// These two are defined in cranelift/codegen/src/isle_prelude.rs.
#[inline]
fn fits_in_32(&mut self, ty: Type) -> Option<Type> {
if ty.bits() <= 32 && !ty.is_dynamic_vector() {
Some(ty)
} else {
None
}
}
#[inline]
fn ty_int(&mut self, ty: Type) -> Option<Type> {
ty.is_int().then(|| ty)
}When reading a lowering rule, all terms on the LHS (or pattern to be matched) are extractors. For example:
has_typeis an internal extractor. Internal extractors simply expand to (or map to) an external extractor (see above)fits_in_32is an external extractor that only matches types that can fit in 32 bits.ty_intis also an external extractor that checks if the type of the result of aniaddoperation is an integer.
and all terms on the RHS are constructors.
Extractors can also appear in rules defined for ISLE declarations. In this context, extractors may look different from their usage elsewhere. For example:
;; Here, we declare an ISLE function to load immediates.
(decl imm (Type u64) Reg)
;; The following rule defines how to handle certain immediate values.
;; Notice the `if-let` construct, which is followed by an external extractor `i64_shift_for_lui`.
;; This is how to interpret the rule:
;; - The function `i64_shift_for_lui` takes the result of `(i64_sextend_u64 ty c)` (an `i64`)
;; and produces a tuple `(u64, Imm12)` — representing the base and shift values.
;; - The `base` value is then passed to another external extractor, `imm20_from_u64`.
;; - Only if both extractions succeed will the rule's left-hand side (LHS) be replaced with
;; the right-hand side (RHS): `(rv_slli (rv_lui base) shift)`.
;; In essence, if the non-zero bits of the immediate fit within 20 bits,
;; the immediate can be encoded using `LUI` (Load Upper Immediate) and a subsequent shift.
(rule 1 (imm (ty_int ty) c)
(if-let (i64_shift_for_lui (imm20_from_u64 base) shift) (i64_sextend_u64 ty c))
(rv_slli (rv_lui base) shift))
Assuming all the above constraints are met, we replace the pattern (LHS) with the following expression: RHS
(rv_addw x y)
;; whose definition is as follows
;; Defined in cranelift/codegen/src/isa/riscv64/inst.isle
;; Helper for emitting the `addw` ("Add Word") instruction.
;; rd ← sext32(rs1) + sext32(rs2)
(decl rv_addw (XReg XReg) XReg)
(rule (rv_addw rs1 rs2)
(alu_rrr (AluOPRRR.Addw) rs1 rs2))
;; which, in turn, is translated into:
;; Defined in cranelift/codegen/src/isa/riscv64/inst.isle
;; Helper for emitting `MInst.AluRRR` instructions.
(decl alu_rrr (AluOPRRR Reg Reg) Reg)
(rule (alu_rrr op src1 src2)
(let ((dst WritableXReg (temp_writable_xreg))
(_ Unit (emit (MInst.AluRRR op dst src1 src2))))
dst))- For
xandy, we declare an external constructor to place a value into a register usingconstructor_put_in_xreg. - For
rv_addw: In Cranelift, terms likerv_addware abstractions representing architecture-specific instructions. This constructor invokes analu_rrroperation.alu_rrr: This emits the actual RISC-V equivalent instruction (as shown below).
// The generated file is located in the target directory within the out folder for each specified
// target.
//
// Generated as an internal constructor for term alu_rrr.
pub fn constructor_alu_rrr<C: Context>(ctx: &mut C, arg0: &AluOPRRR, arg1: Reg, arg2: Reg) -> Reg {
let v3 = constructor_temp_writable_xreg(ctx);
let v4 = C::writable_xreg_to_writable_reg(ctx, v3);
let v5 = MInst::AluRRR {
alu_op: arg0.clone(),
rd: v4,
rs1: arg1,
rs2: arg2,
};
let v6 = C::emit(ctx, &v5);
let v7 = constructor_writable_xreg_to_reg(ctx, v3);
return v7;
}
// Generated as an internal constructor for term rv_addw.
pub fn constructor_rv_addw<C: Context>(ctx: &mut C, arg0: XReg, arg1: XReg) -> XReg {
let v3 = C::xreg_to_reg(ctx, arg0);
let v4 = C::xreg_to_reg(ctx, arg1);
let v5 = constructor_alu_rrr(ctx, &AluOPRRR::Addw, v3, v4);
let v6 = C::xreg_new(ctx, v5);
return v6;
}
// Generated as an internal constructor for term lower.
// PS: The `Lower` constructor contains `10k` lines of code.
pub fn constructor_lower<C: Context>(ctx: &mut C, arg0: Inst) -> Option<InstOutput> {
let v4 = &C::inst_data(ctx, arg0);
match v4 {
...
&InstructionData::Binary { opcode: ref v46, args: ref v47 } => {
match v46 {
...
&Opcode::Iadd => {
let v1 = C::first_result(ctx, arg0);
if let Some(v2) = v1 {
let v3 = C::value_type(ctx, v2);
...
let v42 = C::fits_in_32(ctx, v3);
if let Some(v43) = v42 {
let v44 = C::ty_int(ctx, v43);
if let Some(v45) = v44 {
let v48 = C::unpack_value_array_2(ctx, v47);
let v51 = constructor_put_in_xreg(ctx, v48.0);
let v52 = constructor_put_in_xreg(ctx, v48.1);
let v53 = constructor_rv_addw(ctx, v51, v52);
let v54 = constructor_output_xreg(ctx, v53);
let v55 = Some(v54);
return v55;
}
}
}
}
}
}
}
}
Cranelift Lowering & Binary Emission
ir::Function (SSA IR, machine-independent opcodes)
|
| [lower]
|
VCode<arch_backend::Inst> (machine instructions:
| - mostly virtual registers.
| - cond branches in two-target form.
| - branch targets are block indices.
| - in-memory constants held by insns,
| with unknown offsets.
| - critical edges (actually all edges)
| are split.)
|
| [regalloc --> `regalloc2::Output`; VCode is unchanged]
|
| [binary emission via MachBuffer]
|
Vec<u8> (machine code:
| - two-dest branches resolved via
| streaming branch resolution/simplification.
| - regalloc `Allocation` results used directly
| by instruction emission code.
| - prologue and epilogue(s) built and emitted
| directly during emission.
| - SP-relative offsets resolved by tracking
| EmitState.)
Lower:Thetypethat handles the lowering process fromCLIF IRtoVCode<MInst>.- The
VCodetype serves as an abstraction—a container for target-specificMInstinstances. In Cranelift, a sequence of machine instructions organized into basic blocks is referred to as vcode, short forvirtual-register code. - Cranelift introduces two closely related types:
EntityListandListPoolEntityList:Instead of using a structure likeVec<T>, the memory for anEntityListis managed by aListPool<T>. An EntityList wraps a u32 index, which points to the starting position of the actual list in the ListPool’s data vector. If the index is 0, the list is considered empty.ListPool:Acts as the shared storage for multiple lists.- By delegating memory ownership and management to the
ListPool,EntityList<T>reduces its size to the minimum necessary for locating its data—a 4-byte index. This design is ideal for scenarios that demand compact, memory-efficient data structures, such as compiler implementations like Cranelift.
- The
A side note:
Global value proof-carrying-code facts:We can associateGlobalValue’s in the function with optionalfacts.Facts describe properties or constraints that theGlobalValuemust satisfy. These constraints are used for verification or optimization during code generation.Memory types for proof-carrying code:A MemoryType is an abstract representation of structured memory (like a struct) or unstructured memory (like a blob).memory_typesenables reasoning about memory layouts during compilation.
MInst:This type represents target-specific instruction formats and is implemented as an enum.- Note: This is a generated type. Target instruction formats are defined in ISLE, which generates the corresponding Rust type.
MInstdoes not directly represent the raw bytes of a target instruction—it is a Cranelift abstraction.
Vcode:A struct that serves as a container for machine instructions at various stages of construction.MachBuffer:- Raw code bytes (the actual machine instructions).
- Metadata like where variables live, where jumps or calls go, or where debug information maps to source code.
- Uses
SmallVecfor storage - It is generic over a type that implements
MachInstand MachInstEmit traits.
data:Stores the machine code bytes.relocs:Tracks external references (e.g., a call to another function or a jump to another module).traps:Stores information about trap points (e.g., for error handling).user_stack_maps:Metadata for garbage collection, describing where pointers are in the stack.label_offsets and label_aliases:Track where labels (like jump targets) are in the code.pending_constants:Keeps track of constants that need to be inserted into the code.fixup_records:Stores information about jumps or calls that can only be resolved after all code is emitted.
Imagine a binary where a function call foo() is made, but the address of foo() is resolved at runtime.
Source Code:
void call_function() {
foo(); // Call to an external function.
}Generated Machine Code:
- Initially, the function address for foo() is not known.
- The instruction might contain a placeholder for the address to be resolved later:
- Here,
auipc(add upper immediate to PC) loads the upper 20 bits of the offset between the target address and PC, andjalr(jump and link register) completes the jump using the lower bits.
auipc t0, %pcrel_hi(foo@plt)
jalr t1, %pcrel_lo(foo@plt)(t0)
Step-by-Step Calculation:
1. Current PC:
• Assume PC = 0x1000 (the address of the auipc instruction).
2. Function Address:
• Target function address = 0x3004.
3. Offset:
offset = function_address - current_pc
= 0x3004 - 0x1000
= 0x2004
4. Split Offset into High and Low Parts:
• For auipc, we only care about the high 20 bits of the offset.
• High 20 bits of offset:
high_bits = (offset + 0x800) >> 12
= (0x2004 + 0x800) >> 12
= (0x2804) >> 12
= 0x2
* The addition of 0x800 ensures proper rounding when truncating lower bits during the shift (to handle sign-extension and alignment).
5. AUIPC Encodes the High 20 Bits:
• The auipc instruction takes 0x2 as the immediate value:
auipc t0, 0x2
• Internally, this computes:
t0 = PC + (0x2 << 12)
= 0x1000 + 0x2000
= 0x3000
JALR Resolves the Low Bits:
1. Remaining Offset (Low 12 Bits):
• The auipc instruction only handled the high part of the offset. The low 12 bits are resolved by the immediate value in the jalr instruction.
• From the original offset 0x2004:
low_bits = offset & 0xFFF
= 0x2004 & 0xFFF
= 0x004
2. JALR Instruction:
• The jalr instruction uses t0 (from auipc) and adds the low_bits to compute the final target address:
jalr ra, t0, 4
• This computes:
PC = t0 + imm
= 0x3000 + 0x004
= 0x3004
Summary:
• AUIPC Immediate (0x2):
• Derived from the high 20 bits of the PC-relative offset (0x2004).
• Computed as: (offset + 0x800) >> 12.
• JALR Immediate (0x4):
• Derived from the low 12 bits of the PC-relative offset (0x2004).
• Computed as: offset & 0xFFF.
Relocation Table:
- A relocation entry is created for foo in the ELF binary:
Offset: 0x004
Type: R_RISCV_CALL_PLT
Symbol: foo
At Runtime:
- The linker resolves the address of foo() and patches the instructions so that they jump to the resolved address.
+------------------------+
| Instruction Memory |
| auipc t0, %pcrel_hi | -> Placeholder for foo@plt
| jalr t1, %pcrel_lo | -> Placeholder for foo@plt
+------------------------+
Relocation Table (contains a single entry)
+-------------------------------------+
| Offset | Type | Symbol |
| 0x004 | R_RISCV_CALL_PLT | foo |
+-------------------------------------+
At Runtime:
+------------------------+
| Instruction Patched to |
| auipc t0, hi(foo) | -> Resolved foo's address
| jalr t1, lo(foo)(t0) |
+------------------------+The RISC-V relocation model assumes the linker will:
- Identify the
auipcandjalrpair based on the relocation type (R_RISCV_CALL_PLT). - Use the symbol’s resolved address to compute:
- The high 20 bits for
auipc(patched first). - The low 12 bits for
jalr(patched next).
- The high 20 bits for
Thus, a second relocation entry isn’t necessary because the relocation type and context provide all the needed information.
- Compact Relocation Table:
- By storing only one relocation entry, the relocation table remains smaller and simpler, especially for large programs with many such call sequences.
- Reduced Complexity for Assemblers and Compilers:
- Assemblers don’t need to emit separate relocation entries for
auipcandjalr,simplifying code generation.
- Assemblers don’t need to emit separate relocation entries for
- Linker Handles the Association:
- The linker, which is already responsible for patching instructions, manages the pairing implicitly without needing redundant entries
The Global Offset Table (GOT) is a structure used in Position-Independent Code (PIC) to facilitate the relocation of global symbols at runtime. It holds the absolute addresses of global variables and functions, which may be relocated by the operating system’s dynamic linker.
In simpler terms, the GOT is like a lookup table that stores the runtime addresses of symbols so that code can access them indirectly.
- Create a Folder for the Backend
- Place the new backend directory under
/cranelift/codegen/src/isa, where each backend resides.
- Place the new backend directory under
- Define Backend and Implement Required Traits
- Ensure the backend implements these essential traits:
TargetIsa: Specifies the target architecture’s interface.LowerBackend: Manages instruction lowering for the architecture.
- Ensure the backend implements these essential traits:
- Add Core ISLE Files to the Backend Folder
inst.isle: Define ISA-specific instructions, encodings, registers, and settings for the target architecture using ISLE (Instruction Selection or Lowering Expressions).lower.isle: Specify instruction selection and CLIF (Cranelift Intermediate Representation) to MachInst (Machine Instructions) lowering rules.
- Implement an ABI for the Target Architecture
- Place the ABI implementation in the appropriate folder within the backend directory.
- Add ISA-Specific Settings
- Define any required ISA-specific settings in
cranelift-codegen-meta, located at:/cranelift/codegen/meta/src/isa/<arch>
- Define any required ISA-specific settings in
- Include Filetests for Basic Functionality
- Add initial filetests to verify the backend’s basic functionality in:
/cranelift/filetests/filetests/isa/<arch>
- Add initial filetests to verify the backend’s basic functionality in:
- [[Cranelift/ISLE:]] we define rewriting rules that map input terms (
clifinstructions) into output terms (MachInsts, a type that represents an actual machine instruction). - [[Cranelift/E-graphs:]] We can perform an E-graph pass over a function, identifying similar values with multiple representations and selecting the optimal representation for each used value. This approach, known as non-destructive term rewriting, enables Cranelift's E-graphs to select the best value among alternatives, which can then be passed to an ISLE constructor.
fn example(a: i32, b: i32) -> i32 {
let c = a + b; // Local variable stored in stack slot.
let d = helper(c); // Tail call (outgoing args considered).
d
} High Memory
+---------------------+
| Incoming Args (a, b)| <-- Stack grows downward
+---------------------+
| Return Address | <-- Stored by `jal` instruction
+---------------------+
| Saved Frame Pointer | <-- Saved `fp` register
+---------------------+
| Callee-Saved Regs | <-- Save `s0, s1, s2, s3`
+---------------------+
| Local Var (c) | <-- Space for local `c`
+---------------------+
| Outgoing Args | <-- Argument for `helper`
+---------------------+
Low Memory1. **Word Size**: 4 bytes.
2. **Stack Alignment**: 4 bytes for RV32I ABIs.
3. **Callee-Saved Registers**: ABI defines which registers (s0 to s11) must be preserved across function calls.
4. **Tail Calls**: Ensure outgoing arguments fit within allocated stack space.
This layout ensures correctness and adherence to the RV32 ABI while efficiently utilizing stack memory.


