Skip to content

Instantly share code, notes, and snippets.

@stillwwater
Last active November 11, 2024 19:28
Show Gist options
  • Select an option

  • Save stillwwater/cdcebf7b7e42008052c71bf4ec4ff670 to your computer and use it in GitHub Desktop.

Select an option

Save stillwwater/cdcebf7b7e42008052c71bf4ec4ff670 to your computer and use it in GitHub Desktop.
Simple rasterizer
bits 64
default rel

; Framebuffer geometry. These must agree with the C side, which declares
; R_WIDTH = 256 float pixels per row (R_PITCH = 1024 bytes). The previous
; value of (160 * 4) made every scanline step 640 bytes, which shears the
; image when it is read back as 256 pixels per row.
%define R_WIDTH 256
%define R_PITCH (R_WIDTH * 4)           ; bytes per scanline (4-byte pixels)
%define R_PITCH_SHIFT 10                ; log2(R_PITCH); not referenced in the code visible here

section .data
float_1 dd 1.0                          ; not referenced in the code visible here
align 32
; 1.0f in all eight lanes; loaded with an aligned 32-byte load.
float_1x8 dd 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

section .bss
align 32
; Scratch rows written by the `edge` macro: the edge-function value for
; eight horizontally adjacent pixels, one dword per pixel. Each row is
; 32 bytes, so all three stay 32-byte aligned for vmovaps.
w0_row resd 8
w1_row resd 8
w2_row resd 8

extern r_framebuffer                    ; float pixels, defined by the C harness

section .text
extern puts                             ; not referenced in the code visible here
extern r_vertexbuffer                   ; int32 vertex data, defined by the C harness
extern r_lanes                          ; not referenced in the code visible here
;-----------------------------------------------------------------------
; edge dst_row, offA, offB, xstep
;
; Evaluates the edge function of the directed edge A->B at the point
; V = (ecx, esi) and at the seven pixels to its right:
;
;   E(V)       = (B.x - A.x) * (V.y - A.y) - (V.x - A.x) * (B.y - A.y)
;   dst_row[i] = E(V) + i * step,   i = 0..7
;
;   %1 = label of an 8-dword destination row
;   %2 = byte offset of vertex A inside r_vertexbuffer (x at +0, y at +4)
;   %3 = byte offset of vertex B inside r_vertexbuffer
;   %4 = xmm register whose low dword is the per-pixel step
;
; In:    ecx = V.x, esi = V.y (both left untouched)
; Clobb: eax, r10-r15, flags
;
; The row is built with scalar stores; callers in this file reload it
; with one 32-byte vector load right afterwards.
;-----------------------------------------------------------------------
%macro edge 4
        ; r12 = (B.x - A.x) * (V.y - A.y)
        mov     r12d, [r_vertexbuffer + %3+0]   ; B.x
        mov     r10d, [r_vertexbuffer + %2+0]   ; A.x
        sub     r12d, r10d
        mov     r11d, [r_vertexbuffer + %2+4]   ; A.y
        mov     r15d, esi                       ; V.y
        sub     r15d, r11d
        imul    r12d, r15d

        ; r14 = (V.x - A.x) * (B.y - A.y)
        mov     r13d, [r_vertexbuffer + %3+4]   ; B.y
        sub     r13d, r11d
        mov     r14d, ecx                       ; V.x
        sub     r14d, r10d
        imul    r14d, r13d

        sub     r12d, r14d                      ; r12 = E(V)
        vmovd   eax, %4                         ; eax = step between neighbouring pixels

        ; Lane 0 holds E(V); every following lane adds one more step.
        mov     [%1], r12d
%assign EDGE_LANE_OFS 4
%rep 7
        add     r12d, eax
        mov     [%1 + EDGE_LANE_OFS], r12d
%assign EDGE_LANE_OFS EDGE_LANE_OFS + 4
%endrep
%endmacro
;-----------------------------------------------------------------------
; tri_sub dst, offM, offS
;
; dst = [r_vertexbuffer + offM] - [r_vertexbuffer + offS], computed as a
; 32-bit integer difference and replicated into all eight lanes.
;
;   %1 = destination ymm register (must not be ymm0 or ymm1)
;   %2 = byte offset of the minuend dword in r_vertexbuffer
;   %3 = byte offset of the subtrahend dword in r_vertexbuffer
;
; Clobb: ymm0 (broadcast minuend), ymm1 (broadcast subtrahend)
;-----------------------------------------------------------------------
%macro tri_sub 3
        vpbroadcastd ymm1, [r_vertexbuffer + %3]
        vpbroadcastd ymm0, [r_vertexbuffer + %2]
        vpsubd  %1, ymm0, ymm1
%endmacro
; mul8 reg
; Multiplies every 32-bit lane of vector register %1 by 8, in place
; (wrapping modulo 2^32). No other register is touched.
%macro mul8 1
        vpslld  %1, %1, 3               ; lane <<= 3  ==  lane * 8
%endmacro
; Number of times the same triangle is rasterized per call.
; NOTE(review): presumably a benchmarking repeat count -- confirm; a
; single pass produces the same framebuffer contents.
%define R_TRI_REPEAT 1024

global r_fill_tri
;-----------------------------------------------------------------------
; int r_fill_tri(void)
;
; Rasterizes the triangle stored in the first three vertices of
; r_vertexbuffer (int32 x at +0 and y at +4, 12 bytes per vertex) into
; r_framebuffer, eight pixels per step. Every visited pixel is written:
; 1.0f where w0|w1|w2 is positive, 0.0f otherwise.
;
; ABI:   written against SysV AMD64 -- TODO confirm the target platform
; Out:   rax = max Y of the triangle's bounding box
; Clobb: rcx, rdx, rsi, rdi, r9-r11, ymm0-ymm10, ymm12-ymm15, flags
;        (rbx, rbp and r12-r15 are used but saved and restored)
; Notes: - coordinates are assumed to be non-negative
;        - the x loop stores whole 8-pixel groups, so up to 7 pixels to
;          the right of max X are written on every scanline
;        - scanlines min Y..max Y inclusive are written
;-----------------------------------------------------------------------
r_fill_tri:
        ; rbx/rbp are used below and the `edge` macro scratches r12-r15;
        ; all six are callee-saved, so preserve them for the caller.
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15

        mov     ebp, R_TRI_REPEAT               ; ebp = passes left
.loop_tri:
        ; ---- bounding box of the triangle ----------------------------
        ; Each load fetches x, y, z plus one following dword; only the
        ; x and y lanes are consumed.
        vmovups xmm0, [r_vertexbuffer + 0]      ; vertex A
        vmovups xmm1, [r_vertexbuffer + 12]     ; vertex B
        vmovups xmm2, [r_vertexbuffer + 24]     ; vertex C
        vpminsd xmm3, xmm0, xmm1                ; AABB X/Y min
        vpminsd xmm3, xmm3, xmm2
        vpmaxsd xmm4, xmm0, xmm1                ; AABB X/Y max
        vpmaxsd xmm4, xmm4, xmm2

        vmovq   rcx, xmm3                       ; rcx = min Y : min X
        mov     rsi, rcx
        shr     rsi, 32                         ; rsi = min Y
        vmovq   rdx, xmm4                       ; edx = max X
        mov     rdi, rdx
        shr     rdi, 32                         ; rdi = max Y
        mov     ecx, ecx                        ; rcx = min X (upper half cleared)

        ; Remember min X now: xmm3 is reused for the w1 row further down,
        ; so it can no longer be read back from there.
        mov     ebx, ecx                        ; ebx = x at the start of every row

        ; r9 = address of scanline min Y. The column is supplied by the
        ; indexed store in .loop_x, so only the row offset is added here.
        lea     r9, [r_framebuffer]
        imul    rax, rsi, R_PITCH
        add     r9, rax

        ; ---- per-pixel (A*) and per-row (B*) edge increments ---------
        tri_sub ymm5, 4, 16                     ; A01 = A.y - B.y
        tri_sub ymm6, 12, 0                     ; B01 = B.x - A.x
        tri_sub ymm7, 16, 28                    ; A12 = B.y - C.y
        tri_sub ymm8, 24, 12                    ; B12 = C.x - B.x
        tri_sub ymm9, 28, 4                     ; A20 = C.y - A.y
        tri_sub ymm10, 0, 24                    ; B20 = A.x - C.x

        ; ---- edge values for the first 8 pixels of the first row -----
        ; `edge` reads V = (ecx, esi) = (min X, min Y).
        edge    w0_row, 12, 24, xmm7            ; edge(B, C, V)
        vmovaps ymm2, [w0_row]
        edge    w1_row, 24, 0, xmm9             ; edge(C, A, V)
        vmovaps ymm3, [w1_row]
        edge    w2_row, 0, 12, xmm5             ; edge(A, B, V)
        vmovaps ymm4, [w2_row]

        ; The x loop advances 8 pixels at a time, so scale the x steps.
        mul8    ymm7
        mul8    ymm9
        mul8    ymm5

        vmovaps ymm15, [float_1x8]              ; value written to covered pixels
        vpxor   ymm1, ymm1, ymm1                ; zero, for the coverage compare

.loop_y:
        mov     ecx, ebx                        ; rcx = x, restart at min X
        vmovaps ymm12, ymm2                     ; w0 at row start
        vmovaps ymm13, ymm3                     ; w1 at row start
        vmovaps ymm14, ymm4                     ; w2 at row start
.loop_x:
        ; A lane is covered when w0|w1|w2 > 0, i.e. no weight has its
        ; sign bit set and they are not all zero.
        vpor    ymm0, ymm12, ymm13
        vpor    ymm0, ymm0, ymm14
        vpcmpgtd ymm0, ymm0, ymm1               ; all-ones where covered
        vpand   ymm0, ymm0, ymm15               ; 1.0f where covered, else 0.0f
        vmovups [r9 + 4 * rcx], ymm0
.continue_x:
        vpaddd  ymm12, ymm12, ymm7              ; step 8 columns
        vpaddd  ymm13, ymm13, ymm9
        vpaddd  ymm14, ymm14, ymm5
        add     ecx, 8                          ; end .loop_x
        cmp     ecx, edx
        jle     .loop_x

        vpaddd  ymm2, ymm2, ymm8                ; step one row
        vpaddd  ymm3, ymm3, ymm10
        vpaddd  ymm4, ymm4, ymm6
        add     r9, R_PITCH                     ; next scanline
        inc     esi                             ; end .loop_y
        cmp     esi, edi
        jle     .loop_y
.break_y:
        dec     ebp                             ; end .loop_tri
        jnz     .loop_tri

        vzeroupper                              ; leave AVX state clean for the C caller
        mov     rax, rdi                        ; return max Y

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        ret
#include "rasterizer.h"
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
/* Dimensions of the image that image_write_tga() writes out. */
enum {
R_WIDTH = 256,
R_HEIGHT = 128,
/* Bytes per row of float pixels. Not referenced by the C code visible here. */
R_PITCH = R_WIDTH * sizeof(float),
};
/*
 * Pixel storage shared with the assembly rasterizer, one float per pixel.
 * NOTE(review): the array is 100x larger than one R_WIDTH x R_HEIGHT
 * image; presumably slack for rasterizer writes that run past the image
 * -- confirm before shrinking it.
 */
float r_framebuffer[R_WIDTH * R_HEIGHT * 100];
/* Vertex input shared with the rasterizer; main() fills three (x, y, z) int triples. */
int r_vertexbuffer[256];
/* Rasterizer entry point; exported by the assembly file as a global symbol. */
int r_fill_tri(void);
/*
 * Writes the first R_WIDTH * R_HEIGHT floats of r_framebuffer (R_WIDTH per
 * row) to "framebuffer.tga" as an uncompressed 32-bit true-colour TGA.
 *
 * Each float is clamped to [0, 1] and scaled to 0..255; the result is
 * stored as an opaque grey pixel (B = G = R = value, A = 255). Pixels are
 * emitted byte by byte, so the file does not depend on host endianness.
 *
 * On any I/O failure a diagnostic is printed to stderr and the process
 * exits with EXIT_FAILURE; this replaces an assert() that disappeared
 * under NDEBUG and let a NULL FILE* through.
 *
 * NOTE(review): the image descriptor (last header byte, 8) declares 8
 * alpha bits and leaves the origin at the bottom-left, so row 0 of the
 * framebuffer is the bottom row of the image -- confirm that orientation
 * is intended.
 */
void
image_write_tga(void)
{
    static const char path[] = "framebuffer.tga";
    FILE *file = fopen(path, "wb");
    if (!file) {
        perror(path);
        exit(EXIT_FAILURE);
    }

    const unsigned char header[18] = {
        0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        R_WIDTH & 0xFF, (R_WIDTH >> 8) & 0xFF,
        R_HEIGHT & 0xFF, (R_HEIGHT >> 8) & 0xFF,
        32, 8,
    };
    int ok = fwrite(header, 1, sizeof(header), file) == sizeof(header);

    /* One scanline of BGRA bytes, so there is one fwrite per row. */
    unsigned char row[R_WIDTH * 4];
    for (unsigned y = 0; ok && y < R_HEIGHT; ++y) {
        const float *src = &r_framebuffer[y * R_WIDTH];
        for (unsigned x = 0; x < R_WIDTH; ++x) {
            float v = src[x];
            unsigned char level;
            if (!(v > 0.0f))            /* also catches NaN */
                level = 0;
            else if (v >= 1.0f)
                level = 255;
            else
                level = (unsigned char)(v * 255.0f);
            row[4 * x + 0] = level;     /* blue  */
            row[4 * x + 1] = level;     /* green */
            row[4 * x + 2] = level;     /* red   */
            row[4 * x + 3] = 255;       /* alpha: opaque */
        }
        ok = fwrite(row, 1, sizeof(row), file) == sizeof(row);
    }

    if (fclose(file) != 0)
        ok = 0;
    if (!ok) {
        fprintf(stderr, "%s: write failed\n", path);
        exit(EXIT_FAILURE);
    }
}
/*
 * Test driver: loads one triangle into r_vertexbuffer, rasterizes it and
 * dumps the framebuffer to disk.
 */
int
main()
{
    /* Three vertices, each an (x, y, z) triple, packed back to back. */
    static const int triangle[3][3] = {
        {   0,   0, 0 },
        { 256,   0, 0 },
        { 128, 128, 0 },
    };

    for (int v = 0; v < 3; ++v) {
        for (int c = 0; c < 3; ++c)
            r_vertexbuffer[v * 3 + c] = triangle[v][c];
    }

    r_fill_tri();
    image_write_tga();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment