Skip to content

Instantly share code, notes, and snippets.

@IgorBaratta
Last active January 24, 2024 16:34
Show Gist options
  • Select an option

  • Save IgorBaratta/da8b9ce3b2b11ecfad3d4333bcbb023a to your computer and use it in GitHub Desktop.

Select an option

Save IgorBaratta/da8b9ce3b2b11ecfad3d4333bcbb023a to your computer and use it in GitHub Desktop.
CUDA python demonstration
from mpi4py import MPI
import basix
from dolfinx import fem
import dolfinx
import ufl
import numpy as np
from numba import cuda
def volume(mesh):
# Function to compute the volume of each cell in the mesh
dg0 = basix.ufl.element("DG", cell_type, 0)
W = fem.functionspace(mesh, dg0)
w = fem.Function(W)
v = ufl.TestFunction(W)
L = ufl.inner(1, v) * ufl.dx
L = fem.form(L)
fem.assemble_vector(w.x.array, L)
return w.x.array
# Create a unit cube mesh
comm = MPI.COMM_WORLD
mesh = dolfinx.mesh.create_unit_cube(comm, 10, 10, 10, dolfinx.mesh.CellType.hexahedron)
cell_type = mesh.basix_cell()
degree = 1
# Define finite element and function space
e = basix.ufl.element("Lagrange", cell_type, degree)
V = fem.functionspace(mesh, e)
ndofs = V.dofmap.list.shape[1]
# Compute quadrature points and weights
quad_degree = e.degree * 2 + 1
pts, wts = basix.make_quadrature(cell_type, quad_degree)
# Compute reference mass matrix
phi = e.tabulate(0, pts)[0, :, :]
M = np.einsum("qi,q,qj->ij", phi, wts, phi)
# Initialize functions and interpolate a function for the source term
u = fem.Function(V)
f = fem.Function(V)
f.interpolate(lambda x: x[0])
@cuda.jit
def assemble_vector(u_d, f_d, dofmap_d, M_d, vol_d):
# CUDA kernel to assemble the vector
tid = cuda.threadIdx.x
bid = cuda.blockIdx.x
sA = cuda.shared.array(shape=(ndofs,), dtype=np.float64)
dof = dofmap_d[bid, tid]
sA[tid] = f_d[dof]
cuda.syncthreads()
# Each thread computes the dot product of row i of M_d with sA
val = 0.0
for i in range(M_d.shape[1]):
val += sA[i] * M_d[tid, i] * vol_d[bid]
# u_d[dof] += val
cuda.atomic.add(u_d, dof, val)
# CUDA variables
dofmap_d = cuda.to_device(V.dofmap.list)
u_d = cuda.to_device(u.x.array)
f_d = cuda.to_device(f.x.array)
M_d = cuda.to_device(M)
vol_d = cuda.to_device(volume(mesh))
# Launch CUDA kernel
num_blocks = V.dofmap.list.shape[0]
threads_per_block = V.dofmap.list.shape[1]
assemble_vector[num_blocks, threads_per_block, 0](u_d, f_d, dofmap_d, M_d, vol_d)
cuda.synchronize()
# Reference solution with dolfinx
v = ufl.TestFunction(V)
L = ufl.inner(f, v) * ufl.dx
L = fem.form(L)
u_ref = fem.Function(V)
fem.assemble_vector(u_ref.x.array, L)
# Compare with reference solution
u_d.copy_to_host(u.x.array)
assert np.allclose(u_ref.x.array, u.x.array)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment