IgorBaratta · January 24, 2024 16:34
diff --git a/assmeble_vector.py b/assmeble_vector.py
 from mpi4py import MPI
 import basix
 from dolfinx import fem
 import dolfinx
 import ufl
 import numpy as np
 from numba import cuda




 def volume(mesh):
    # Function to compute the volume of each cell in the mesh
    dg0 = basix.ufl.element("DG", cell_type, 0)
    W = fem.functionspace(mesh, dg0)
    w = fem.Function(W)
    v = ufl.TestFunction(W)
    L = ufl.inner(1, v) * ufl.dx
    L = fem.form(L)
    fem.assemble_vector(w.x.array, L)
    return w.x.array


 # Create a unit cube mesh
 comm = MPI.COMM_WORLD

 mesh = dolfinx.mesh.create_unit_cube(comm, 10, 10, 10, dolfinx.mesh.CellType.hexahedron)
 cell_type = mesh.basix_cell()
 degree = 1

 # Define finite element and function space
 e = basix.ufl.element("Lagrange", cell_type, degree)
 V = fem.functionspace(mesh, e)
 ndofs = V.dofmap.list.shape[1]

 # Compute quadrature points and weights
 quad_degree = e.degree * 2 + 1
 pts, wts = basix.make_quadrature(cell_type, quad_degree)

 # Compute reference mass matrix
 phi = e.tabulate(0, pts)[0, :, :]
 M = np.einsum("qi,q,qj->ij", phi, wts, phi)

 # Initialize functions and interpolate a function for the source term
 u = fem.Function(V)
 f = fem.Function(V)
 f.interpolate(lambda x: x[0])

 @cuda.jit
 def assemble_vector(u_d, f_d, dofmap_d, M_d, vol_d):
    # CUDA kernel to assemble the vector
    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    sA = cuda.shared.array(shape=(ndofs,), dtype=np.float64)
    dof = dofmap_d[bid, tid]
    sA[tid] = f_d[dof]
    cuda.syncthreads()

    # Each thread computes the dot product of row i of M_d with sA
    val = 0.0
    for i in range(M_d.shape[1]):
        val += sA[i] * M_d[tid, i] * vol_d[bid]
    
    # u_d[dof] += val
    cuda.atomic.add(u_d, dof, val)


 # CUDA variables
 dofmap_d = cuda.to_device(V.dofmap.list)
 u_d = cuda.to_device(u.x.array)
 f_d = cuda.to_device(f.x.array)
 M_d = cuda.to_device(M)
 vol_d = cuda.to_device(volume(mesh))

 # Launch CUDA kernel
 num_blocks = V.dofmap.list.shape[0]
 threads_per_block = V.dofmap.list.shape[1]
 assemble_vector[num_blocks, threads_per_block, 0](u_d, f_d, dofmap_d, M_d, vol_d)
 cuda.synchronize()


 # Reference solution with dolfinx
 v = ufl.TestFunction(V)
 L = ufl.inner(f, v) * ufl.dx
 L = fem.form(L)

 u_ref = fem.Function(V)
 fem.assemble_vector(u_ref.x.array, L)

 # Compare with reference solution
 u_d.copy_to_host(u.x.array)
 assert np.allclose(u_ref.x.array, u.x.array)
	from mpi4py import MPI
	import basix
	from dolfinx import fem
	import dolfinx
	import ufl
	import numpy as np
	from numba import cuda




	def volume(mesh):
	# Function to compute the volume of each cell in the mesh
	dg0 = basix.ufl.element("DG", cell_type, 0)
	W = fem.functionspace(mesh, dg0)
	w = fem.Function(W)
	v = ufl.TestFunction(W)
	L = ufl.inner(1, v) * ufl.dx
	L = fem.form(L)
	fem.assemble_vector(w.x.array, L)
	return w.x.array


	# Create a unit cube mesh
	comm = MPI.COMM_WORLD

	mesh = dolfinx.mesh.create_unit_cube(comm, 10, 10, 10, dolfinx.mesh.CellType.hexahedron)
	cell_type = mesh.basix_cell()
	degree = 1

	# Define finite element and function space
	e = basix.ufl.element("Lagrange", cell_type, degree)
	V = fem.functionspace(mesh, e)
	ndofs = V.dofmap.list.shape[1]

	# Compute quadrature points and weights
	quad_degree = e.degree * 2 + 1
	pts, wts = basix.make_quadrature(cell_type, quad_degree)

	# Compute reference mass matrix
	phi = e.tabulate(0, pts)[0, :, :]
	M = np.einsum("qi,q,qj->ij", phi, wts, phi)

	# Initialize functions and interpolate a function for the source term
	u = fem.Function(V)
	f = fem.Function(V)
	f.interpolate(lambda x: x[0])

	@cuda.jit
	def assemble_vector(u_d, f_d, dofmap_d, M_d, vol_d):
	# CUDA kernel to assemble the vector
	tid = cuda.threadIdx.x
	bid = cuda.blockIdx.x
	sA = cuda.shared.array(shape=(ndofs,), dtype=np.float64)
	dof = dofmap_d[bid, tid]
	sA[tid] = f_d[dof]
	cuda.syncthreads()

	# Each thread computes the dot product of row i of M_d with sA
	val = 0.0
	for i in range(M_d.shape[1]):
	val += sA[i] * M_d[tid, i] * vol_d[bid]

	# u_d[dof] += val
	cuda.atomic.add(u_d, dof, val)


	# CUDA variables
	dofmap_d = cuda.to_device(V.dofmap.list)
	u_d = cuda.to_device(u.x.array)
	f_d = cuda.to_device(f.x.array)
	M_d = cuda.to_device(M)
	vol_d = cuda.to_device(volume(mesh))

	# Launch CUDA kernel
	num_blocks = V.dofmap.list.shape[0]
	threads_per_block = V.dofmap.list.shape[1]
	assemble_vector[num_blocks, threads_per_block, 0](u_d, f_d, dofmap_d, M_d, vol_d)
	cuda.synchronize()


	# Reference solution with dolfinx
	v = ufl.TestFunction(V)
	L = ufl.inner(f, v) * ufl.dx
	L = fem.form(L)

	u_ref = fem.Function(V)
	fem.assemble_vector(u_ref.x.array, L)

	# Compare with reference solution
	u_d.copy_to_host(u.x.array)
	assert np.allclose(u_ref.x.array, u.x.array)
No results found