Leo Yu fishmingyu

Simple Segment reduce in OpenMP

Compile

g++ -fopenmp segreduce.cpp -o segreduce

Test Results

- Total time for 10 repetitions: 27 microseconds
- Average time per function call (parallel): 2.7 microseconds
- Total time for 10 repetitions: 78 microseconds

	import time
	import triton
	import triton.language as tl
	import torch


	@triton.jit
	def spmm_atomic(edge_index, B, C, num_edges, feature_size: tl.constexpr, XBLOCK: tl.constexpr):
	group_id = tl.program_id(0)
	xoffset = group_id * XBLOCK

	-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
	                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls
	-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
	graph_0_cpp_fused_add_exp_index_select_mul_scatter_a...        29.02%      11.966ms        29.02%      11.966ms      11.966ms             1
	graph_0_cpp_fused_add_clone_exp_index_select_mul_rel...        28.60%      11.794ms        28.60%      11.794ms      11.794ms             1
	graph_0_cpp_fused_add_clone_exp_index_select_mul_new...        27.49%      11.335ms        27.49%      11.335ms      11.335ms             1
	                                         aten::scatter_         5.92%       2.442ms         5.92%       2.442ms     814.000us             3

	import torch

	import torch_geometric
	from torch_geometric.profile import benchmark
	from torch_geometric.testing import (
	disableExtensions,
	onlyFullTest,
	onlyLinux,
	withCUDA,
	withPackage,