g++ -fopenmp segreduce.cpp -o segreduce
Total time for 10 repetitions: 27 microseconds
Average time per function call:(parallel) 2.7 microseconds
Total time for 10 repetitions: 78 microseconds
| import time | |
| import triton | |
| import triton.language as tl | |
| import torch | |
| @triton.jit | |
| def spmm_atomic(edge_index, B, C, num_edges, feature_size: tl.constexpr, XBLOCK: tl.constexpr): | |
| group_id = tl.program_id(0) | |
| xoffset = group_id * XBLOCK |
| ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ | |
| Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls | |
| ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ | |
| graph_0_cpp_fused_add_exp_index_select_mul_scatter_a... 29.02% 11.966ms 29.02% 11.966ms 11.966ms 1 | |
| graph_0_cpp_fused_add_clone_exp_index_select_mul_rel... 28.60% 11.794ms 28.60% 11.794ms 11.794ms 1 | |
| graph_0_cpp_fused_add_clone_exp_index_select_mul_new... 27.49% 11.335ms 27.49% 11.335ms 11.335ms 1 | |
| aten::scatter_ 5.92% 2.442ms 5.92% 2.442ms 814.000us 3 | |
| import torch | |
| import torch_geometric | |
| from torch_geometric.profile import benchmark | |
| from torch_geometric.testing import ( | |
| disableExtensions, | |
| onlyFullTest, | |
| onlyLinux, | |
| withCUDA, | |
| withPackage, |