This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| import torch.optim as optim | |
| import torch.multiprocessing as mp | |
| import torch.distributed as dist | |
| from torch.distributed.device_mesh import init_device_mesh | |
| from torch.distributed.fsdp import fully_shard |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import torch | |
| import torch.nn as nn | |
| import torch.optim as optim | |
| import torch.multiprocessing as mp | |
| import torch.distributed as dist | |
| from torch.distributed.device_mesh import init_device_mesh | |
| from torch.distributed.fsdp import fully_shard | |
| from torch.distributed.tensor.parallel import ( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| baseline runtime(s) 0.5243263244628906 | |
| with reshape runtime (s) 0.0022399425506591797 | |
| @ cpu | |
| ========= | |
| baseline runtime (s) 0.25386476516723633 | |
| with reshape runtime (s) 0.0008966922760009766 | |
| @ cuda:0 | |
| """ | |
| import torch |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // https://marketplace.visualstudio.com/items?itemName=coolchyni.beyond-debug | |
| { | |
| "inputs": [ | |
| { | |
| "id": "hostname", | |
| "description": "xxx", | |
| "default": "localhost", | |
| "type": "promptString" | |
| }, | |
| { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # python single_gpu_ddp.py | |
| # https://discuss.pytorch.org/t/single-machine-single-gpu-distributed-best-practices/169243 | |
| import torch | |
| import torch.distributed as dist | |
| import torch.nn as nn | |
| import torch.multiprocessing as mp | |
| from torch.nn.parallel import DistributedDataParallel as DDP | |
| import os | |
| def setup(rank, world_size): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| IR module { | |
| tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { | |
| %c16_i32 = arith.constant 16 : i32 | |
| %c1024_i32 = arith.constant 1024 : i32 | |
| %c0_i32 = arith.constant 0 : i32 | |
| %cst = arith.constant dense<16> : tensor<16x16xi32> | |
| %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32> | |
| %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> | |
| %1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> | |
| %2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import triton | |
| import triton.language as tl | |
| import torch.nn.functional as F | |
| @triton.jit | |
| def matmul_kernel( | |
| a_ptr, b_ptr, c_ptr, | |
| stride_am, stride_ak, | |
| stride_bk, stride_bn, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import triton | |
| import triton.language as tl | |
| import torch.nn.functional as F | |
| import time | |
| @triton.jit | |
| def add_kernel(x_ptr, y_ptr, output_ptr, N, | |
| BLOCK_SIZE: tl.constexpr): | |
| pid = tl.program_id(0) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| # acknowledgement: https://gist.github.com/bwasti/7e4cb9bd1aaddeb09bd360b570a486b1 | |
| def cudagraph(f): | |
| _graphs = {} | |
| def f_(*args): | |
| key = hash(tuple(tuple(a.shape) for a in args)) | |
| if key in _graphs: | |
| wrapped, *_ = _graphs[key] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class Vector2D { | |
| private: | |
| vector<vector<int>>::iterator row, iBegin, iEnd; | |
| vector<int>::iterator col; | |
| public: | |
| Vector2D(vector<vector<int>>& vec2d) { | |
| iBegin = row = vec2d.begin(); | |
| iEnd = vec2d.end(); | |
| if(vec2d.size()) | |
| col = row->begin(); |
NewerOlder