(Training material on PyTorch CPU performance optimization)
- Part II: Parallelization Techniques
- Part III: Vectorization Techniques
- Part IV: BFloat16 Kernel Optimization
Chinese version for this chapter, link.
| import torch | |
| import torch.nn.functional as F | |
| def to_float8(x, dtype=torch.float8_e4m3fn): | |
| finfo = torch.finfo(dtype) | |
| # Calculate the scale as dtype max divided by absmax | |
| scale = finfo.max / x.abs().max().clamp(min=1e-12) | |
| # scale and clamp the tensor to bring it to | |
| # the representable range of float8 data type | |
| # (as default cast is unsaturated) |
(Training material on PyTorch CPU performance optimization)
Chinese version for this chapter, link.
| # This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting. | |
| # https://developer.nvidia.com/nsight-systems | |
| # https://docs.nvidia.com/nsight-systems/profiling/index.html | |
| # My preferred nsys (command line executable used to create profiles) commands | |
| # | |
| # In your script, write | |
| # torch.cuda.nvtx.range_push("region name") | |
| # ... |
| #!/bin/bash | |
| # Install docker | |
| apt-get update | |
| apt-get install -y cloud-utils apt-transport-https ca-certificates curl software-properties-common | |
| curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - | |
| add-apt-repository \ | |
| "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ | |
| $(lsb_release -cs) \ | |
| stable" | |
| apt-get update |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| int addi(int a, int b) { | |
| return a + b; | |
| } | |
| char *adds(char *a, char *b) { | |
| char *res = malloc(strlen(a) + strlen(b) + 1); |