Skip to content

Instantly share code, notes, and snippets.

@chychen
Last active October 17, 2022 08:39
Show Gist options
  • Select an option

  • Save chychen/373fdfad0352bd4708eef9c67292c44f to your computer and use it in GitHub Desktop.

Select an option

Save chychen/373fdfad0352bd4708eef9c67292c44f to your computer and use it in GitHub Desktop.
add gdown
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Numba CUDA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from numba import cuda\n",
"import numpy as np\n",
"import math\n",
"import gdown\n",
"gdown.download('https://drive.google.com/uc?id=1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD', 'example_data.npy')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = np.load('example_data.npy')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Original (CPU-based)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def ridge_detection(f, thres):\n",
" count = np.zeros(f.shape)\n",
" for i in range(len(f)):\n",
" for j in range(len(f[i])):\n",
" if (\n",
" i > 0\n",
" and j > 0\n",
" and i < (len(f) - 1)\n",
" and j < (len(f[i]) - 1)\n",
" and f[i, j] > thres\n",
" and ~np.isnan(f[i, j])\n",
" ):\n",
" step_i = i\n",
" step_j = j\n",
" for k in range(1000):\n",
" if (\n",
" step_i == 0\n",
" or step_j == 0\n",
" or step_i == (len(f) - 1)\n",
" or step_j == (len(f[i]) - 1)\n",
" ):\n",
" break\n",
" index = np.nanargmax(\n",
" f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
" )\n",
" vmax = np.nanmax(\n",
" f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
" )\n",
" if index == 4 or vmax == f[step_i, step_j] or np.isnan(vmax):\n",
" break\n",
" row = int(index / 3)\n",
" col = index % 3\n",
" count[step_i - 1 + row, step_j - 1 + col] += 1\n",
" step_i = step_i - 1 + row\n",
" step_j = step_j - 1 + col\n",
" return count"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6min 14s, sys: 15.9 s, total: 6min 30s\n",
"Wall time: 6min 6s\n"
]
}
],
"source": [
"%%time\n",
"results = ridge_detection(data, 0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# %timeit -r 7 -n 1 ridge_detection(data, 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba (CUDA Python)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"@cuda.jit\n",
"def cuda_ridge_detection(f, count, thres):\n",
" start_i, start_j = cuda.grid(2)\n",
" stride_i, stride_j = cuda.gridsize(2)\n",
" for i in range(start_i, f.shape[0], stride_i):\n",
" for j in range(start_j, f.shape[1], stride_j):\n",
" if (\n",
" i > 0\n",
" and j > 0\n",
" and i < (f.shape[0] - 1)\n",
" and j < (f.shape[1] - 1)\n",
" and f[i, j] > thres\n",
" and ~math.isnan(f[i, j])\n",
" ):\n",
" step_i = i\n",
" step_j = j\n",
" for k in range(1000):\n",
" if (\n",
" step_i == 0\n",
" or step_j == 0\n",
" or step_i == (f.shape[0] - 1)\n",
" or step_j == (f.shape[1] - 1)\n",
" ):\n",
" break\n",
" index = 4\n",
" vmax = -np.inf\n",
" for ii in range(3):\n",
" for jj in range(3):\n",
" if f[step_i + ii - 1, step_j + jj - 1] > vmax:\n",
" vmax = f[step_i + ii - 1, step_j + jj - 1]\n",
" index = jj + 3 * ii\n",
" if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):\n",
" break\n",
" row = int(index / 3)\n",
" col = index % 3\n",
" cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)\n",
" step_i = step_i - 1 + row\n",
" step_j = step_j - 1 + col"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def test_func(data):\n",
" device_data = cuda.to_device(data)\n",
" device_results = cuda.device_array_like(device_data)\n",
" cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)\n",
" cuda_results = device_results.copy_to_host()\n",
" return cuda_results"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"cuda_results = test_func(data)\n",
"np.testing.assert_almost_equal(results, cuda_results)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%timeit -r 7 -n 1000 test_func(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speedup by 200,000+ times!!!!!\n",
"\n",
"- CPU-based solution cost 366 seconds (366000 ms)\n",
"- CUDA Python solution cost 0.00167 seconds (1.67 ms)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"219161.6766467066"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"366000 / 1.67"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment