Skip to content

Instantly share code, notes, and snippets.

@chychen
Last active August 27, 2020 15:16
Show Gist options
  • Select an option

  • Save chychen/e7d89c5bf891be95dbceddfeeabada56 to your computer and use it in GitHub Desktop.

Select an option

Save chychen/e7d89c5bf891be95dbceddfeeabada56 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Numba CUDA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from numba import cuda\n",
"import numpy as np\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = np.load('example_data.npy')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba (CUDA Python)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"@cuda.jit\n",
"def cuda_ridge_detection(f, count, thres):\n",
" start_i, start_j = cuda.grid(2)\n",
" stride_i, stride_j = cuda.gridsize(2)\n",
" for i in range(start_i, f.shape[0], stride_i):\n",
" for j in range(start_j, f.shape[1], stride_j):\n",
" if (\n",
" i > 0\n",
" and j > 0\n",
" and i < (f.shape[0] - 1)\n",
" and j < (f.shape[1] - 1)\n",
" and f[i, j] > thres\n",
" and ~math.isnan(f[i, j])\n",
" ):\n",
" step_i = i\n",
" step_j = j\n",
" for k in range(1000):\n",
" if (\n",
" step_i == 0\n",
" or step_j == 0\n",
" or step_i == (f.shape[0] - 1)\n",
" or step_j == (f.shape[1] - 1)\n",
" ):\n",
" break\n",
" index = 4\n",
" vmax = -np.inf\n",
" for ii in range(3):\n",
" for jj in range(3):\n",
" if f[step_i + ii - 1, step_j + jj - 1] > vmax:\n",
" vmax = f[step_i + ii - 1, step_j + jj - 1]\n",
" index = jj + 3 * ii\n",
" if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):\n",
" break\n",
" row = int(index / 3)\n",
" col = index % 3\n",
" cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)\n",
" step_i = step_i - 1 + row\n",
" step_j = step_j - 1 + col"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def test_func(data):\n",
" device_data = cuda.to_device(data)\n",
" device_results = cuda.device_array_like(device_data)\n",
" cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)\n",
" cuda_results = device_results.copy_to_host()\n",
" return cuda_results"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"cuda_results = test_func(data)\n",
"np.testing.assert_almost_equal(results, cuda_results)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
]
}
],
"source": [
"%timeit -r 7 -n 1000 test_func(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Speedup by 200,000+ times!!!!!\n",
"\n",
"- CPU-based solution cost 366 seconds (366000 ms)\n",
"- CUDA Python solution cost 0.00167 seconds (1.67 ms)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"219161.6766467066"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"366000 / 1.67"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment