chychen · October 17, 2022 08:39
diff --git a/accelerate_ridge_detection.ipynb b/accelerate_ridge_detection.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Numba CUDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import cuda\n",
    "import numpy as np\n",
    "import math\n",
    "import gdown\n",
    "gdown.download('https://drive.google.com/uc?id=1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD', 'example_data.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.load('example_data.npy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Original (CPU-based)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ridge_detection(f, thres):\n",
    "    count = np.zeros(f.shape)\n",
    "    for i in range(len(f)):\n",
    "        for j in range(len(f[i])):\n",
    "            if (\n",
    "                i > 0\n",
    "                and j > 0\n",
    "                and i < (len(f) - 1)\n",
    "                and j < (len(f[i]) - 1)\n",
    "                and f[i, j] > thres\n",
    "                and ~np.isnan(f[i, j])\n",
    "            ):\n",
    "                step_i = i\n",
    "                step_j = j\n",
    "                for k in range(1000):\n",
    "                    if (\n",
    "                        step_i == 0\n",
    "                        or step_j == 0\n",
    "                        or step_i == (len(f) - 1)\n",
    "                        or step_j == (len(f[i]) - 1)\n",
    "                    ):\n",
    "                        break\n",
    "                    index = np.nanargmax(\n",
    "                        f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
    "                    )\n",
    "                    vmax = np.nanmax(\n",
    "                        f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
    "                    )\n",
    "                    if index == 4 or vmax == f[step_i, step_j] or np.isnan(vmax):\n",
    "                        break\n",
    "                    row = int(index / 3)\n",
    "                    col = index % 3\n",
    "                    count[step_i - 1 + row, step_j - 1 + col] += 1\n",
    "                    step_i = step_i - 1 + row\n",
    "                    step_j = step_j - 1 + col\n",
    "    return count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6min 14s, sys: 15.9 s, total: 6min 30s\n",
      "Wall time: 6min 6s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "results = ridge_detection(data, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %timeit -r 7 -n 1 ridge_detection(data, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Numba (CUDA Python)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def cuda_ridge_detection(f, count, thres):\n",
    "    start_i, start_j = cuda.grid(2)\n",
    "    stride_i, stride_j = cuda.gridsize(2)\n",
    "    for i in range(start_i, f.shape[0], stride_i):\n",
    "        for j in range(start_j, f.shape[1], stride_j):\n",
    "            if (\n",
    "                i > 0\n",
    "                and j > 0\n",
    "                and i < (f.shape[0] - 1)\n",
    "                and j < (f.shape[1] - 1)\n",
    "                and f[i, j] > thres\n",
    "                and ~math.isnan(f[i, j])\n",
    "            ):\n",
    "                step_i = i\n",
    "                step_j = j\n",
    "                for k in range(1000):\n",
    "                    if (\n",
    "                        step_i == 0\n",
    "                        or step_j == 0\n",
    "                        or step_i == (f.shape[0] - 1)\n",
    "                        or step_j == (f.shape[1] - 1)\n",
    "                    ):\n",
    "                        break\n",
    "                    index = 4\n",
    "                    vmax = -np.inf\n",
    "                    for ii in range(3):\n",
    "                        for jj in range(3):\n",
    "                            if f[step_i + ii - 1, step_j + jj - 1] > vmax:\n",
    "                                vmax = f[step_i + ii - 1, step_j + jj - 1]\n",
    "                                index = jj + 3 * ii\n",
    "                    if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):\n",
    "                        break\n",
    "                    row = int(index / 3)\n",
    "                    col = index % 3\n",
    "                    cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)\n",
    "                    step_i = step_i - 1 + row\n",
    "                    step_j = step_j - 1 + col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_func(data):\n",
    "    device_data = cuda.to_device(data)\n",
    "    device_results = cuda.device_array_like(device_data)\n",
    "    cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)\n",
    "    cuda_results = device_results.copy_to_host()\n",
    "    return cuda_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "cuda_results = test_func(data)\n",
    "np.testing.assert_almost_equal(results, cuda_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit -r 7 -n 1000 test_func(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Speedup by 200,000+ times!!!!!\n",
    "\n",
    "- CPU-based solution cost 366 seconds (366000 ms)\n",
    "- CUDA Python solution cost 0.00167 seconds (1.67 ms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "219161.6766467066"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "366000 / 1.67"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Import Numba CUDA"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"from numba import cuda\n",
	"import numpy as np\n",
	"import math\n",
	"import gdown\n",
	"gdown.download('https://drive.google.com/uc?id=1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD', 'example_data.npy')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = np.load('example_data.npy')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Original (CPU-based)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def ridge_detection(f, thres):\n",
	" count = np.zeros(f.shape)\n",
	" for i in range(len(f)):\n",
	" for j in range(len(f[i])):\n",
	" if (\n",
	" i > 0\n",
	" and j > 0\n",
	" and i < (len(f) - 1)\n",
	" and j < (len(f[i]) - 1)\n",
	" and f[i, j] > thres\n",
	" and ~np.isnan(f[i, j])\n",
	" ):\n",
	" step_i = i\n",
	" step_j = j\n",
	" for k in range(1000):\n",
	" if (\n",
	" step_i == 0\n",
	" or step_j == 0\n",
	" or step_i == (len(f) - 1)\n",
	" or step_j == (len(f[i]) - 1)\n",
	" ):\n",
	" break\n",
	" index = np.nanargmax(\n",
	" f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
	" )\n",
	" vmax = np.nanmax(\n",
	" f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
	" )\n",
	" if index == 4 or vmax == f[step_i, step_j] or np.isnan(vmax):\n",
	" break\n",
	" row = int(index / 3)\n",
	" col = index % 3\n",
	" count[step_i - 1 + row, step_j - 1 + col] += 1\n",
	" step_i = step_i - 1 + row\n",
	" step_j = step_j - 1 + col\n",
	" return count"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 6min 14s, sys: 15.9 s, total: 6min 30s\n",
	"Wall time: 6min 6s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"results = ridge_detection(data, 0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# %timeit -r 7 -n 1 ridge_detection(data, 0)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Numba (CUDA Python)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"@cuda.jit\n",
	"def cuda_ridge_detection(f, count, thres):\n",
	" start_i, start_j = cuda.grid(2)\n",
	" stride_i, stride_j = cuda.gridsize(2)\n",
	" for i in range(start_i, f.shape[0], stride_i):\n",
	" for j in range(start_j, f.shape[1], stride_j):\n",
	" if (\n",
	" i > 0\n",
	" and j > 0\n",
	" and i < (f.shape[0] - 1)\n",
	" and j < (f.shape[1] - 1)\n",
	" and f[i, j] > thres\n",
	" and ~math.isnan(f[i, j])\n",
	" ):\n",
	" step_i = i\n",
	" step_j = j\n",
	" for k in range(1000):\n",
	" if (\n",
	" step_i == 0\n",
	" or step_j == 0\n",
	" or step_i == (f.shape[0] - 1)\n",
	" or step_j == (f.shape[1] - 1)\n",
	" ):\n",
	" break\n",
	" index = 4\n",
	" vmax = -np.inf\n",
	" for ii in range(3):\n",
	" for jj in range(3):\n",
	" if f[step_i + ii - 1, step_j + jj - 1] > vmax:\n",
	" vmax = f[step_i + ii - 1, step_j + jj - 1]\n",
	" index = jj + 3 * ii\n",
	" if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):\n",
	" break\n",
	" row = int(index / 3)\n",
	" col = index % 3\n",
	" cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)\n",
	" step_i = step_i - 1 + row\n",
	" step_j = step_j - 1 + col"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"def test_func(data):\n",
	" device_data = cuda.to_device(data)\n",
	" device_results = cuda.device_array_like(device_data)\n",
	" cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)\n",
	" cuda_results = device_results.copy_to_host()\n",
	" return cuda_results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"cuda_results = test_func(data)\n",
	"np.testing.assert_almost_equal(results, cuda_results)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit -r 7 -n 1000 test_func(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Speedup by 200,000+ times!!!!!\n",
	"\n",
	"- CPU-based solution cost 366 seconds (366000 ms)\n",
	"- CUDA Python solution cost 0.00167 seconds (1.67 ms)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"219161.6766467066"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"366000 / 1.67"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}
No results found