Created
October 7, 2025 06:04
-
-
Save tedliosu/66d3d55d178695ecfb2857f5df95f845 to your computer and use it in GitHub Desktop.
OpenCV Canny Microbenchmark - CPU (multithreaded) vs GPU (OpenCL/UMat)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # cv2_canny_cpu_vs_opencl.py | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # Copyright (c) 2025 Yensong Ted Li | |
| # Licensed under the Apache License, Version 2.0 (see http://www.apache.org/licenses/LICENSE-2.0) | |
| # | |
| # About this script: | |
| # Microbenchmark of OpenCV Canny on the same image: CPU (multithreaded) vs GPU (OpenCL/UMat). | |
| # Uses preallocated buffers and blocked medians for stable per-frame latency (ms); | |
| # reports median/min/max/std and a CPU-GPU pixel-diff count; no GUI required. | |
| # Note: operator-level microbenchmark only — excludes I/O/transfer and end-to-end pipeline effects. | |
| # | |
| # Script drafted using ChatGPT assistance | |
| import argparse, sys, cv2, time, os, numpy as np | |
def blocked_timings(fn, repeats, block, warmup):
    """Time ``fn`` in blocks and return the mean per-call latency of each block.

    ``fn`` is first called ``warmup`` times untimed.  The ``repeats`` timed
    calls are then split into ``repeats // block`` full blocks of ``block``
    calls each, plus one shorter trailing block when ``repeats`` is not a
    multiple of ``block``.  Each block is timed as a whole with
    ``time.perf_counter`` and divided by the number of calls it contains.

    Args:
        fn: zero-argument callable to benchmark.
        repeats: total number of timed calls (>= 0).
        block: number of calls per timed block (>= 1).
        warmup: number of untimed calls made before measuring.

    Returns:
        1-D ``np.float64`` array with one entry per block, in milliseconds
        per call.  Empty when ``repeats`` is 0.

    Raises:
        ValueError: if ``block`` < 1 or ``repeats`` < 0.
    """
    # Without these checks divmod() misbehaves quietly: block == 0 raises a
    # bare ZeroDivisionError, a negative block measures nothing, and a
    # negative repeats yields a positive "tail" that still runs fn.
    if block < 1:
        raise ValueError(f"block must be >= 1, got {block}")
    if repeats < 0:
        raise ValueError(f"repeats must be >= 0, got {repeats}")

    for _ in range(warmup):
        fn()

    full_blocks, tail = divmod(repeats, block)
    block_sizes = [block] * full_blocks
    if tail:
        block_sizes.append(tail)

    times = []
    for size in block_sizes:
        t0 = time.perf_counter()
        for _ in range(size):
            fn()
        # seconds for the whole block -> milliseconds per call
        times.append((time.perf_counter() - t0) * 1000 / size)
    return np.array(times, dtype=np.float64)
def summarize_times(label, times):
    """Format one backend's per-block latencies as a single report line.

    Args:
        label: text placed before ``" median:"`` (may start with a newline).
        times: non-empty sequence of per-block latencies in milliseconds.

    Returns:
        A line with the median, min, max and sample standard deviation of
        ``times``.  With a single block the standard deviation is shown as
        ``n/a``.
    """
    times = np.asarray(times, dtype=np.float64)
    # The sample std (ddof=1) is undefined for one sample; NumPy would return
    # nan and emit a RuntimeWarning, so report "n/a" instead.
    if times.size > 1:
        spread = f"{np.std(times, ddof=1):7.3f}"
    else:
        spread = f"{'n/a':>7}"
    return (f"{label} median: {np.median(times):10.3f} ms "
            f"(blocks: min {np.min(times):7.3f}, max {np.max(times):7.3f}, "
            f"std {spread})")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--img-in", required=True, help="path to input image (grayscale loaded)")
    p.add_argument("--img-out", required=True, help="path to save CPU result")
    p.add_argument("--t1", type=float, default=30)
    p.add_argument("--t2", type=float, default=120)
    p.add_argument("--aperture", type=int, default=3, choices=[3, 5, 7])
    p.add_argument("--l2", action="store_true", help="enable L2 gradient (default off)")
    p.add_argument("--block", type=int, default=50)
    p.add_argument("--repeats", type=int, default=500)
    p.add_argument("--warmup", type=int, default=10)
    p.add_argument("--cpu-prealloc", action="store_true",
                   help="Use a preallocated dst for CPU Canny (faster if supported by your OpenCV build)")
    args = p.parse_args()

    # Reject sizes that would otherwise surface later as a ZeroDivisionError
    # or as statistics computed over an empty array.
    if args.block < 1:
        p.error("--block must be >= 1")
    if args.repeats < 1:
        p.error("--repeats must be >= 1")
    if args.warmup < 0:
        p.error("--warmup must be >= 0")

    # Resolve symlinks and normalise case so two spellings of the same file
    # are detected before the input gets overwritten.
    in_path = os.path.normcase(os.path.realpath(args.img_in))
    out_path = os.path.normcase(os.path.realpath(args.img_out))
    if in_path == out_path:
        raise ValueError("Input and output paths must differ.")

    img = cv2.imread(args.img_in, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"Error reading file {args.img_in}")
    H, W = img.shape
    print(f"\nOpenCV {cv2.__version__}")
    print(f"Image: {W}×{H}")

    # --- CPU (multithread) ---
    cv2.setUseOptimized(True)
    cv2.setNumThreads(-1)  # let OpenCV use all cores
    out_cpu_buf = np.empty_like(img)

    def cpu_once_return():
        return cv2.Canny(img, args.t1, args.t2, None, args.aperture, args.l2)

    def cpu_once_prealloc():
        # Return what OpenCV hands back instead of assuming it wrote into
        # out_cpu_buf: if the binding cannot reuse the supplied array, the
        # returned array is the one holding the edges.
        return cv2.Canny(img, args.t1, args.t2, out_cpu_buf, args.aperture, args.l2)

    # pick path (probe once if prealloc requested)
    cpu_once = cpu_once_return
    if args.cpu_prealloc:
        try:
            cpu_once_prealloc()
        except Exception:
            # Deliberate best-effort: any failure of the probe call means the
            # prealloc signature is unusable here, so fall back.
            print("Note: prealloc path not supported by this OpenCV build; using return-alloc.")
        else:
            cpu_once = cpu_once_prealloc

    cpu_times = blocked_timings(cpu_once, args.repeats, args.block, args.warmup)
    out_cpu = cpu_once()  # final result to compare/save
    print(summarize_times("\nCPU (multithread)", cpu_times))

    # --- GPU (OpenCL / UMat) ---
    if not cv2.ocl.haveOpenCL():
        raise SystemExit("This OpenCV build lacks OpenCL support.")
    cv2.ocl.setUseOpenCL(True)
    if not cv2.ocl.useOpenCL():
        # Otherwise UMat calls would silently run on the CPU and the "GPU"
        # numbers below would be meaningless.
        raise SystemExit("OpenCL is present but could not be enabled (no usable device).")
    dev = cv2.ocl.Device.getDefault()
    if dev is not None:
        # Classify by the reported device type rather than by vendor: vendor
        # checks label CPU OpenCL runtimes as GPUs and miss other GPU vendors.
        kind = "GPU" if dev.type() & cv2.ocl.Device_TYPE_GPU else "CL"
        print(f"OpenCL device: {dev.name()} ({kind})")

    u_in = cv2.UMat(img)  # upload once
    u_dst = cv2.UMat(np.empty_like(img))  # preallocate once

    def gpu_once():
        return cv2.Canny(u_in, args.t1, args.t2, u_dst, args.aperture, args.l2)

    # warm + measure
    for _ in range(args.warmup):
        gpu_once()
    cv2.ocl.finish()

    full_blocks, tail = divmod(args.repeats, args.block)
    block_sizes = [args.block] * full_blocks
    if tail:
        block_sizes.append(tail)

    gpu_times = []
    for size in block_sizes:
        t0 = time.perf_counter()
        for _ in range(size):
            gpu_once()
        # Drain the OpenCL queue so the block time covers finished work,
        # not just enqueueing.
        cv2.ocl.finish()
        gpu_times.append((time.perf_counter() - t0) * 1000 / size)
    gpu_times = np.array(gpu_times, dtype=np.float64)

    # One extra untimed call, mirroring the CPU path, so the compared image
    # is the one OpenCV actually returned.
    out_gpu = gpu_once().get()

    # Use absolute diff so we count all mismatches
    diffs = cv2.countNonZero(cv2.absdiff(out_cpu, out_gpu))
    print(summarize_times("GPU (OpenCL)", gpu_times))
    print(f"\nRan {args.repeats} iterations (block {args.block}) for both CPU and GPU")
    print(f"Differing pixels (CPU vs GPU): {diffs}")

    if not cv2.imwrite(args.img_out, out_cpu):
        raise OSError(f"Failed to save CPU result to {args.img_out}")
    print(f"CPU Canny result successfully saved to {args.img_out}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment