Skip to content

Instantly share code, notes, and snippets.

@tedliosu
Created October 7, 2025 06:04
Show Gist options
  • Select an option

  • Save tedliosu/66d3d55d178695ecfb2857f5df95f845 to your computer and use it in GitHub Desktop.

Select an option

Save tedliosu/66d3d55d178695ecfb2857f5df95f845 to your computer and use it in GitHub Desktop.
OpenCV Canny Microbenchmark - CPU (multithreaded) vs GPU (OpenCL/UMat)
# cv2_canny_cpu_vs_opencl.py
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2025 Yensong Ted Li
# Licensed under the Apache License, Version 2.0 (see http://www.apache.org/licenses/LICENSE-2.0)
#
# About this script:
# Microbenchmark of OpenCV Canny on the same image: CPU (multithreaded) vs GPU (OpenCL/UMat).
# Times calls in blocks and reports per-call latency (ms) as the median/min/max/std of the
# per-block means; the GPU output buffer is preallocated (the CPU one only with --cpu-prealloc).
# Also reports a CPU-GPU pixel-diff count; no GUI required.
# Note: operator-level microbenchmark only — excludes I/O/transfer and end-to-end pipeline effects.
#
# Script drafted using ChatGPT assistance
import argparse, sys, cv2, time, os, numpy as np
def blocked_timings(fn, repeats, block, warmup):
    """Time ``fn`` in blocks and return the mean per-call latency of each block.

    ``fn`` is first called ``warmup`` times untimed. It is then called
    ``repeats`` times in total, grouped into consecutive blocks of ``block``
    calls, with one final shorter block for any remainder. Each block is timed
    as a whole with ``time.perf_counter`` and the elapsed time is divided by
    the number of calls in that block.

    Args:
        fn: Zero-argument callable to benchmark; its return value is ignored.
        repeats: Total number of timed calls (must be >= 0).
        block: Number of calls per timed block (must be >= 1).
        warmup: Number of untimed calls made first (values <= 0 mean none).

    Returns:
        1-D ``float64`` NumPy array with one entry per block, each the average
        number of milliseconds per call in that block. Empty when ``repeats``
        is 0.

    Raises:
        ValueError: If ``block`` < 1 or ``repeats`` < 0.
    """
    # Reject bad sizes before any call is made. block == 0 would otherwise
    # surface as a ZeroDivisionError from divmod after the warm-up already
    # ran, and because divmod floors, a negative block silently produces no
    # samples while a negative repeats produces a spurious tail block.
    if block < 1:
        raise ValueError(f"block must be >= 1, got {block}")
    if repeats < 0:
        raise ValueError(f"repeats must be >= 0, got {repeats}")

    for _ in range(warmup):
        fn()

    # Build the list of block sizes once so full blocks and the tail share
    # the same timing loop.
    full_blocks, tail = divmod(repeats, block)
    block_sizes = [block] * full_blocks
    if tail:
        block_sizes.append(tail)

    times = []
    for size in block_sizes:
        t0 = time.perf_counter()
        for _ in range(size):
            fn()
        # Seconds -> milliseconds, averaged over the calls in this block.
        times.append((time.perf_counter() - t0) * 1000 / size)
    return np.array(times, dtype=np.float64)
def main():
    """Benchmark ``cv2.Canny`` on the CPU and on OpenCL (UMat) for one image.

    Parses the command line, loads the input image as grayscale, times the
    CPU path with ``blocked_timings``, times the UMat path with an explicit
    ``cv2.ocl.finish()`` at the end of every block, prints per-block latency
    statistics for both, prints the number of pixels where the two outputs
    differ, and writes the CPU result to ``--img-out``.

    Raises:
        ValueError: If the input and output paths refer to the same file.
        OSError: If the input cannot be read or the output cannot be written.
        SystemExit: On invalid arguments, or if OpenCL is missing or cannot
            be enabled.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--img-in", required=True, help="path to input image (grayscale loaded)")
    p.add_argument("--img-out", required=True, help="path to save CPU result")
    p.add_argument("--t1", type=float, default=30)
    p.add_argument("--t2", type=float, default=120)
    p.add_argument("--aperture", type=int, default=3, choices=[3, 5, 7])
    p.add_argument("--l2", action="store_true", help="enable L2 gradient (default off)")
    p.add_argument("--block", type=int, default=50)
    p.add_argument("--repeats", type=int, default=500)
    p.add_argument("--warmup", type=int, default=10)
    p.add_argument("--cpu-prealloc", action="store_true",
                   help="Use a preallocated dst for CPU Canny (faster if supported by your OpenCV build)")
    args = p.parse_args()

    # Fail early with a usage message instead of a ZeroDivisionError in the
    # timing loops (block == 0) or a ValueError from np.min on an empty
    # sample array (repeats == 0).
    if args.block < 1:
        p.error("--block must be >= 1")
    if args.repeats < 1:
        p.error("--repeats must be >= 1")

    def canonical(path):
        # realpath makes the path absolute and resolves symlinks; normcase is
        # applied last so it sees the final path.
        return os.path.normcase(os.path.realpath(path))

    if canonical(args.img_in) == canonical(args.img_out):
        raise ValueError("Input and output paths must differ.")

    img = cv2.imread(args.img_in, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise OSError(f"Error reading file {args.img_in}")
    height, width = img.shape
    print(f"\nOpenCV {cv2.__version__}")
    print(f"Image: {width}×{height}")

    def report(label, times):
        # The sample standard deviation (ddof=1) is undefined for a single
        # block; print nan directly rather than letting NumPy emit a
        # RuntimeWarning while computing the same nan.
        std = np.std(times, ddof=1) if times.size > 1 else float("nan")
        print(f"{label} median: {np.median(times):10.3f} ms "
              f"(blocks: min {np.min(times):7.3f}, max {np.max(times):7.3f}, "
              f"std {std:7.3f})")

    # --- CPU (multithread) ---
    cv2.setUseOptimized(True)
    cv2.setNumThreads(-1)  # let OpenCV use all cores
    out_cpu_buf = np.empty_like(img)

    def cpu_once_return():
        return cv2.Canny(img, args.t1, args.t2, None, args.aperture, args.l2)

    def cpu_once_prealloc():
        # Return what OpenCV hands back instead of the buffer itself: if the
        # binding allocates a fresh array rather than writing into
        # out_cpu_buf, the buffer would still hold uninitialised data.
        return cv2.Canny(img, args.t1, args.t2, out_cpu_buf, args.aperture, args.l2)

    # pick path (probe once if prealloc requested)
    cpu_once = cpu_once_return
    if args.cpu_prealloc:
        try:
            cpu_once_prealloc()
        except Exception:
            # Deliberately broad: any failure of the probe just means we fall
            # back to the allocating path.
            print("Note: prealloc path not supported by this OpenCV build; using return-alloc.")
        else:
            cpu_once = cpu_once_prealloc

    cpu_times = blocked_timings(cpu_once, args.repeats, args.block, args.warmup)
    out_cpu = cpu_once()  # final result to compare/save
    report("\nCPU (multithread)", cpu_times)

    # --- GPU (OpenCL / UMat) ---
    cv2.ocl.setUseOpenCL(True)
    if not cv2.ocl.haveOpenCL():
        raise SystemExit("This OpenCV build lacks OpenCL support.")
    if not cv2.ocl.useOpenCL():
        # Without this check the UMat calls below could run without OpenCL
        # and the "GPU" numbers would be meaningless.
        raise SystemExit("OpenCL is present but could not be enabled.")

    dev = cv2.ocl.Device.getDefault()
    if dev is not None:
        # Classify by device type, not vendor: AMD/NVIDIA/Intel also ship
        # non-GPU OpenCL devices. TYPE_GPU is bit 2 in cv::ocl::Device; the
        # literal is a fallback in case the constant is not exported.
        gpu_bit = getattr(cv2.ocl, "Device_TYPE_GPU", 1 << 2)
        kind = "GPU" if dev.type() & gpu_bit else "CL"
        print(f"OpenCL device: {dev.name()} ({kind})")

    u_in = cv2.UMat(img)  # upload once
    u_dst = cv2.UMat(np.empty_like(img))  # preallocate once

    def gpu_once():
        cv2.Canny(u_in, args.t1, args.t2, u_dst, args.aperture, args.l2)

    # warm + measure
    for _ in range(args.warmup):
        gpu_once()
    cv2.ocl.finish()

    full_blocks, tail = divmod(args.repeats, args.block)
    block_sizes = [args.block] * full_blocks
    if tail:
        block_sizes.append(tail)

    gpu_times = []
    for size in block_sizes:
        t0 = time.perf_counter()
        for _ in range(size):
            gpu_once()
        # Wait for outstanding OpenCL work before stopping the clock so it is
        # counted in this block.
        cv2.ocl.finish()
        gpu_times.append((time.perf_counter() - t0) * 1000 / size)
    gpu_times = np.array(gpu_times, dtype=np.float64)
    out_gpu = u_dst.get()

    # Use absolute diff so we count all mismatches
    diffs = cv2.countNonZero(cv2.absdiff(out_cpu, out_gpu))
    report("GPU (OpenCL)", gpu_times)
    print(f"\nRan {args.repeats} iterations (block {args.block}) for both CPU and GPU")
    print(f"Differing pixels (CPU vs GPU): {diffs}")

    if not cv2.imwrite(args.img_out, out_cpu):
        raise OSError(f"Failed to save CPU result to {args.img_out}")
    print(f"CPU Canny result successfully saved to {args.img_out}\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment