tedliosu · October 7, 2025 06:04
diff --git a/cv2_canny_cpu_vs_opencl.py b/cv2_canny_cpu_vs_opencl.py
 # cv2_canny_cpu_vs_opencl.py
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (c) 2025 Yensong Ted Li
 # Licensed under the Apache License, Version 2.0 (see http://www.apache.org/licenses/LICENSE-2.0)
 #
 # About this script:
 # Microbenchmark of OpenCV Canny on the same image: CPU (multithreaded) vs GPU (OpenCL/UMat).
 # Uses preallocated buffers and blocked medians for stable per-frame latency (ms);
 # reports median/min/max/std and a CPU-GPU pixel-diff count; no GUI required.
 # Note: operator-level microbenchmark only — excludes I/O/transfer and end-to-end pipeline effects.
 #
 # Script drafted using ChatGPT assistance
 import argparse, sys, cv2, time, os, numpy as np

 def blocked_timings(fn, repeats, block, warmup, sync=None):
     """Time repeated calls of ``fn`` in blocks and return per-call latencies.

     ``fn`` is first called ``warmup`` times without being timed. The
     ``repeats`` timed calls are then split into consecutive blocks of ``block``
     calls (plus one shorter trailing block when ``repeats`` is not a multiple
     of ``block``). Each block is timed as a whole with ``time.perf_counter``
     and contributes one sample: its elapsed time in milliseconds divided by
     the number of calls it contained.

     Args:
         fn: Zero-argument callable to benchmark.
         repeats: Total number of timed calls; must be >= 0.
         block: Number of calls per timed block; must be >= 1.
         warmup: Number of untimed calls made before measuring.
         sync: Optional zero-argument callable invoked after the warm-up and at
             the end of every timed block, inside the timed region. Intended
             for work that ``fn`` only enqueues, so the block time includes its
             completion. ``None`` (the default) disables it.

     Returns:
         A 1-D ``numpy.float64`` array with one mean per-call latency (ms) per
         block; empty when ``repeats`` is 0.

     Raises:
         ValueError: If ``block`` < 1 or ``repeats`` < 0.
     """
     # Validate before doing any work: divmod() below would raise
     # ZeroDivisionError for block == 0, and floor division with a negative
     # operand would silently skip blocks or invent a tail block.
     if block < 1:
         raise ValueError(f"block must be >= 1, got {block}")
     if repeats < 0:
         raise ValueError(f"repeats must be >= 0, got {repeats}")

     def _run(count):
         # Issue `count` calls back to back, then wait on the optional barrier.
         for _ in range(count):
             fn()
         if sync is not None:
             sync()

     _run(warmup)

     full_blocks, tail = divmod(repeats, block)
     block_sizes = [block] * full_blocks
     if tail:
         block_sizes.append(tail)

     times = []
     for size in block_sizes:
         t0 = time.perf_counter()
         _run(size)
         times.append((time.perf_counter() - t0) * 1000 / size)
     return np.array(times, dtype=np.float64)

 if __name__ == "__main__":
     # Entry point: run cv2.Canny on one grayscale image, first on a NumPy array
     # (CPU path) and then on UMat buffers (OpenCL path), print per-block latency
     # statistics for both, count the pixels where the two outputs differ, and
     # save the CPU edge map.
     p = argparse.ArgumentParser()
     p.add_argument("--img-in",  required=True, help="path to input image (grayscale loaded)")
     p.add_argument("--img-out", required=True, help="path to save CPU result")
     p.add_argument("--t1", type=float, default=30)
     p.add_argument("--t2", type=float, default=120)
     p.add_argument("--aperture", type=int, default=3, choices=[3,5,7])
     p.add_argument("--l2", action="store_true", help="enable L2 gradient (default off)")
     p.add_argument("--block", type=int, default=50)
     p.add_argument("--repeats", type=int, default=500)
     p.add_argument("--warmup", type=int, default=10)
     p.add_argument("--cpu-prealloc", action="store_true",
                    help="Use a preallocated dst for CPU Canny (faster if supported by your OpenCV build)")
     args = p.parse_args()

     # Reject loop counts the timing code cannot handle: block == 0 makes
     # divmod() raise ZeroDivisionError, and without at least one timed call
     # there are no samples for the median/min/max report below.
     if args.block < 1:
         p.error("--block must be >= 1")
     if args.repeats < 1:
         p.error("--repeats must be >= 1")

     # Refuse to write the edge map over the input image.
     if os.path.abspath(os.path.normcase(args.img_in)) == os.path.abspath(os.path.normcase(args.img_out)):
         raise ValueError("Input and output paths must differ.")

     img = cv2.imread(args.img_in, cv2.IMREAD_GRAYSCALE)
     if img is None:
         raise OSError(f"Error reading file {args.img_in}")
     H, W = img.shape
     print(f"\nOpenCV {cv2.__version__}")
     print(f"Image: {W}×{H}")

     def block_std(samples):
         # Sample standard deviation (ddof=1) needs at least two blocks; with a
         # single block report NaN directly instead of letting NumPy warn.
         return np.std(samples, ddof=1) if samples.size > 1 else float("nan")

     # --- CPU (multithread) ---
     cv2.setUseOptimized(True)
     cv2.setNumThreads(-1)  # let OpenCV use all cores

     out_cpu_buf = np.empty_like(img)
     def cpu_once_return():
         # Let OpenCV allocate and return a fresh output array on every call.
         return cv2.Canny(img, args.t1, args.t2, None, args.aperture, args.l2)

     def cpu_once_prealloc():
         # Pass the buffer allocated above as the destination argument.
         cv2.Canny(img, args.t1, args.t2, out_cpu_buf, args.aperture, args.l2)
         return out_cpu_buf

     # pick path (probe once if prealloc requested)
     if args.cpu_prealloc:
         try:
             _ = cpu_once_prealloc()
             cpu_once = cpu_once_prealloc
         except Exception:
             # Deliberate best-effort: any failure of the probe call falls back
             # to the allocate-and-return variant.
             print("Note: prealloc path not supported by this OpenCV build; using return-alloc.")
             cpu_once = cpu_once_return
     else:
         cpu_once = cpu_once_return

     cpu_times = blocked_timings(cpu_once, args.repeats, args.block, args.warmup)
     out_cpu = cpu_once()  # final result to compare/save

     print(f"\nCPU (multithread) median: {np.median(cpu_times):10.3f} ms   "
           f"(blocks: min {np.min(cpu_times):7.3f}, max {np.max(cpu_times):7.3f}, "
           f"std {block_std(cpu_times):7.3f})")

     # --- GPU (OpenCL / UMat) ---
     cv2.ocl.setUseOpenCL(True)
     if not cv2.ocl.haveOpenCL():
         raise SystemExit("This OpenCV build lacks OpenCL support.")
     if not cv2.ocl.useOpenCL():
         # The request above did not take effect, so the UMat timings below may
         # not come from an OpenCL device; warn instead of labelling them GPU
         # without comment.
         print("Warning: OpenCL could not be enabled; UMat timings may reflect a CPU fallback.")
     dev = cv2.ocl.Device.getDefault()
     if dev is not None:
         # NOTE(review): the GPU/CL label is derived from vendor checks only, not
         # from the device type — confirm that is the intended meaning.
         print(f"OpenCL device: {dev.name()} ({'GPU' if dev.isAMD() or dev.isNVidia() or dev.isIntel() else 'CL'})")

     u_in  = cv2.UMat(img)                 # upload once
     u_dst = cv2.UMat(np.empty_like(img))  # preallocate once
     def gpu_once():
         cv2.Canny(u_in, args.t1, args.t2, u_dst, args.aperture, args.l2)

     # warm + measure; cv2.ocl.finish() is called before each block's clock is
     # read so the measured interval ends after that call returns
     for _ in range(args.warmup): gpu_once()
     cv2.ocl.finish()
     gpu_times = []
     full_blocks, tail = divmod(args.repeats, args.block)
     for _ in range(full_blocks):
         t0 = time.perf_counter()
         for _ in range(args.block): gpu_once()
         cv2.ocl.finish()
         gpu_times.append((time.perf_counter() - t0) * 1000 / args.block)
     if tail:
         t0 = time.perf_counter()
         for _ in range(tail): gpu_once()
         cv2.ocl.finish()
         gpu_times.append((time.perf_counter() - t0) * 1000 / tail)
     gpu_times = np.array(gpu_times, dtype=np.float64)

     out_gpu = u_dst.get()
     # Use absolute diff so we count all mismatches
     diffs = cv2.countNonZero(cv2.absdiff(out_cpu, out_gpu))

     print(f"GPU (OpenCL)      median: {np.median(gpu_times):10.3f} ms   "
           f"(blocks: min {np.min(gpu_times):7.3f}, max {np.max(gpu_times):7.3f}, "
           f"std {block_std(gpu_times):7.3f})")
     print(f"\nRan {args.repeats} iterations (block {args.block}) for both CPU and GPU")
     print(f"Differing pixels (CPU vs GPU): {diffs}")

     if not cv2.imwrite(args.img_out, out_cpu):
         raise OSError(f"Failed to save CPU result to {args.img_out}")
     print(f"CPU Canny result successfully saved to {args.img_out}\n")
	# cv2_canny_cpu_vs_opencl.py
	# SPDX-License-Identifier: Apache-2.0
	# Copyright (c) 2025 Yensong Ted Li
	# Licensed under the Apache License, Version 2.0 (see http://www.apache.org/licenses/LICENSE-2.0)
	#
	# About this script:
	# Microbenchmark of OpenCV Canny on the same image: CPU (multithreaded) vs GPU (OpenCL/UMat).
	# Uses preallocated buffers and blocked medians for stable per-frame latency (ms);
	# reports median/min/max/std and a CPU-GPU pixel-diff count; no GUI required.
	# Note: operator-level microbenchmark only — excludes I/O/transfer and end-to-end pipeline effects.
	#
	# Script drafted using ChatGPT assistance
	import argparse, sys, cv2, time, os, numpy as np

	def blocked_timings(fn, repeats, block, warmup, sync=None):
	    """Time repeated calls of ``fn`` in blocks and return per-call latencies.

	    ``fn`` is first called ``warmup`` times without being timed. The
	    ``repeats`` timed calls are then split into consecutive blocks of ``block``
	    calls (plus one shorter trailing block when ``repeats`` is not a multiple
	    of ``block``). Each block is timed as a whole with ``time.perf_counter``
	    and contributes one sample: its elapsed time in milliseconds divided by
	    the number of calls it contained.

	    Args:
	        fn: Zero-argument callable to benchmark.
	        repeats: Total number of timed calls; must be >= 0.
	        block: Number of calls per timed block; must be >= 1.
	        warmup: Number of untimed calls made before measuring.
	        sync: Optional zero-argument callable invoked after the warm-up and at
	            the end of every timed block, inside the timed region. Intended
	            for work that ``fn`` only enqueues, so the block time includes its
	            completion. ``None`` (the default) disables it.

	    Returns:
	        A 1-D ``numpy.float64`` array with one mean per-call latency (ms) per
	        block; empty when ``repeats`` is 0.

	    Raises:
	        ValueError: If ``block`` < 1 or ``repeats`` < 0.
	    """
	    # Validate before doing any work: divmod() below would raise
	    # ZeroDivisionError for block == 0, and floor division with a negative
	    # operand would silently skip blocks or invent a tail block.
	    if block < 1:
	        raise ValueError(f"block must be >= 1, got {block}")
	    if repeats < 0:
	        raise ValueError(f"repeats must be >= 0, got {repeats}")

	    def _run(count):
	        # Issue `count` calls back to back, then wait on the optional barrier.
	        for _ in range(count):
	            fn()
	        if sync is not None:
	            sync()

	    _run(warmup)

	    full_blocks, tail = divmod(repeats, block)
	    block_sizes = [block] * full_blocks
	    if tail:
	        block_sizes.append(tail)

	    times = []
	    for size in block_sizes:
	        t0 = time.perf_counter()
	        _run(size)
	        times.append((time.perf_counter() - t0) * 1000 / size)
	    return np.array(times, dtype=np.float64)

	if __name__ == "__main__":
	    # Entry point: run cv2.Canny on one grayscale image, first on a NumPy array
	    # (CPU path) and then on UMat buffers (OpenCL path), print per-block latency
	    # statistics for both, count the pixels where the two outputs differ, and
	    # save the CPU edge map.
	    p = argparse.ArgumentParser()
	    p.add_argument("--img-in", required=True, help="path to input image (grayscale loaded)")
	    p.add_argument("--img-out", required=True, help="path to save CPU result")
	    p.add_argument("--t1", type=float, default=30)
	    p.add_argument("--t2", type=float, default=120)
	    p.add_argument("--aperture", type=int, default=3, choices=[3,5,7])
	    p.add_argument("--l2", action="store_true", help="enable L2 gradient (default off)")
	    p.add_argument("--block", type=int, default=50)
	    p.add_argument("--repeats", type=int, default=500)
	    p.add_argument("--warmup", type=int, default=10)
	    p.add_argument("--cpu-prealloc", action="store_true",
	                   help="Use a preallocated dst for CPU Canny (faster if supported by your OpenCV build)")
	    args = p.parse_args()

	    # Reject loop counts the timing code cannot handle: block == 0 makes
	    # divmod() raise ZeroDivisionError, and without at least one timed call
	    # there are no samples for the median/min/max report below.
	    if args.block < 1:
	        p.error("--block must be >= 1")
	    if args.repeats < 1:
	        p.error("--repeats must be >= 1")

	    # Refuse to write the edge map over the input image.
	    if os.path.abspath(os.path.normcase(args.img_in)) == os.path.abspath(os.path.normcase(args.img_out)):
	        raise ValueError("Input and output paths must differ.")

	    img = cv2.imread(args.img_in, cv2.IMREAD_GRAYSCALE)
	    if img is None:
	        raise OSError(f"Error reading file {args.img_in}")
	    H, W = img.shape
	    print(f"\nOpenCV {cv2.__version__}")
	    print(f"Image: {W}×{H}")

	    def block_std(samples):
	        # Sample standard deviation (ddof=1) needs at least two blocks; with a
	        # single block report NaN directly instead of letting NumPy warn.
	        return np.std(samples, ddof=1) if samples.size > 1 else float("nan")

	    # --- CPU (multithread) ---
	    cv2.setUseOptimized(True)
	    cv2.setNumThreads(-1)  # let OpenCV use all cores

	    out_cpu_buf = np.empty_like(img)
	    def cpu_once_return():
	        # Let OpenCV allocate and return a fresh output array on every call.
	        return cv2.Canny(img, args.t1, args.t2, None, args.aperture, args.l2)

	    def cpu_once_prealloc():
	        # Pass the buffer allocated above as the destination argument.
	        cv2.Canny(img, args.t1, args.t2, out_cpu_buf, args.aperture, args.l2)
	        return out_cpu_buf

	    # pick path (probe once if prealloc requested)
	    if args.cpu_prealloc:
	        try:
	            _ = cpu_once_prealloc()
	            cpu_once = cpu_once_prealloc
	        except Exception:
	            # Deliberate best-effort: any failure of the probe call falls back
	            # to the allocate-and-return variant.
	            print("Note: prealloc path not supported by this OpenCV build; using return-alloc.")
	            cpu_once = cpu_once_return
	    else:
	        cpu_once = cpu_once_return

	    cpu_times = blocked_timings(cpu_once, args.repeats, args.block, args.warmup)
	    out_cpu = cpu_once()  # final result to compare/save

	    print(f"\nCPU (multithread) median: {np.median(cpu_times):10.3f} ms "
	          f"(blocks: min {np.min(cpu_times):7.3f}, max {np.max(cpu_times):7.3f}, "
	          f"std {block_std(cpu_times):7.3f})")

	    # --- GPU (OpenCL / UMat) ---
	    cv2.ocl.setUseOpenCL(True)
	    if not cv2.ocl.haveOpenCL():
	        raise SystemExit("This OpenCV build lacks OpenCL support.")
	    if not cv2.ocl.useOpenCL():
	        # The request above did not take effect, so the UMat timings below may
	        # not come from an OpenCL device; warn instead of labelling them GPU
	        # without comment.
	        print("Warning: OpenCL could not be enabled; UMat timings may reflect a CPU fallback.")
	    dev = cv2.ocl.Device.getDefault()
	    if dev is not None:
	        # NOTE(review): the GPU/CL label is derived from vendor checks only, not
	        # from the device type — confirm that is the intended meaning.
	        print(f"OpenCL device: {dev.name()} ({'GPU' if dev.isAMD() or dev.isNVidia() or dev.isIntel() else 'CL'})")

	    u_in = cv2.UMat(img)                  # upload once
	    u_dst = cv2.UMat(np.empty_like(img))  # preallocate once
	    def gpu_once():
	        cv2.Canny(u_in, args.t1, args.t2, u_dst, args.aperture, args.l2)

	    # warm + measure; cv2.ocl.finish() is called before each block's clock is
	    # read so the measured interval ends after that call returns
	    for _ in range(args.warmup): gpu_once()
	    cv2.ocl.finish()
	    gpu_times = []
	    full_blocks, tail = divmod(args.repeats, args.block)
	    for _ in range(full_blocks):
	        t0 = time.perf_counter()
	        for _ in range(args.block): gpu_once()
	        cv2.ocl.finish()
	        gpu_times.append((time.perf_counter() - t0) * 1000 / args.block)
	    if tail:
	        t0 = time.perf_counter()
	        for _ in range(tail): gpu_once()
	        cv2.ocl.finish()
	        gpu_times.append((time.perf_counter() - t0) * 1000 / tail)
	    gpu_times = np.array(gpu_times, dtype=np.float64)

	    out_gpu = u_dst.get()
	    # Use absolute diff so we count all mismatches
	    diffs = cv2.countNonZero(cv2.absdiff(out_cpu, out_gpu))

	    print(f"GPU (OpenCL) median: {np.median(gpu_times):10.3f} ms "
	          f"(blocks: min {np.min(gpu_times):7.3f}, max {np.max(gpu_times):7.3f}, "
	          f"std {block_std(gpu_times):7.3f})")
	    print(f"\nRan {args.repeats} iterations (block {args.block}) for both CPU and GPU")
	    print(f"Differing pixels (CPU vs GPU): {diffs}")

	    if not cv2.imwrite(args.img_out, out_cpu):
	        raise OSError(f"Failed to save CPU result to {args.img_out}")
	    print(f"CPU Canny result successfully saved to {args.img_out}\n")
No results found