Skip to content

Instantly share code, notes, and snippets.

@elegos
Created September 15, 2025 08:50
Show Gist options
  • Select an option

  • Save elegos/7d9349a417e6a135c97884d0d4f65230 to your computer and use it in GitHub Desktop.

Select an option

Save elegos/7d9349a417e6a135c97884d0d4f65230 to your computer and use it in GitHub Desktop.
ffmpeg + CUDA 13

Intro

I struggled to make ffmpeg work with the CUDA 13 SDK, being a complete novice at GPU development.

As far as I understand, ffmpeg still uses the non-context versions of various CUDA (NPP) functions, which are deprecated in the CUDA 13 SDK.

I used an AI assistant to "make it compile", replacing the non-context functions with their context (`_Ctx`) counterparts.

This is a naive patch, mostly because it will always address the GPU with index 0 (in multi-GPU context, this might be a limitation).

Prerequisites

You need to install the CUDA 13 SDK and the related ffmpeg dependencies (the configure command will report any that are missing; that is how I found them).

Preparation

The patch targets ffmpeg's git branch "release/8.0". The compile script configures ffmpeg accordingly, enabling CUDA-powered encoders and filters.

Execution

chmod +x compile.sh
./compile.sh
sudo make install

ffmpeg and the rest of the binaries will be in /opt/ffmpeg_cuda/bin.

Switch between "ffmpeg" and "ffmpeg (cuda)"

You can use update-alternatives to switch between the system's ffmpeg and this one, as some system software might rely on a particular version that is possibly incompatible with this release.

# At least in my installation, /usr/local/bin is before /usr/bin in PATH, where ffmpeg is installed
# Take care about avoiding overwriting the original installed binary!
sudo update-alternatives --install /usr/local/bin/ffmpeg ffmpeg /opt/ffmpeg_cuda/bin/ffmpeg 100
sudo update-alternatives --install /usr/local/bin/ffmpeg ffmpeg /usr/bin/ffmpeg 50
sudo update-alternatives --config ffmpeg
#!/usr/bin/env bash
# Configure and build a static ffmpeg with CUDA / NPP support.
#
# Run from the root of an ffmpeg source checkout. The build is installed
# under /opt/ffmpeg_cuda by a subsequent `sudo make install`.
#
# Optional environment overrides:
#   CUDA_ARCH  GPU compute capability to generate device code for, written
#              without the dot (e.g. 86, 89). Defaults to 89.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

CUDA_ARCH="${CUDA_ARCH:-89}"

# This is where, on Fedora at least, the cuda sdk installer installs the required pkgconfig, which seems not to be in the standard PKG_CONFIG_PATH.
# Only append the previous value when there is one, so an unset variable does
# not leave a trailing ':' (an empty search-path entry) behind.
export PKG_CONFIG_PATH="/usr/local/lib/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}"

./configure \
--prefix=/opt/ffmpeg_cuda \
--enable-nonfree \
--enable-cuda \
--enable-cuda-nvcc \
--enable-libnpp \
--extra-libs="-lcudart -lnppial -lnppicc -lnppig -lnppc" \
--extra-cflags=-I/usr/local/cuda/include \
--extra-ldflags=-L/usr/local/cuda/lib64 \
--disable-shared \
--enable-static \
--nvccflags="-gencode arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH} -O2" \
--enable-libx265 --enable-gpl

# Build with one job per processor installed on the machine.
make -j"$(nproc --all)"
diff --git a/libavfilter/vf_scale_npp.c b/libavfilter/vf_scale_npp.c
index 1b1b7b9fc9..2346b3b81d 100644
--- a/libavfilter/vf_scale_npp.c
+++ b/libavfilter/vf_scale_npp.c
@@ -22,6 +22,7 @@
*/
#include <nppi.h>
+#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h>
@@ -696,12 +697,21 @@ static int nppscale_deinterleave(AVFilterContext *ctx, NPPScaleStageContext *sta
AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data;
NppStatus err;
+ int device_id = 0;
+ cudaGetDevice(&device_id);
+ CUstream cu_stream = 0;
+ NppStreamContext nppStreamCtx = {0};
+ nppStreamCtx.hStream = cu_stream;
+ nppStreamCtx.nCudaDeviceId = device_id;
+
switch (in_frames_ctx->sw_format) {
case AV_PIX_FMT_NV12:
- err = nppiYCbCr420_8u_P2P3R(in->data[0], in->linesize[0],
- in->data[1], in->linesize[1],
- out->data, out->linesize,
- (NppiSize){ in->width, in->height });
+ err = nppiYCbCr420_8u_P2P3R_Ctx(
+ in->data[0], in->linesize[0],
+ in->data[1], in->linesize[1],
+ out->data, out->linesize,
+ (NppiSize){ in->width, in->height },
+ nppStreamCtx);
break;
default:
return AVERROR_BUG;
@@ -721,18 +731,26 @@ static int nppscale_resize(AVFilterContext *ctx, NPPScaleStageContext *stage,
NppStatus err;
int i;
+ CUstream cu_stream = 0;
+
+ NppStreamContext nppStreamCtx = {0};
+ nppStreamCtx.hStream = cu_stream;
+ nppStreamCtx.nCudaDeviceId = 0;
+ nppStreamCtx.nMultiProcessorCount = 0;
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0;
+
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
int iw = stage->planes_in[i].width;
int ih = stage->planes_in[i].height;
int ow = stage->planes_out[i].width;
int oh = stage->planes_out[i].height;
- err = nppiResizeSqrPixel_8u_C1R(in->data[i], (NppiSize){ iw, ih },
- in->linesize[i], (NppiRect){ 0, 0, iw, ih },
- out->data[i], out->linesize[i],
- (NppiRect){ 0, 0, ow, oh },
- (double)ow / iw, (double)oh / ih,
- 0.0, 0.0, s->interp_algo);
+ err = nppiResizeSqrPixel_8u_C1R_Ctx(in->data[i], (NppiSize){ iw, ih },
+ in->linesize[i], (NppiRect){ 0, 0, iw, ih },
+ out->data[i], out->linesize[i],
+ (NppiRect){ 0, 0, ow, oh },
+ (double)ow / iw, (double)oh / ih,
+ 0.0, 0.0, s->interp_algo, nppStreamCtx);
if (err != NPP_SUCCESS) {
av_log(ctx, AV_LOG_ERROR, "NPP resize error: %d\n", err);
return AVERROR_UNKNOWN;
@@ -748,13 +766,22 @@ static int nppscale_interleave(AVFilterContext *ctx, NPPScaleStageContext *stage
AVHWFramesContext *out_frames_ctx = (AVHWFramesContext*)out->hw_frames_ctx->data;
NppStatus err;
+ int device_id = 0;
+ cudaGetDevice(&device_id);
+ CUstream cu_stream = 0;
+ NppStreamContext nppStreamCtx = {0};
+ nppStreamCtx.hStream = cu_stream;
+ nppStreamCtx.nCudaDeviceId = device_id;
+
switch (out_frames_ctx->sw_format) {
case AV_PIX_FMT_NV12:
- err = nppiYCbCr420_8u_P3P2R((const uint8_t**)in->data,
- in->linesize,
- out->data[0], out->linesize[0],
- out->data[1], out->linesize[1],
- (NppiSize){ in->width, in->height });
+ err = nppiYCbCr420_8u_P3P2R_Ctx(
+ (const uint8_t**)in->data,
+ in->linesize,
+ out->data[0], out->linesize[0],
+ out->data[1], out->linesize[1],
+ (NppiSize){ in->width, in->height },
+ nppStreamCtx);
break;
default:
return AVERROR_BUG;
diff --git a/libavfilter/vf_sharpen_npp.c b/libavfilter/vf_sharpen_npp.c
index c7769f5837..66ad86df06 100644
--- a/libavfilter/vf_sharpen_npp.c
+++ b/libavfilter/vf_sharpen_npp.c
@@ -161,13 +161,24 @@ static int nppsharpen_sharpen(AVFilterContext* ctx, AVFrame* out, AVFrame* in)
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in_ctx->sw_format);
+ NppStreamContext nppStreamCtx = {0};
+ CUcontext cu_ctx;
+ CUstream cu_stream = 0;
+
+ cu_ctx = ((AVCUDADeviceContext*)in_ctx->device_ctx->hwctx)->cuda_ctx;
+
+ nppStreamCtx.hStream = (CUstream)cu_stream;
+ nppStreamCtx.nCudaDeviceId = 0;
+ nppStreamCtx.nMultiProcessorCount = 0;
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0;
+
for (int i = 0; i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
int ow = AV_CEIL_RSHIFT(in->width, (i == 1 || i == 2) ? desc->log2_chroma_w : 0);
int oh = AV_CEIL_RSHIFT(in->height, (i == 1 || i == 2) ? desc->log2_chroma_h : 0);
- NppStatus err = nppiFilterSharpenBorder_8u_C1R(
+ NppStatus err = nppiFilterSharpenBorder_8u_C1R_Ctx(
in->data[i], in->linesize[i], (NppiSize){ow, oh}, (NppiPoint){0, 0},
- out->data[i], out->linesize[i], (NppiSize){ow, oh}, s->border_type);
+ out->data[i], out->linesize[i], (NppiSize){ow, oh}, s->border_type, nppStreamCtx);
if (err != NPP_SUCCESS) {
av_log(ctx, AV_LOG_ERROR, "NPP sharpen error: %d\n", err);
return AVERROR_EXTERNAL;
diff --git a/libavfilter/vf_transpose_npp.c b/libavfilter/vf_transpose_npp.c
index e781d7c58b..f70dfb62f1 100644
--- a/libavfilter/vf_transpose_npp.c
+++ b/libavfilter/vf_transpose_npp.c
@@ -296,6 +296,14 @@ static int npptranspose_rotate(AVFilterContext *ctx, NPPTransposeStageContext *s
NppStatus err;
int i;
+ CUstream cu_stream = 0;
+
+ NppStreamContext nppStreamCtx = {0};
+ nppStreamCtx.hStream = cu_stream;
+ nppStreamCtx.nCudaDeviceId = 0;
+ nppStreamCtx.nMultiProcessorCount = 0;
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0;
+
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
int iw = stage->planes_in[i].width;
int ih = stage->planes_in[i].height;
@@ -309,11 +317,12 @@ static int npptranspose_rotate(AVFilterContext *ctx, NPPTransposeStageContext *s
int shiftw = (s->dir == NPP_TRANSPOSE_CLOCK || s->dir == NPP_TRANSPOSE_CLOCK_FLIP) ? ow - 1 : 0;
int shifth = (s->dir == NPP_TRANSPOSE_CCLOCK || s->dir == NPP_TRANSPOSE_CLOCK_FLIP) ? oh - 1 : 0;
- err = nppiRotate_8u_C1R(in->data[i], (NppiSize){ iw, ih },
- in->linesize[i], (NppiRect){ 0, 0, iw, ih },
- out->data[i], out->linesize[i],
- (NppiRect){ 0, 0, ow, oh },
- angle, shiftw, shifth, NPPI_INTER_NN);
+ err = nppiRotate_8u_C1R_Ctx(
+ in->data[i], (NppiSize){ iw, ih },
+ in->linesize[i], (NppiRect){ 0, 0, iw, ih },
+ out->data[i], out->linesize[i],
+ (NppiRect){ 0, 0, ow, oh },
+ angle, shiftw, shifth, NPPI_INTER_NN, nppStreamCtx);
if (err != NPP_SUCCESS) {
av_log(ctx, AV_LOG_ERROR, "NPP rotate error: %d\n", err);
return AVERROR_UNKNOWN;
@@ -329,13 +338,22 @@ static int npptranspose_transpose(AVFilterContext *ctx, NPPTransposeStageContext
NppStatus err;
int i;
+ int device_id = 0;
+ cudaGetDevice(&device_id);
+ CUstream cu_stream = 0;
+ NppStreamContext nppStreamCtx = {0};
+ nppStreamCtx.hStream = cu_stream;
+ nppStreamCtx.nCudaDeviceId = device_id;
+
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) {
int iw = stage->planes_in[i].width;
int ih = stage->planes_in[i].height;
- err = nppiTranspose_8u_C1R(in->data[i], in->linesize[i],
- out->data[i], out->linesize[i],
- (NppiSize){ iw, ih });
+ err = nppiTranspose_8u_C1R_Ctx(
+ in->data[i], in->linesize[i],
+ out->data[i], out->linesize[i],
+ (NppiSize){ iw, ih },
+ nppStreamCtx);
if (err != NPP_SUCCESS) {
av_log(ctx, AV_LOG_ERROR, "NPP transpose error: %d\n", err);
return AVERROR_UNKNOWN;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment