|
diff --git a/libavfilter/vf_scale_npp.c b/libavfilter/vf_scale_npp.c |
|
index 1b1b7b9fc9..2346b3b81d 100644 |
|
--- a/libavfilter/vf_scale_npp.c |
|
+++ b/libavfilter/vf_scale_npp.c |
|
@@ -22,6 +22,7 @@ |
|
*/ |
|
|
|
#include <nppi.h> |
|
+#include <cuda_runtime.h> |
|
#include <stdio.h> |
|
#include <string.h> |
|
|
|
@@ -696,12 +697,21 @@ static int nppscale_deinterleave(AVFilterContext *ctx, NPPScaleStageContext *sta |
|
AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)in->hw_frames_ctx->data; |
|
NppStatus err; |
|
|
|
+ int device_id = 0; |
|
+ cudaGetDevice(&device_id); |
|
+ CUstream cu_stream = 0; |
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ nppStreamCtx.hStream = cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = device_id; |
|
+ |
|
switch (in_frames_ctx->sw_format) { |
|
case AV_PIX_FMT_NV12: |
|
- err = nppiYCbCr420_8u_P2P3R(in->data[0], in->linesize[0], |
|
- in->data[1], in->linesize[1], |
|
- out->data, out->linesize, |
|
- (NppiSize){ in->width, in->height }); |
|
+ err = nppiYCbCr420_8u_P2P3R_Ctx( |
|
+ in->data[0], in->linesize[0], |
|
+ in->data[1], in->linesize[1], |
|
+ out->data, out->linesize, |
|
+ (NppiSize){ in->width, in->height }, |
|
+ nppStreamCtx); |
|
break; |
|
default: |
|
return AVERROR_BUG; |
|
@@ -721,18 +731,26 @@ static int nppscale_resize(AVFilterContext *ctx, NPPScaleStageContext *stage, |
|
NppStatus err; |
|
int i; |
|
|
|
+ CUstream cu_stream = 0; |
|
+ |
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ nppStreamCtx.hStream = cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = 0; |
|
+ nppStreamCtx.nMultiProcessorCount = 0; |
|
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0; |
|
+ |
|
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) { |
|
int iw = stage->planes_in[i].width; |
|
int ih = stage->planes_in[i].height; |
|
int ow = stage->planes_out[i].width; |
|
int oh = stage->planes_out[i].height; |
|
|
|
- err = nppiResizeSqrPixel_8u_C1R(in->data[i], (NppiSize){ iw, ih }, |
|
- in->linesize[i], (NppiRect){ 0, 0, iw, ih }, |
|
- out->data[i], out->linesize[i], |
|
- (NppiRect){ 0, 0, ow, oh }, |
|
- (double)ow / iw, (double)oh / ih, |
|
- 0.0, 0.0, s->interp_algo); |
|
+ err = nppiResizeSqrPixel_8u_C1R_Ctx(in->data[i], (NppiSize){ iw, ih }, |
|
+ in->linesize[i], (NppiRect){ 0, 0, iw, ih }, |
|
+ out->data[i], out->linesize[i], |
|
+ (NppiRect){ 0, 0, ow, oh }, |
|
+ (double)ow / iw, (double)oh / ih, |
|
+ 0.0, 0.0, s->interp_algo, nppStreamCtx); |
|
if (err != NPP_SUCCESS) { |
|
av_log(ctx, AV_LOG_ERROR, "NPP resize error: %d\n", err); |
|
return AVERROR_UNKNOWN; |
|
@@ -748,13 +766,22 @@ static int nppscale_interleave(AVFilterContext *ctx, NPPScaleStageContext *stage |
|
AVHWFramesContext *out_frames_ctx = (AVHWFramesContext*)out->hw_frames_ctx->data; |
|
NppStatus err; |
|
|
|
+ int device_id = 0; |
|
+ cudaGetDevice(&device_id); |
|
+ CUstream cu_stream = 0; |
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ nppStreamCtx.hStream = cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = device_id; |
|
+ |
|
switch (out_frames_ctx->sw_format) { |
|
case AV_PIX_FMT_NV12: |
|
- err = nppiYCbCr420_8u_P3P2R((const uint8_t**)in->data, |
|
- in->linesize, |
|
- out->data[0], out->linesize[0], |
|
- out->data[1], out->linesize[1], |
|
- (NppiSize){ in->width, in->height }); |
|
+ err = nppiYCbCr420_8u_P3P2R_Ctx( |
|
+ (const uint8_t**)in->data, |
|
+ in->linesize, |
|
+ out->data[0], out->linesize[0], |
|
+ out->data[1], out->linesize[1], |
|
+ (NppiSize){ in->width, in->height }, |
|
+ nppStreamCtx); |
|
break; |
|
default: |
|
return AVERROR_BUG; |
|
diff --git a/libavfilter/vf_sharpen_npp.c b/libavfilter/vf_sharpen_npp.c |
|
index c7769f5837..66ad86df06 100644 |
|
--- a/libavfilter/vf_sharpen_npp.c |
|
+++ b/libavfilter/vf_sharpen_npp.c |
|
@@ -161,13 +161,24 @@ static int nppsharpen_sharpen(AVFilterContext* ctx, AVFrame* out, AVFrame* in) |
|
|
|
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(in_ctx->sw_format); |
|
|
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ CUcontext cu_ctx; |
|
+ CUstream cu_stream = 0; |
|
+ |
|
+ cu_ctx = ((AVCUDADeviceContext*)in_ctx->device_ctx->hwctx)->cuda_ctx; |
|
+ |
|
+ nppStreamCtx.hStream = (CUstream)cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = 0; |
|
+ nppStreamCtx.nMultiProcessorCount = 0; |
|
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0; |
|
+ |
|
for (int i = 0; i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) { |
|
int ow = AV_CEIL_RSHIFT(in->width, (i == 1 || i == 2) ? desc->log2_chroma_w : 0); |
|
int oh = AV_CEIL_RSHIFT(in->height, (i == 1 || i == 2) ? desc->log2_chroma_h : 0); |
|
|
|
- NppStatus err = nppiFilterSharpenBorder_8u_C1R( |
|
+ NppStatus err = nppiFilterSharpenBorder_8u_C1R_Ctx( |
|
in->data[i], in->linesize[i], (NppiSize){ow, oh}, (NppiPoint){0, 0}, |
|
- out->data[i], out->linesize[i], (NppiSize){ow, oh}, s->border_type); |
|
+ out->data[i], out->linesize[i], (NppiSize){ow, oh}, s->border_type, nppStreamCtx); |
|
if (err != NPP_SUCCESS) { |
|
av_log(ctx, AV_LOG_ERROR, "NPP sharpen error: %d\n", err); |
|
return AVERROR_EXTERNAL; |
|
diff --git a/libavfilter/vf_transpose_npp.c b/libavfilter/vf_transpose_npp.c |
|
index e781d7c58b..f70dfb62f1 100644 |
|
--- a/libavfilter/vf_transpose_npp.c |
|
+++ b/libavfilter/vf_transpose_npp.c |
|
@@ -296,6 +296,14 @@ static int npptranspose_rotate(AVFilterContext *ctx, NPPTransposeStageContext *s |
|
NppStatus err; |
|
int i; |
|
|
|
+ CUstream cu_stream = 0; |
|
+ |
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ nppStreamCtx.hStream = cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = 0; |
|
+ nppStreamCtx.nMultiProcessorCount = 0; |
|
+ nppStreamCtx.nMaxThreadsPerMultiProcessor = 0; |
|
+ |
|
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) { |
|
int iw = stage->planes_in[i].width; |
|
int ih = stage->planes_in[i].height; |
|
@@ -309,11 +317,12 @@ static int npptranspose_rotate(AVFilterContext *ctx, NPPTransposeStageContext *s |
|
int shiftw = (s->dir == NPP_TRANSPOSE_CLOCK || s->dir == NPP_TRANSPOSE_CLOCK_FLIP) ? ow - 1 : 0; |
|
int shifth = (s->dir == NPP_TRANSPOSE_CCLOCK || s->dir == NPP_TRANSPOSE_CLOCK_FLIP) ? oh - 1 : 0; |
|
|
|
- err = nppiRotate_8u_C1R(in->data[i], (NppiSize){ iw, ih }, |
|
- in->linesize[i], (NppiRect){ 0, 0, iw, ih }, |
|
- out->data[i], out->linesize[i], |
|
- (NppiRect){ 0, 0, ow, oh }, |
|
- angle, shiftw, shifth, NPPI_INTER_NN); |
|
+ err = nppiRotate_8u_C1R_Ctx( |
|
+ in->data[i], (NppiSize){ iw, ih }, |
|
+ in->linesize[i], (NppiRect){ 0, 0, iw, ih }, |
|
+ out->data[i], out->linesize[i], |
|
+ (NppiRect){ 0, 0, ow, oh }, |
|
+ angle, shiftw, shifth, NPPI_INTER_NN, nppStreamCtx); |
|
if (err != NPP_SUCCESS) { |
|
av_log(ctx, AV_LOG_ERROR, "NPP rotate error: %d\n", err); |
|
return AVERROR_UNKNOWN; |
|
@@ -329,13 +338,22 @@ static int npptranspose_transpose(AVFilterContext *ctx, NPPTransposeStageContext |
|
NppStatus err; |
|
int i; |
|
|
|
+ int device_id = 0; |
|
+ cudaGetDevice(&device_id); |
|
+ CUstream cu_stream = 0; |
|
+ NppStreamContext nppStreamCtx = {0}; |
|
+ nppStreamCtx.hStream = cu_stream; |
|
+ nppStreamCtx.nCudaDeviceId = device_id; |
|
+ |
|
for (i = 0; i < FF_ARRAY_ELEMS(stage->planes_in) && i < FF_ARRAY_ELEMS(in->data) && in->data[i]; i++) { |
|
int iw = stage->planes_in[i].width; |
|
int ih = stage->planes_in[i].height; |
|
|
|
- err = nppiTranspose_8u_C1R(in->data[i], in->linesize[i], |
|
- out->data[i], out->linesize[i], |
|
- (NppiSize){ iw, ih }); |
|
+ err = nppiTranspose_8u_C1R_Ctx( |
|
+ in->data[i], in->linesize[i], |
|
+ out->data[i], out->linesize[i], |
|
+ (NppiSize){ iw, ih }, |
|
+ nppStreamCtx); |
|
if (err != NPP_SUCCESS) { |
|
av_log(ctx, AV_LOG_ERROR, "NPP transpose error: %d\n", err); |
|
return AVERROR_UNKNOWN; |