@@ -38,9 +38,10 @@ typedef struct NLMeansVulkanContext {
     VkSampler sampler;
 
     AVBufferPool *integral_buf_pool;
-    AVBufferPool *state_buf_pool;
     AVBufferPool *ws_buf_pool;
 
+    FFVkBuffer xyoffsets_buf;
+
     int pl_weights_rows;
     FFVulkanPipeline pl_weights;
    FFVkSPIRVShader shd_weights;
@@ -66,107 +67,97 @@ typedef struct NLMeansVulkanContext {
 
-extern const char *ff_source_prefix_sum_comp;
-
-static void insert_first(FFVkSPIRVShader *shd, int r, int horiz, int plane, int comp)
+static void insert_first(FFVkSPIRVShader *shd, int r, const char *off, int horiz, int plane, int comp)
 {
-    GLSLF(2, s1 = texture(input_img[%i], ivec2(x + %i, y + %i))[%i];
-             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-
-    if (TYPE_ELEMS == 4) {
-        GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i];
-                 ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i];
-                 ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i];
-                 ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-        GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i];
-                 ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
-    } else {
-        for (int i = 0; i < 16; i++) {
-            GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i];
-                     ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp);
-        }
-    }
-
-    GLSLC(2, s2 = (s1 - s2) * (s1 - s2); );
+    GLSLF(4, s1 = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
+             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
+             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
+             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
+             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
+             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
+
+    GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
 }
 
 static void insert_horizontal_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
 {
-    GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
-    if (!first) {
-        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                                gl_StorageSemanticsBuffer,
-                                gl_SemanticsAcquireRelease |
-                                gl_SemanticsMakeAvailable |
-                                gl_SemanticsMakeVisible); );
-    }
-    GLSLF(1, for (y = 0; y < height[%i]; y++) { ,plane);
-    GLSLC(2, offset = uint64_t(int_stride)*y*T_ALIGN; );
-    GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + offset); );
-    GLSLC(0, );
-    if (first) {
-        for (int r = 0; r < nb_rows; r++) {
-            insert_first(shd, r, 1, plane, comp);
-            GLSLF(2, dst.v[x + %i] = s2; ,r);
-            GLSLC(0, );
-        }
-    }
-    GLSLC(2, barrier(); );
-    GLSLC(2, prefix_sum(dst, 1, dst, 1); );
-    GLSLC(1, } );
-    GLSLC(0, );
+    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+    if (!first)
+        GLSLC(1, barrier(); );
+    GLSLC(0, );
+    GLSLF(1, if (pos.y < height[%i]) { ,plane);
+    GLSLC(2, #pragma unroll(1) );
+    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+    GLSLC(3, prefix_sum = DTYPE(0); );
+    GLSLC(3, offset = uint64_t(int_stride)*(pos.y + r)*T_ALIGN; );
+    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
+    GLSLC(0, );
+    GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
+    if (first)
+        insert_first(shd, 0, "r", 0, plane, comp);
+    else
+        GLSLC(4, s2 = dst.v[pos.x]; );
+    GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
+    GLSLC(4, prefix_sum += s2; );
+    GLSLC(3, } );
+    GLSLC(2, } );
+    GLSLC(1, } );
+    GLSLC(0, );
 }
 
 static void insert_vertical_pass(FFVkSPIRVShader *shd, int nb_rows, int first, int plane, int comp)
 {
-    GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
-    if (!first) {
-        GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                                gl_StorageSemanticsBuffer,
-                                gl_SemanticsAcquireRelease |
-                                gl_SemanticsMakeAvailable |
-                                gl_SemanticsMakeVisible); );
-    }
-    GLSLF(1, for (x = 0; x < width[%i]; x++) { ,plane);
-    GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN); );
-
-    for (int r = 0; r < nb_rows; r++) {
-        if (first) {
-            insert_first(shd, r, 0, plane, comp);
-            GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2; ,r);
-            GLSLC(0, );
-        }
-    }
-
-    GLSLC(2, barrier(); );
-    GLSLC(2, prefix_sum(dst, int_stride, dst, int_stride); );
-    GLSLC(1, } );
-    GLSLC(0, );
+    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
+    GLSLC(1, #pragma unroll(1) );
+    GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
+    GLSLC(2, psum[r] = DTYPE(0); );
+    GLSLC(0, );
+    if (!first)
+        GLSLC(1, barrier(); );
+    GLSLC(0, );
+    GLSLF(1, if (pos.x < width[%i]) { ,plane);
+    GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
+    GLSLC(3, offset = uint64_t(int_stride)*pos.y*T_ALIGN; );
+    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
+    GLSLC(0, );
+    GLSLC(3, #pragma unroll(1) );
+    GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
+    if (first)
+        insert_first(shd, 0, "r", 1, plane, comp);
+    else
+        GLSLC(4, s2 = dst.v[pos.x + r]; );
+    GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
+    GLSLC(4, psum[r] += s2; );
+    GLSLC(3, } );
+    GLSLC(2, } );
+    GLSLC(1, } );
+    GLSLC(0, );
 }
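
Note: both passes above now emit the same serial scan inline instead of calling the removed prefix_sum() library helper. As a reference, here is a minimal scalar C sketch of what the emitted GLSL loop computes per row; the names are illustrative only and not part of the patch. The vertical pass does the same along y, with one accumulator per handled row (psum[r]).

/* Replace each element of a row with the inclusive prefix sum of the
 * squared differences the first pass stored there. */
static void row_inclusive_prefix_sum(float *row, int width)
{
    float prefix_sum = 0.f;
    for (int x = 0; x < width; x++) {
        float s2 = row[x];          /* s2 = dst.v[pos.x];              */
        row[x] = s2 + prefix_sum;   /* dst.v[pos.x] = s2 + prefix_sum; */
        prefix_sum += s2;           /* prefix_sum += s2;               */
    }
}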
 
 static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
                                 int t, int dst_comp, int plane, int comp)
 {
-    GLSLF(1, p = patch_size[%i]; ,dst_comp);
+    GLSLF(1, p = patch_size[%i]; ,dst_comp);
     GLSLC(0, );
-    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
-                            gl_StorageSemanticsBuffer,
-                            gl_SemanticsAcquireRelease |
-                            gl_SemanticsMakeAvailable |
-                            gl_SemanticsMakeVisible); );
     GLSLC(1, barrier(); );
     GLSLC(0, );
     if (!vert) {
-        GLSLF(1, for (y = 0; y < height[%i]; y++) { ,plane);
+        GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
         GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
         GLSLC(3, break; );
-        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
-        GLSLF(3, x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+        GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
     } else {
-        GLSLF(1, for (x = 0; x < width[%i]; x++) { ,plane);
+        GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
         GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
         GLSLC(3, break; );
-        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
-        GLSLF(3, y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
+        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
+        GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
     }
     GLSLC(0, );
     GLSLC(3, a = DTYPE(0); );
@@ -174,25 +165,25 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
     GLSLC(3, b = DTYPE(0); );
     GLSLC(3, c = DTYPE(0); );
     GLSLC(3, d = DTYPE(0); );
     GLSLC(0, );
-    GLSLC(3, lt = ((x - p) < 0) || ((y - p) < 0); );
+    GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
     GLSLC(0, );
     if (TYPE_ELEMS == 4) {
-        GLSLF(3, src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i]; ,plane, comp);
-        GLSLF(3, src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i]; ,plane, comp);
-        GLSLF(3, src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i]; ,plane, comp);
-        GLSLF(3, src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i]; ,plane, comp);
+        GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
+        GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
+        GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
+        GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
     } else {
         for (int i = 0; i < 16; i++)
-            GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i];
-                     ,i / 4, i % 4, plane, i, i, comp);
+            GLSLF(3, src[%i][%i] = texture(input_img[%i], pos + offs[%i])[%i];
+                     ,i / 4, i % 4, plane, i, comp);
     }
     GLSLC(0, );
     GLSLC(3, if (lt == false) { );
-    GLSLC(4, a = integral_data.v[(y - p)*int_stride + x - p]; );
-    GLSLC(4, c = integral_data.v[(y - p)*int_stride + x + p]; );
-    GLSLC(4, b = integral_data.v[(y + p)*int_stride + x - p]; );
-    GLSLC(4, d = integral_data.v[(y + p)*int_stride + x + p]; );
+    GLSLC(4, a = integral_data.v[(pos.y - p)*int_stride + pos.x - p]; );
+    GLSLC(4, c = integral_data.v[(pos.y - p)*int_stride + pos.x + p]; );
+    GLSLC(4, b = integral_data.v[(pos.y + p)*int_stride + pos.x - p]; );
+    GLSLC(4, d = integral_data.v[(pos.y + p)*int_stride + pos.x + p]; );
     GLSLC(3, } );
     GLSLC(0, );
     GLSLC(3, patch_diff = d + a - b - c; );
@@ -212,27 +203,26 @@ static void insert_weights_pass(FFVkSPIRVShader *shd, int nb_rows, int vert,
     }
     GLSLC(0, );
     if (t > 1) {
-        GLSLF(3, atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum); ,dst_comp, dst_comp);
-        GLSLF(3, atomicAdd(sums_%i[y*ws_stride[%i] + x], sum); ,dst_comp, dst_comp);
+        GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
+        GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
     } else {
-        GLSLF(3, weights_%i[y*ws_stride[%i] + x] += w_sum; ,dst_comp, dst_comp);
-        GLSLF(3, sums_%i[y*ws_stride[%i] + x] += sum; ,dst_comp, dst_comp);
+        GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
+        GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
     }
     GLSLC(2, } );
     GLSLC(1, } );
 }
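
The a/b/c/d taps above are the classic summed-area-table lookup: after the two scan passes, integral_data holds 2D inclusive prefix sums of squared differences, so the total over one patch costs four reads. A hypothetical scalar helper showing the identity behind patch_diff:

static float patch_ssd(const float *integral, int int_stride, int x, int y, int p)
{
    float a = integral[(y - p)*int_stride + x - p]; /* top-left     */
    float c = integral[(y - p)*int_stride + x + p]; /* top-right    */
    float b = integral[(y + p)*int_stride + x - p]; /* bottom-left  */
    float d = integral[(y + p)*int_stride + x + p]; /* bottom-right */
    /* sum of squared differences over the box (x-p, x+p] x (y-p, y+p] */
    return d + a - b - c;
}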
 
 typedef struct HorizontalPushData {
-    VkDeviceAddress integral_data;
-    VkDeviceAddress state_data;
-    int32_t xoffs[TYPE_ELEMS];
-    int32_t yoffs[TYPE_ELEMS];
     uint32_t width[4];
     uint32_t height[4];
     uint32_t ws_stride[4];
     int32_t patch_size[4];
     float strength[4];
+    VkDeviceAddress integral_base;
+    uint32_t integral_size;
     uint32_t int_stride;
+    uint32_t xyoffs_start;
 } HorizontalPushData;
 
 static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
@@ -249,26 +239,18 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     FFVulkanDescriptorSetBinding *desc_set;
     int max_dim = FFMAX(width, height);
     uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
-    int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize;
     int wg_size, wg_rows;
 
     /* Round the max workgroup size to the previous power of two */
     max_wg = 1 << (31 - ff_clz(max_wg));
     wg_size = max_wg;
     wg_rows = 1;
 
     if (max_wg > max_dim) {
-        wg_size = max_wg / (max_wg / max_dim);
+        wg_size = max_dim;
     } else if (max_wg < max_dim) {
-        /* First, make it fit */
+        /* Make it fit */
         while (wg_size*wg_rows < max_dim)
             wg_rows++;
-
-        /* Second, make sure there's enough shared memory */
-        while ((wg_size * TYPE_SIZE + TYPE_SIZE + 2*4) > max_shm) {
-            wg_size >>= 1;
-            wg_rows++;
-        }
     }
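
With the shared-memory constraint gone, the sizing logic reduces to the sketch below; ff_clz() is FFmpeg's count-leading-zeros helper from libavutil, and the function name here is hypothetical:

static void pick_wg_geometry(uint32_t max_wg, int max_dim,
                             int *wg_size, int *wg_rows)
{
    max_wg = 1 << (31 - ff_clz(max_wg)); /* previous power of two */
    *wg_size = max_wg;
    *wg_rows = 1;
    if (max_wg > max_dim) {
        *wg_size = max_dim;                   /* one invocation per line   */
    } else if (max_wg < max_dim) {
        while (*wg_size * *wg_rows < max_dim) /* make it fit               */
            (*wg_rows)++;                     /* more rows per invocation  */
    }
}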
 
     RET(ff_vk_shader_init(pl, shd, "nlmeans_weights", VK_SHADER_STAGE_COMPUTE_BIT, 0));
 
@@ -278,33 +260,24 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     if (t > 1)
         GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
     GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
-    GLSLC(0, #pragma use_vulkan_memory_model );
-    GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable );
     GLSLC(0, );
     GLSLF(0, #define N_ROWS %i ,*nb_rows);
     GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x) );
-    GLSLF(0, #define LG_WG_SIZE %i ,ff_log2(shd->local_size[0]));
-    GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE) );
-    GLSLF(0, #define DTYPE %s ,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
+    GLSLF(0, #define DTYPE %s ,TYPE_NAME);
+    GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
     GLSLC(0, );
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer { );
+    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
     GLSLC(1, DTYPE v[]; );
     GLSLC(0, }; );
     GLSLC(0, );
-    GLSLC(0, layout(buffer_reference) buffer StateData; );
-    GLSLC(0, );
     GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
-    GLSLC(1, coherent DataBuffer integral_data; );
-    GLSLC(1, StateData state; );
-    GLSLF(1, uint xoffs[%i]; ,TYPE_ELEMS);
-    GLSLF(1, uint yoffs[%i]; ,TYPE_ELEMS);
     GLSLC(1, uvec4 width; );
     GLSLC(1, uvec4 height; );
     GLSLC(1, uvec4 ws_stride; );
     GLSLC(1, ivec4 patch_size; );
     GLSLC(1, vec4 strength; );
+    GLSLC(1, DataBuffer integral_base; );
+    GLSLC(1, uint integral_size; );
     GLSLC(1, uint int_stride; );
+    GLSLC(1, uint xyoffs_start; );
     GLSLC(0, }; );
     GLSLC(0, );
@@ -370,42 +343,65 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
     };
     RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));
 
-    GLSLD( ff_source_prefix_sum_comp );
-    GLSLC(0, );
-    GLSLC(0, void main() );
-    GLSLC(0, { );
-    GLSLC(1, uint64_t offset; );
-    GLSLC(1, DataBuffer dst; );
-    GLSLC(1, float s1; );
-    GLSLC(1, DTYPE s2; );
-    GLSLC(1, int r; );
-    GLSLC(1, int x; );
-    GLSLC(1, int y; );
-    GLSLC(1, int p; );
-    GLSLC(0, );
-    GLSLC(1, DTYPE a; );
-    GLSLC(1, DTYPE b; );
-    GLSLC(1, DTYPE c; );
-    GLSLC(1, DTYPE d; );
-    GLSLC(0, );
-    GLSLC(1, DTYPE patch_diff; );
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "xyoffsets_buffer",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .mem_quali   = "readonly",
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "ivec2 xyoffsets[];",
+        },
+    };
+    RET(ff_vk_pipeline_descriptor_set_add(vkctx, pl, shd, desc_set, 1, 1, 0));
+
+    GLSLC(0, );
+    GLSLC(0, void main() );
+    GLSLC(0, { );
+    GLSLC(1, uint64_t offset; );
+    GLSLC(1, DataBuffer dst; );
+    GLSLC(1, float s1; );
+    GLSLC(1, DTYPE s2; );
+    GLSLC(1, DTYPE prefix_sum; );
+    GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
+    GLSLC(1, int r; );
+    GLSLC(1, ivec2 pos; );
+    GLSLC(1, int p; );
+    GLSLC(0, );
+    GLSLC(1, DataBuffer integral_data; );
+    GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
+    GLSLC(0, );
+    GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
+
+    GLSLC(1, offset = uint64_t(integral_size)*invoc_idx; );
+    GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); );
+
+    GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
+    for (int i = 0; i < TYPE_ELEMS*2; i += 2)
+        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + 2*%i*invoc_idx + %i]; ,i/2,TYPE_ELEMS,i);
+    GLSLC(0, );
+    GLSLC(1, DTYPE a; );
+    GLSLC(1, DTYPE b; );
+    GLSLC(1, DTYPE c; );
+    GLSLC(1, DTYPE d; );
+    GLSLC(0, );
+    GLSLC(1, DTYPE patch_diff; );
     if (TYPE_ELEMS == 4) {
-        GLSLC(1, vec4 src; );
-        GLSLC(1, vec4 w; );
+        GLSLC(1, vec4 src; );
+        GLSLC(1, vec4 w; );
     } else {
-        GLSLC(1, vec4 src[4]; );
-        GLSLC(1, vec4 w[4]; );
+        GLSLC(1, vec4 src[4]; );
+        GLSLC(1, vec4 w[4]; );
     }
-    GLSLC(1, float w_sum; );
-    GLSLC(1, float sum; );
-    GLSLC(0, );
-    GLSLC(1, bool lt; );
-    GLSLC(1, bool gt; );
-    GLSLC(0, );
+    GLSLC(1, float w_sum; );
+    GLSLC(1, float sum; );
+    GLSLC(0, );
+    GLSLC(1, bool lt; );
+    GLSLC(1, bool gt; );
+    GLSLC(0, );
 
     for (int i = 0; i < desc->nb_components; i++) {
         int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
-        if (width > height) {
+        if (width >= height) {
             insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
             insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
             insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
@@ -416,7 +412,7 @@ static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *e
         }
     }
 
-    GLSLC(0, } );
+    GLSLC(0, } );
 
     RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
     RET(ff_vk_shader_create(vkctx, shd, spv_data, spv_len, "main"));
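
Instead of the CPU rebasing buffer addresses per dispatch, each workgroup now derives its own slice in the shader from integral_base and gl_WorkGroupID.z. A hypothetical CPU-side mirror of that address arithmetic, for reference:

#include <stdint.h>

/* One scratch integral image per parallel workgroup along z. */
static uint64_t integral_slice(uint64_t integral_base, uint32_t integral_size,
                               uint32_t invoc_idx)
{
    return integral_base + (uint64_t)integral_size * invoc_idx;
}

/* Index of the i-th offset component for a given workgroup, mirroring
 * "xyoffsets[xyoffs_start + 2*TYPE_ELEMS*invoc_idx + i]" in the shader. */
static uint32_t xyoffs_index(uint32_t xyoffs_start, uint32_t type_elems,
                             uint32_t invoc_idx, uint32_t i)
{
    return xyoffs_start + 2*type_elems*invoc_idx + i;
}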
 
@@ -584,6 +580,8 @@ static av_cold int init_filter(AVFilterContext *ctx)
     FFVulkanContext *vkctx = &s->vkctx;
     const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
     FFVkSPIRVCompiler *spv;
+    int *offsets_buf;
+    int offsets_dispatched = 0, nb_dispatches = 0;
 
     const AVPixFmtDescriptor *desc;
     desc = av_pix_fmt_desc_get(vkctx->output_format);
@@ -634,6 +632,20 @@ static av_cold int init_filter(AVFilterContext *ctx)
         }
     }
 
+    RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL,
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
+                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0));
+
+    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
+        offsets_buf[i + 0] = s->xoffsets[i >> 1];
+        offsets_buf[i + 1] = s->yoffsets[i >> 1];
+    }
+
+    RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));
+
     s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
     if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
         av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
@@ -641,11 +653,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
         s->opts.t = 1;
     }
 
-    if (!vkctx->feats_12.vulkanMemoryModel) {
-        av_log(ctx, AV_LOG_ERROR, "Device doesn't support the Vulkan memory model!");
-        return AVERROR(EINVAL);;
-    }
-
     spv = ff_vk_spirv_init();
     if (!spv) {
         av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
@ -663,8 +670,19 @@ static av_cold int init_filter(AVFilterContext *ctx) |
|
|
|
|
RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler, |
|
|
|
|
spv, desc, planes)); |
|
|
|
|
|
|
|
|
|
av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches, %i parallel\n", |
|
|
|
|
s->nb_offsets, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS) + 1, s->opts.t); |
|
|
|
|
RET(ff_vk_set_descriptor_buffer(&s->vkctx, &s->pl_weights, NULL, 1, 0, 0, |
|
|
|
|
s->xyoffsets_buf.address, s->xyoffsets_buf.size, |
|
|
|
|
VK_FORMAT_UNDEFINED)); |
|
|
|
|
|
|
|
|
|
do { |
|
|
|
|
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); |
|
|
|
|
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]); |
|
|
|
|
offsets_dispatched += wg_invoc * TYPE_ELEMS; |
|
|
|
|
nb_dispatches++; |
|
|
|
|
} while (offsets_dispatched < s->nb_offsets); |
|
|
|
|
|
|
|
|
|
av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n", |
|
|
|
|
s->nb_offsets, nb_dispatches); |
|
|
|
|
|
|
|
|
|
s->initialized = 1; |
|
|
|
|
|
|
|
|
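
The do/while above only pre-counts the dispatches for the log message; filter_frame later runs the same loop for real. A standalone sketch of the count, assuming (as the loop itself does) that nb_offsets is a positive multiple of TYPE_ELEMS so wg_invoc never reaches zero:

static int count_dispatches(int nb_offsets, int t, int max_wg_z)
{
    int offsets_dispatched = 0, nb_dispatches = 0;
    do {
        int wg_invoc = FFMIN((nb_offsets - offsets_dispatched)/TYPE_ELEMS, t);
        wg_invoc = FFMIN(wg_invoc, max_wg_z); /* maxComputeWorkGroupCount[2] */
        offsets_dispatched += wg_invoc * TYPE_ELEMS;
        nb_dispatches++;
    } while (offsets_dispatched < nb_offsets);
    return nb_dispatches;
}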
@@ -736,18 +754,16 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     int plane_widths[4];
     int plane_heights[4];
 
+    int offsets_dispatched = 0;
+
     /* Integral */
-    AVBufferRef *state_buf;
-    FFVkBuffer *state_vk;
-    AVBufferRef *integral_buf;
+    AVBufferRef *integral_buf = NULL;
     FFVkBuffer *integral_vk;
     uint32_t int_stride;
     size_t int_size;
-    size_t state_size;
-    int t_offset = 0;
 
     /* Weights/sums */
-    AVBufferRef *ws_buf;
+    AVBufferRef *ws_buf = NULL;
     FFVkBuffer *ws_vk;
     VkDeviceAddress weights_addr[4];
     VkDeviceAddress sums_addr[4];
@@ -773,7 +789,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     /* Integral image */
     int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows;
     int_size = int_stride * int_stride * TYPE_SIZE;
-    state_size = int_stride * 3 *TYPE_SIZE;
 
     /* Plane dimensions */
     for (int i = 0; i < desc->nb_components; i++) {
@@ -798,16 +813,6 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
         return err;
     integral_vk = (FFVkBuffer *)integral_buf->data;
 
-    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->state_buf_pool, &state_buf,
-                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                  NULL,
-                                  s->opts.t * state_size,
-                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
-    if (err < 0)
-        return err;
-    state_vk = (FFVkBuffer *)state_buf->data;
-
     err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
                                   VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                   VK_BUFFER_USAGE_TRANSFER_DST_BIT |
@@ -844,9 +849,12 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
                                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                  VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
 
     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
-    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &state_buf, 1, 0));
+    integral_buf = NULL;
+
     RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf, 1, 0));
+    ws_buf = NULL;
 
     /* Input frame prep */
     RET(ff_vk_create_imageviews(vkctx, exec, in_views, in));
@@ -869,6 +877,7 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
+    nb_buf_bar = 0;
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
         .srcStageMask = ws_vk->stage,
@@ -881,6 +890,19 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
         .size = ws_vk->size,
         .offset = 0,
     };
+    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+        .srcStageMask = integral_vk->stage,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = integral_vk->access,
+        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .buffer = integral_vk->buf,
+        .size = integral_vk->size,
+        .offset = 0,
+    };
 
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
@@ -891,10 +913,13 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     });
     ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
+    integral_vk->stage = buf_bar[1].dstStageMask;
+    integral_vk->access = buf_bar[1].dstAccessMask;
 
-    /* Weights/sums buffer zeroing */
+    /* Buffer zeroing */
     vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
 
     nb_buf_bar = 0;
     buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
         .srcStageMask = ws_vk->stage,
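
Both barrier blocks above follow one pattern: source the buffer's last recorded stage/access masks, barrier against compute read/write, then record the new masks on the tracked buffer so the next barrier chains off this one. A condensed, hypothetical helper (FFVkBuffer fields as used in this file):

static VkBufferMemoryBarrier2 compute_rw_barrier(FFVkBuffer *buf)
{
    VkBufferMemoryBarrier2 bar = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = buf->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = buf->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = buf->buf,
        .size = buf->size,
        .offset = 0,
    };
    /* Record the new state for the next submission to chain against. */
    buf->stage = bar.dstStageMask;
    buf->access = bar.dstAccessMask;
    return bar;
}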
@@ -948,29 +973,22 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     /* Weights pipeline */
     ff_vk_exec_bind_pipeline(vkctx, exec, &s->pl_weights);
 
-    for (int i = 0; i < s->nb_offsets; i += TYPE_ELEMS) {
-        int *xoffs = s->xoffsets + i;
-        int *yoffs = s->yoffsets + i;
+    do {
+        int wg_invoc;
         HorizontalPushData pd = {
-            integral_vk->address + t_offset*int_size,
-            state_vk->address + t_offset*state_size,
-            { 0 },
-            { 0 },
             { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
             { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
             { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
             { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
             { s->strength[0], s->strength[1], s->strength[2], s->strength[2], },
+            integral_vk->address,
+            int_size,
             int_stride,
+            offsets_dispatched * 2,
        };
 
-        memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs));
-        memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs));
-
-        /* Put a barrier once we run out of parallelism buffers */
-        if (!t_offset) {
+        if (offsets_dispatched) {
             nb_buf_bar = 0;
+            /* Buffer prep/sync */
             buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
                 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                 .srcStageMask = integral_vk->stage,
@@ -984,39 +1002,28 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
                 .size = integral_vk->size,
                 .offset = 0,
             };
-            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-                .srcStageMask = state_vk->stage,
-                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-                .srcAccessMask = state_vk->access,
-                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .buffer = state_vk->buf,
-                .size = state_vk->size,
-                .offset = 0,
-            };
 
             vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                 .pBufferMemoryBarriers = buf_bar,
                 .bufferMemoryBarrierCount = nb_buf_bar,
             });
-            integral_vk->stage = buf_bar[0].dstStageMask;
-            integral_vk->access = buf_bar[0].dstAccessMask;
-            state_vk->stage = buf_bar[1].dstStageMask;
-            state_vk->access = buf_bar[1].dstAccessMask;
+            integral_vk->stage = buf_bar[1].dstStageMask;
+            integral_vk->access = buf_bar[1].dstAccessMask;
         }
-        t_offset = (t_offset + 1) % s->opts.t;
 
         /* Push data */
         ff_vk_update_push_exec(vkctx, exec, &s->pl_weights, VK_SHADER_STAGE_COMPUTE_BIT,
                                0, sizeof(pd), &pd);
+        wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
+        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
 
         /* End of horizontal pass */
-        vk->CmdDispatch(exec->buf, 1, 1, 1);
-    }
+        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
+
+        offsets_dispatched += wg_invoc * TYPE_ELEMS;
+    } while (offsets_dispatched < s->nb_offsets);
 
     RET(denoise_pass(s, exec, ws_vk, ws_stride));
@@ -1033,6 +1040,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
     return ff_filter_frame(outlink, out);
 
 fail:
+    av_buffer_unref(&integral_buf);
+    av_buffer_unref(&ws_buf);
     av_frame_free(&in);
     av_frame_free(&out);
     return err;
@@ -1051,7 +1060,6 @@ static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
     ff_vk_shader_free(vkctx, &s->shd_denoise);
 
     av_buffer_pool_uninit(&s->integral_buf_pool);
-    av_buffer_pool_uninit(&s->state_buf_pool);
     av_buffer_pool_uninit(&s->ws_buf_pool);
 
     if (s->sampler)