mirror of https://github.com/FFmpeg/FFmpeg.git
parent dfff3877b7
commit 160a415e22
6 changed files with 1278 additions and 0 deletions
@@ -0,0 +1,151 @@
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_buffer_reference2 : require

#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease

// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0u
#define FLAG_AGGREGATE_READY 1u
#define FLAG_PREFIX_READY 2u

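// One StateData record exists per partition: `aggregate` holds the sum of the
// partition's own elements, `prefix` holds the inclusive prefix over all
// elements up to and including this partition, and `flag` publishes which of
// the two is valid (the X, A, P states above).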
layout(buffer_reference, buffer_reference_align = T_ALIGN) nonprivate buffer StateData {
    DTYPE aggregate;
    DTYPE prefix;
    uint flag;
};

shared DTYPE sh_scratch[WG_SIZE];
shared DTYPE sh_prefix;
shared uint sh_part_ix;
shared uint sh_flag;

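// DTYPE, T_ALIGN, WG_SIZE, LG_WG_SIZE, N_ROWS, PARTITION_SIZE, DataBuffer and
// the `state` pointer are not defined in this file; they are expected to be
// provided by the shader that includes it. A minimal sketch of such an
// instantiation (names and values here are assumptions for illustration, not
// this commit's actual configuration) could look like:
//
//     #define DTYPE uint
//     #define T_ALIGN 4
//     #define WG_SIZE 256
//     #define LG_WG_SIZE 8                          /* log2(WG_SIZE) */
//     #define N_ROWS 4                              /* elements per invocation */
//     #define PARTITION_SIZE (WG_SIZE * N_ROWS)     /* elements per workgroup */
//
//     layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer {
//         DTYPE v[];
//     };
//
//     layout(push_constant) uniform PushData {
//         StateData state;    /* one record per partition */
//         DataBuffer src;
//         DataBuffer dst;
//     };
//
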
void prefix_sum(DataBuffer dst, uint dst_stride, DataBuffer src, uint src_stride)
{
    DTYPE local[N_ROWS];
    // Determine the partition to process; the paper (Section 4.4) does this
    // with an atomic counter, here the workgroup ID is used directly.
    if (gl_LocalInvocationID.x == 0)
        sh_part_ix = gl_WorkGroupID.x;
    // sh_part_ix = atomicAdd(part_counter, 1);

    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;

    // TODO: gate buffer read? (evaluate whether shader check or CPU-side padding is better)
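    // Each invocation loads N_ROWS consecutive elements and accumulates a
    // running (inclusive) sum over them in registers.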
    local[0] = src.v[ix*src_stride];
    for (uint i = 1; i < N_ROWS; i++)
        local[i] = local[i - 1] + src.v[(ix + i)*src_stride];

    DTYPE agg = local[N_ROWS - 1];
    sh_scratch[gl_LocalInvocationID.x] = agg;
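    // Hillis-Steele style inclusive scan over the per-invocation aggregates in
    // shared memory; after LG_WG_SIZE rounds, each invocation's agg holds the
    // sum of all aggregates up to and including its own.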
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i))
            agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
        barrier();

        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        state[part_ix].aggregate = agg;
        if (part_ix == 0)
            state[0].prefix = agg;
    }

    // Write flag with release semantics
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        uint flag = part_ix == 0 ? FLAG_PREFIX_READY : FLAG_AGGREGATE_READY;
        atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
    }

    DTYPE exclusive = DTYPE(0);
    if (part_ix != 0) {
        // step 4 of paper: decoupled lookback
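        // Walk backwards over the preceding partitions, adding either their
        // published inclusive prefix (which ends the walk) or just their
        // aggregate, until this partition's exclusive prefix is known.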
        uint look_back_ix = part_ix - 1;

        DTYPE their_agg;
        uint their_ix = 0;
        while (true) {
            // Read flag with acquire semantics.
            if (gl_LocalInvocationID.x == WG_SIZE - 1)
                sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);

            // The flag load is done only in the last thread. However, because
            // the translation of memoryBarrierBuffer to Metal requires uniform
            // control flow, we broadcast it to all threads.
            barrier();

            uint flag = sh_flag;
            barrier();

            if (flag == FLAG_PREFIX_READY) {
                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                    DTYPE their_prefix = state[look_back_ix].prefix;
                    exclusive = their_prefix + exclusive;
                }
                break;
            } else if (flag == FLAG_AGGREGATE_READY) {
                if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                    their_agg = state[look_back_ix].aggregate;
                    exclusive = their_agg + exclusive;
                }
                look_back_ix--;
                their_ix = 0;
                continue;
            } // else spins

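            // Fall-through: the flag was FLAG_NOT_READY. their_ix counts how
            // many of that partition's source elements have been folded into
            // their_agg so far (one more element is added per spin iteration).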
            if (gl_LocalInvocationID.x == WG_SIZE - 1) {
                // Unfortunately there's no guarantee of forward progress of
                // other workgroups, so compute a bit of the aggregate before
                // trying again. In the worst case, spinning stops when the
                // aggregate is complete.
                DTYPE m = src.v[(look_back_ix * PARTITION_SIZE + their_ix)*src_stride];
                if (their_ix == 0)
                    their_agg = m;
                else
                    their_agg += m;

                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    exclusive = their_agg + exclusive;
                    if (look_back_ix == 0) {
                        sh_flag = FLAG_PREFIX_READY;
                    } else {
                        look_back_ix--;
                        their_ix = 0;
                    }
                }
            }
            barrier();
            flag = sh_flag;
            barrier();
            if (flag == FLAG_PREFIX_READY)
                break;
        }

        // step 5 of paper: compute inclusive prefix
        if (gl_LocalInvocationID.x == WG_SIZE - 1) {
            DTYPE inclusive_prefix = exclusive + agg;
            sh_prefix = exclusive;
            state[part_ix].prefix = inclusive_prefix;
        }

        if (gl_LocalInvocationID.x == WG_SIZE - 1)
            atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
    }
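
    // Broadcast this partition's exclusive prefix to the whole workgroup and
    // combine it with the per-invocation scan results to produce the final
    // output.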
    barrier();
    if (part_ix != 0)
        exclusive = sh_prefix;

    DTYPE row = exclusive;
    if (gl_LocalInvocationID.x > 0)
        row += sh_scratch[gl_LocalInvocationID.x - 1];

    // note - may overwrite
    for (uint i = 0; i < N_ROWS; i++)
        dst.v[(ix + i)*dst_stride] = row + local[i];
}
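
// A caller is assumed to look roughly like the following (illustrative only;
// the actual entry points live in the shaders that include this file):
//
//     layout(local_size_x = WG_SIZE, local_size_y = 1, local_size_z = 1) in;
//
//     void main()
//     {
//         prefix_sum(dst, 1, src, 1);
//     }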