From b29c7bcbf6bc257f4302b8bed0509bb5489ca068 Mon Sep 17 00:00:00 2001 From: Jarek Samic Date: Thu, 8 Aug 2019 09:24:32 -0400 Subject: [PATCH] lavfi: add deshake_opencl filter --- configure | 1 + doc/filters.texi | 69 + libavfilter/Makefile | 2 + libavfilter/allfilters.c | 1 + libavfilter/opencl/deshake.cl | 647 +++++++++ libavfilter/opencl_source.h | 1 + libavfilter/version.h | 2 +- libavfilter/vf_deshake_opencl.c | 2202 +++++++++++++++++++++++++++++++ 8 files changed, 2924 insertions(+), 1 deletion(-) create mode 100644 libavfilter/opencl/deshake.cl create mode 100644 libavfilter/vf_deshake_opencl.c diff --git a/configure b/configure index c5b71883fd..51dc77c780 100755 --- a/configure +++ b/configure @@ -3454,6 +3454,7 @@ delogo_filter_deps="gpl" denoise_vaapi_filter_deps="vaapi" derain_filter_select="dnn" deshake_filter_select="pixelutils" +deshake_opencl_filter_deps="opencl" dilation_opencl_filter_deps="opencl" drawtext_filter_deps="libfreetype" drawtext_filter_suggest="libfontconfig libfribidi" diff --git a/doc/filters.texi b/doc/filters.texi index 323c02970e..2fbb479186 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -19795,6 +19795,75 @@ Make every semi-green pixel in the input transparent with some slight blending: @end example @end itemize +@section deshake_opencl +Feature-point based video stabilization filter. + +The filter accepts the following options: + +@table @option +@item tripod +Simulates a tripod by preventing any camera movement whatsoever from the original frame. Defaults to @code{0}. + +@item debug +Whether or not additional debug info should be displayed, both in the processed output and in the console. + +Note that in order to see console debug output you will also need to pass @code{-v verbose} to ffmpeg. + +Viewing point matches in the output video is only supported for RGB input. + +Defaults to @code{0}. + +@item adaptive_crop +Whether or not to do a tiny bit of cropping at the borders to cut down on the amount of mirrored pixels. + +Defaults to @code{1}. + +@item refine_features +Whether or not feature points should be refined at a sub-pixel level. + +This can be turned off for a slight performance gain at the cost of precision. + +Defaults to @code{1}. + +@item smooth_strength +The strength of the smoothing applied to the camera path from @code{0.0} to @code{1.0}. + +@code{1.0} is the maximum smoothing strength while values less than that result in less smoothing. + +@code{0.0} causes the filter to adaptively choose a smoothing strength on a per-frame basis. + +Defaults to @code{0.0}. + +@item smooth_window_multiplier +Controls the size of the smoothing window (the number of frames buffered to determine motion information from). + +The size of the smoothing window is determined by multiplying the framerate of the video by this number. + +Acceptable values range from @code{0.1} to @code{10.0}. + +Larger values increase the amount of motion data available for determining how to smooth the camera path, +potentially improving smoothness, but also increase latency and memory usage. + +Defaults to @code{2.0}. + +@end table + +@subsection Examples + +@itemize +@item +Stabilize a video with a fixed, medium smoothing strength: +@example +-i INPUT -vf "hwupload, deshake_opencl=smooth_strength=0.5, hwdownload" OUTPUT +@end example + +@item +Stabilize a video with debugging (both in console and in rendered video): +@example +-i INPUT -filter_complex "[0:v]format=rgba, hwupload, deshake_opencl=debug=1, hwdownload, format=rgba, format=yuv420p" -v verbose OUTPUT +@end example +@end itemize + @section nlmeans_opencl Non-local Means denoise filter through OpenCL, this filter accepts same options as @ref{nlmeans}. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 345f7c95cd..3ef4191d9a 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -211,6 +211,8 @@ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o +OBJS-$(CONFIG_DESHAKE_OPENCL_FILTER) += vf_deshake_opencl.o opencl.o \ + opencl/deshake.o OBJS-$(CONFIG_DESHAKE_FILTER) += vf_deshake.o OBJS-$(CONFIG_DESPILL_FILTER) += vf_despill.o OBJS-$(CONFIG_DETELECINE_FILTER) += vf_detelecine.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 5799fb4b3c..b675c688ee 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -200,6 +200,7 @@ extern AVFilter ff_vf_delogo; extern AVFilter ff_vf_denoise_vaapi; extern AVFilter ff_vf_derain; extern AVFilter ff_vf_deshake; +extern AVFilter ff_vf_deshake_opencl; extern AVFilter ff_vf_despill; extern AVFilter ff_vf_detelecine; extern AVFilter ff_vf_dilation; diff --git a/libavfilter/opencl/deshake.cl b/libavfilter/opencl/deshake.cl new file mode 100644 index 0000000000..fef2681dc6 --- /dev/null +++ b/libavfilter/opencl/deshake.cl @@ -0,0 +1,647 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Copyright (C) 2000, Intel Corporation, all rights reserved. + * Copyright (C) 2013, OpenCV Foundation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistribution's of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistribution's in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * The name of the copyright holders may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the Intel Corporation or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#define HARRIS_THRESHOLD 3.0f +// Block size over which to compute harris response +// +// Note that changing this will require fiddling with the local array sizes in +// harris_response +#define HARRIS_RADIUS 2 +#define DISTANCE_THRESHOLD 80 + +// Sub-pixel refinement window for feature points +#define REFINE_WIN_HALF_W 5 +#define REFINE_WIN_HALF_H 5 +#define REFINE_WIN_W 11 // REFINE_WIN_HALF_W * 2 + 1 +#define REFINE_WIN_H 11 + +// Non-maximum suppression window size +#define NONMAX_WIN 30 +#define NONMAX_WIN_HALF 15 // NONMAX_WIN / 2 + +typedef struct PointPair { + // Previous frame + float2 p1; + // Current frame + float2 p2; +} PointPair; + +typedef struct SmoothedPointPair { + // Non-smoothed point in current frame + int2 p1; + // Smoothed point in current frame + float2 p2; +} SmoothedPointPair; + +typedef struct MotionVector { + PointPair p; + // Used to mark vectors as potential outliers + int should_consider; +} MotionVector; + +const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP_TO_EDGE | + CLK_FILTER_NEAREST; + +const sampler_t sampler_linear = CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP_TO_EDGE | + CLK_FILTER_LINEAR; + +const sampler_t sampler_linear_mirror = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_MIRRORED_REPEAT | + CLK_FILTER_LINEAR; + +// Writes to a 1D array at loc, treating it as a 2D array with the same +// dimensions as the global work size. +static void write_to_1d_arrf(__global float *buf, int2 loc, float val) { + buf[loc.x + loc.y * get_global_size(0)] = val; +} + +static void write_to_1d_arrul8(__global ulong8 *buf, int2 loc, ulong8 val) { + buf[loc.x + loc.y * get_global_size(0)] = val; +} + +static void write_to_1d_arrvec(__global MotionVector *buf, int2 loc, MotionVector val) { + buf[loc.x + loc.y * get_global_size(0)] = val; +} + +static void write_to_1d_arrf2(__global float2 *buf, int2 loc, float2 val) { + buf[loc.x + loc.y * get_global_size(0)] = val; +} + +static ulong8 read_from_1d_arrul8(__global const ulong8 *buf, int2 loc) { + return buf[loc.x + loc.y * get_global_size(0)]; +} + +static float2 read_from_1d_arrf2(__global const float2 *buf, int2 loc) { + return buf[loc.x + loc.y * get_global_size(0)]; +} + +// Returns the grayscale value at the given point. +static float pixel_grayscale(__read_only image2d_t src, int2 loc) { + float4 pixel = read_imagef(src, sampler, loc); + return (pixel.x + pixel.y + pixel.z) / 3.0f; +} + +static float convolve( + __local const float *grayscale, + int local_idx_x, + int local_idx_y, + float mask[3][3] +) { + float ret = 0; + + // These loops touch each pixel surrounding loc as well as loc itself + for (int i = 1, i2 = 0; i >= -1; --i, ++i2) { + for (int j = -1, j2 = 0; j <= 1; ++j, ++j2) { + ret += mask[i2][j2] * grayscale[(local_idx_x + 3 + j) + (local_idx_y + 3 + i) * 14]; + } + } + + return ret; +} + +// Sums dx * dy for all pixels within radius of loc +static float sum_deriv_prod( + __local const float *grayscale, + float mask_x[3][3], + float mask_y[3][3] +) { + float ret = 0; + + for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) { + for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) { + ret += convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_x) * + convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_y); + } + } + + return ret; +} + +// Sums d<>^2 (determined by mask) for all pixels within radius of loc +static float sum_deriv_pow(__local const float *grayscale, float mask[3][3]) +{ + float ret = 0; + + for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) { + for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) { + float deriv = convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask); + ret += deriv * deriv; + } + } + + return ret; +} + +// Fills a box with the given radius and pixel around loc +static void draw_box(__write_only image2d_t dst, int2 loc, float4 pixel, int radius) +{ + for (int i = -radius; i <= radius; ++i) { + for (int j = -radius; j <= radius; ++j) { + write_imagef( + dst, + (int2)( + // Clamp to avoid writing outside image bounds + clamp(loc.x + i, 0, get_image_dim(dst).x - 1), + clamp(loc.y + j, 0, get_image_dim(dst).y - 1) + ), + pixel + ); + } + } +} + +// Converts the src image to grayscale +__kernel void grayscale( + __read_only image2d_t src, + __write_only image2d_t grayscale +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + write_imagef(grayscale, loc, (float4)(pixel_grayscale(src, loc), 0.0f, 0.0f, 1.0f)); +} + +// This kernel computes the harris response for the given grayscale src image +// within the given radius and writes it to harris_buf +__kernel void harris_response( + __read_only image2d_t grayscale, + __global float *harris_buf +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + + if (loc.x > get_image_width(grayscale) - 1 || loc.y > get_image_height(grayscale) - 1) { + write_to_1d_arrf(harris_buf, loc, 0); + return; + } + + float scale = 1.0f / ((1 << 2) * HARRIS_RADIUS * 255.0f); + + float sobel_mask_x[3][3] = { + {-1, 0, 1}, + {-2, 0, 2}, + {-1, 0, 1} + }; + + float sobel_mask_y[3][3] = { + { 1, 2, 1}, + { 0, 0, 0}, + {-1, -2, -1} + }; + + // 8 x 8 local work + 3 pixels around each side (needed to accomodate for the + // block size radius of 2) + __local float grayscale_data[196]; + + int idx = get_group_id(0) * get_local_size(0); + int idy = get_group_id(1) * get_local_size(1); + + for (int i = idy - 3, it = 0; i < idy + (int)get_local_size(1) + 3; i++, it++) { + for (int j = idx - 3, jt = 0; j < idx + (int)get_local_size(0) + 3; j++, jt++) { + grayscale_data[jt + it * 14] = read_imagef(grayscale, sampler, (int2)(j, i)).x; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float sumdxdy = sum_deriv_prod(grayscale_data, sobel_mask_x, sobel_mask_y); + float sumdx2 = sum_deriv_pow(grayscale_data, sobel_mask_x); + float sumdy2 = sum_deriv_pow(grayscale_data, sobel_mask_y); + + float trace = sumdx2 + sumdy2; + // r = det(M) - k(trace(M))^2 + // k usually between 0.04 to 0.06 + float r = (sumdx2 * sumdy2 - sumdxdy * sumdxdy) - 0.04f * (trace * trace) * pown(scale, 4); + + // Threshold the r value + harris_buf[loc.x + loc.y * get_image_width(grayscale)] = r * step(HARRIS_THRESHOLD, r); +} + +// Gets a patch centered around a float coordinate from a grayscale image using +// bilinear interpolation +static void get_rect_sub_pix( + __read_only image2d_t grayscale, + float *buffer, + int size_x, + int size_y, + float2 center +) { + float2 offset = ((float2)(size_x, size_y) - 1.0f) * 0.5f; + + for (int i = 0; i < size_y; i++) { + for (int j = 0; j < size_x; j++) { + buffer[i * size_x + j] = read_imagef( + grayscale, + sampler_linear, + (float2)(j, i) + center - offset + ).x * 255.0f; + } + } +} + +// Refines detected features at a sub-pixel level +// +// This function is ported from OpenCV +static float2 corner_sub_pix( + __read_only image2d_t grayscale, + float2 feature, + float *mask +) { + float2 init = feature; + int src_width = get_global_size(0); + int src_height = get_global_size(1); + + const int max_iters = 40; + const float eps = 0.001f * 0.001f; + int i, j, k; + + int iter = 0; + float err = 0; + float subpix[(REFINE_WIN_W + 2) * (REFINE_WIN_H + 2)]; + const float flt_epsilon = 0x1.0p-23f; + + do { + float2 feature_tmp; + float a = 0, b = 0, c = 0, bb1 = 0, bb2 = 0; + + get_rect_sub_pix(grayscale, subpix, REFINE_WIN_W + 2, REFINE_WIN_H + 2, feature); + float *subpix_ptr = subpix; + subpix_ptr += REFINE_WIN_W + 2 + 1; + + // process gradient + for (i = 0, k = 0; i < REFINE_WIN_H; i++, subpix_ptr += REFINE_WIN_W + 2) { + float py = i - REFINE_WIN_HALF_H; + + for (j = 0; j < REFINE_WIN_W; j++, k++) { + float m = mask[k]; + float tgx = subpix_ptr[j + 1] - subpix_ptr[j - 1]; + float tgy = subpix_ptr[j + REFINE_WIN_W + 2] - subpix_ptr[j - REFINE_WIN_W - 2]; + float gxx = tgx * tgx * m; + float gxy = tgx * tgy * m; + float gyy = tgy * tgy * m; + float px = j - REFINE_WIN_HALF_W; + + a += gxx; + b += gxy; + c += gyy; + + bb1 += gxx * px + gxy * py; + bb2 += gxy * px + gyy * py; + } + } + + float det = a * c - b * b; + if (fabs(det) <= flt_epsilon * flt_epsilon) { + break; + } + + // 2x2 matrix inversion + float scale = 1.0f / det; + feature_tmp.x = (float)(feature.x + (c * scale * bb1) - (b * scale * bb2)); + feature_tmp.y = (float)(feature.y - (b * scale * bb1) + (a * scale * bb2)); + err = dot(feature_tmp - feature, feature_tmp - feature); + + feature = feature_tmp; + if (feature.x < 0 || feature.x >= src_width || feature.y < 0 || feature.y >= src_height) { + break; + } + } while (++iter < max_iters && err > eps); + + // Make sure new point isn't too far from the initial point (indicates poor convergence) + if (fabs(feature.x - init.x) > REFINE_WIN_HALF_W || fabs(feature.y - init.y) > REFINE_WIN_HALF_H) { + feature = init; + } + + return feature; +} + +// Performs non-maximum suppression on the harris response and writes the resulting +// feature locations to refined_features. +// +// Assumes that refined_features and the global work sizes are set up such that the image +// is split up into a grid of 32x32 blocks where each block has a single slot in the +// refined_features buffer. This kernel finds the best corner in each block (if the +// block has any) and writes it to the corresponding slot in the buffer. +// +// If subpixel_refine is true, the features are additionally refined at a sub-pixel +// level for increased precision. +__kernel void refine_features( + __read_only image2d_t grayscale, + __global const float *harris_buf, + __global float2 *refined_features, + int subpixel_refine +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + // The location in the grayscale buffer rather than the compacted grid + int2 loc_i = (int2)(loc.x * 32, loc.y * 32); + + float new_val; + float max_val = 0; + float2 loc_max = (float2)(-1, -1); + + int end_x = min(loc_i.x + 32, (int)get_image_dim(grayscale).x - 1); + int end_y = min(loc_i.y + 32, (int)get_image_dim(grayscale).y - 1); + + for (int i = loc_i.x; i < end_x; ++i) { + for (int j = loc_i.y; j < end_y; ++j) { + new_val = harris_buf[i + j * get_image_dim(grayscale).x]; + + if (new_val > max_val) { + max_val = new_val; + loc_max = (float2)(i, j); + } + } + } + + if (max_val == 0) { + // There are no features in this part of the frame + write_to_1d_arrf2(refined_features, loc, loc_max); + return; + } + + if (subpixel_refine) { + float mask[REFINE_WIN_H * REFINE_WIN_W]; + for (int i = 0; i < REFINE_WIN_H; i++) { + float y = (float)(i - REFINE_WIN_HALF_H) / REFINE_WIN_HALF_H; + float vy = exp(-y * y); + + for (int j = 0; j < REFINE_WIN_W; j++) { + float x = (float)(j - REFINE_WIN_HALF_W) / REFINE_WIN_HALF_W; + mask[i * REFINE_WIN_W + j] = (float)(vy * exp(-x * x)); + } + } + + loc_max = corner_sub_pix(grayscale, loc_max, mask); + } + + write_to_1d_arrf2(refined_features, loc, loc_max); +} + +// Extracts BRIEF descriptors from the grayscale src image for the given features +// using the provided sampler. +__kernel void brief_descriptors( + __read_only image2d_t grayscale, + __global const float2 *refined_features, + // for 512 bit descriptors + __global ulong8 *desc_buf, + __global const PointPair *brief_pattern +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + float2 feature = read_from_1d_arrf2(refined_features, loc); + + // There was no feature in this part of the frame + if (feature.x == -1) { + write_to_1d_arrul8(desc_buf, loc, (ulong8)(0)); + return; + } + + ulong8 desc = 0; + ulong *p = &desc; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 64; ++j) { + PointPair pair = brief_pattern[j * (i + 1)]; + float l1 = read_imagef(grayscale, sampler_linear, feature + pair.p1).x; + float l2 = read_imagef(grayscale, sampler_linear, feature + pair.p2).x; + + if (l1 < l2) { + p[i] |= 1UL << j; + } + } + } + + write_to_1d_arrul8(desc_buf, loc, desc); +} + +// Given buffers with descriptors for the current and previous frame, determines +// which ones match, writing correspondences to matches_buf. +// +// Feature and descriptor buffers are assumed to be compacted (each element sourced +// from a 32x32 block in the frame being processed). +__kernel void match_descriptors( + __global const float2 *prev_refined_features, + __global const float2 *refined_features, + __global const ulong8 *desc_buf, + __global const ulong8 *prev_desc_buf, + __global MotionVector *matches_buf +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + ulong8 desc = read_from_1d_arrul8(desc_buf, loc); + const int search_radius = 3; + + MotionVector invalid_vector = (MotionVector) { + (PointPair) { + (float2)(-1, -1), + (float2)(-1, -1) + }, + 0 + }; + + if (desc.s0 == 0 && desc.s1 == 0) { + // There was no feature in this part of the frame + write_to_1d_arrvec( + matches_buf, + loc, + invalid_vector + ); + return; + } + + int2 start = max(loc - search_radius, 0); + int2 end = min(loc + search_radius, (int2)(get_global_size(0) - 1, get_global_size(1) - 1)); + + for (int i = start.x; i < end.x; ++i) { + for (int j = start.y; j < end.y; ++j) { + int2 prev_point = (int2)(i, j); + int total_dist = 0; + + ulong8 prev_desc = read_from_1d_arrul8(prev_desc_buf, prev_point); + + if (prev_desc.s0 == 0 && prev_desc.s1 == 0) { + continue; + } + + ulong *prev_desc_p = &prev_desc; + ulong *desc_p = &desc; + + for (int i = 0; i < 8; i++) { + total_dist += popcount(desc_p[i] ^ prev_desc_p[i]); + } + + if (total_dist < DISTANCE_THRESHOLD) { + write_to_1d_arrvec( + matches_buf, + loc, + (MotionVector) { + (PointPair) { + read_from_1d_arrf2(prev_refined_features, prev_point), + read_from_1d_arrf2(refined_features, loc) + }, + 1 + } + ); + + return; + } + } + } + + // There is no found match for this point + write_to_1d_arrvec( + matches_buf, + loc, + invalid_vector + ); +} + +// Returns the position of the given point after the transform is applied +static float2 transformed_point(float2 p, __global const float *transform) { + float2 ret; + + ret.x = p.x * transform[0] + p.y * transform[1] + transform[2]; + ret.y = p.x * transform[3] + p.y * transform[4] + transform[5]; + + return ret; +} + + +// Performs the given transform on the src image +__kernel void transform( + __read_only image2d_t src, + __write_only image2d_t dst, + __global const float *transform +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + float2 norm = convert_float2(get_image_dim(src)); + + write_imagef( + dst, + loc, + read_imagef( + src, + sampler_linear_mirror, + transformed_point((float2)(loc.x, loc.y), transform) / norm + ) + ); +} + +// Returns the new location of the given point using the given crop bounding box +// and the width and height of the original frame. +static float2 cropped_point( + float2 p, + float2 top_left, + float2 bottom_right, + int2 orig_dim +) { + float2 ret; + + float crop_width = bottom_right.x - top_left.x; + float crop_height = bottom_right.y - top_left.y; + + float width_norm = p.x / (float)orig_dim.x; + float height_norm = p.y / (float)orig_dim.y; + + ret.x = (width_norm * crop_width) + top_left.x; + ret.y = (height_norm * crop_height) + ((float)orig_dim.y - bottom_right.y); + + return ret; +} + +// Upscales the given cropped region to the size of the original frame +__kernel void crop_upscale( + __read_only image2d_t src, + __write_only image2d_t dst, + float2 top_left, + float2 bottom_right +) { + int2 loc = (int2)(get_global_id(0), get_global_id(1)); + + write_imagef( + dst, + loc, + read_imagef( + src, + sampler_linear, + cropped_point((float2)(loc.x, loc.y), top_left, bottom_right, get_image_dim(dst)) + ) + ); +} + +// Draws boxes to represent the given point matches and uses the given transform +// and crop info to make sure their positions are accurate on the transformed frame. +// +// model_matches is an array of three points that were used by the RANSAC process +// to generate the given transform +__kernel void draw_debug_info( + __write_only image2d_t dst, + __global const MotionVector *matches, + __global const MotionVector *model_matches, + int num_model_matches, + __global const float *transform +) { + int loc = get_global_id(0); + MotionVector vec = matches[loc]; + // Black box: matched point that RANSAC considered an outlier + float4 big_rect_color = (float4)(0.1f, 0.1f, 0.1f, 1.0f); + + if (vec.should_consider) { + // Green box: matched point that RANSAC considered an inlier + big_rect_color = (float4)(0.0f, 1.0f, 0.0f, 1.0f); + } + + for (int i = 0; i < num_model_matches; i++) { + if (vec.p.p2.x == model_matches[i].p.p2.x && vec.p.p2.y == model_matches[i].p.p2.y) { + // Orange box: point used to calculate model + big_rect_color = (float4)(1.0f, 0.5f, 0.0f, 1.0f); + } + } + + float2 transformed_p1 = transformed_point(vec.p.p1, transform); + float2 transformed_p2 = transformed_point(vec.p.p2, transform); + + draw_box(dst, (int2)(transformed_p2.x, transformed_p2.y), big_rect_color, 5); + // Small light blue box: the point in the previous frame + draw_box(dst, (int2)(transformed_p1.x, transformed_p1.y), (float4)(0.0f, 0.3f, 0.7f, 1.0f), 3); +} diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h index 1a6cd7ca7a..225e7a49ea 100644 --- a/libavfilter/opencl_source.h +++ b/libavfilter/opencl_source.h @@ -23,6 +23,7 @@ extern const char *ff_opencl_source_avgblur; extern const char *ff_opencl_source_colorkey; extern const char *ff_opencl_source_colorspace_common; extern const char *ff_opencl_source_convolution; +extern const char *ff_opencl_source_deshake; extern const char *ff_opencl_source_neighbor; extern const char *ff_opencl_source_nlmeans; extern const char *ff_opencl_source_overlay; diff --git a/libavfilter/version.h b/libavfilter/version.h index dd829160fc..58a0221f43 100644 --- a/libavfilter/version.h +++ b/libavfilter/version.h @@ -31,7 +31,7 @@ #define LIBAVFILTER_VERSION_MAJOR 7 #define LIBAVFILTER_VERSION_MINOR 58 -#define LIBAVFILTER_VERSION_MICRO 100 +#define LIBAVFILTER_VERSION_MICRO 101 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \ diff --git a/libavfilter/vf_deshake_opencl.c b/libavfilter/vf_deshake_opencl.c new file mode 100644 index 0000000000..c959f19475 --- /dev/null +++ b/libavfilter/vf_deshake_opencl.c @@ -0,0 +1,2202 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Copyright (C) 2000-2008, Intel Corporation, all rights reserved. + * Copyright (C) 2009, Willow Garage Inc., all rights reserved. + * Copyright (C) 2013, OpenCV Foundation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistribution's of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistribution's in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * The name of the copyright holders may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the Intel Corporation or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include +#include +#include +#include "libavutil/opt.h" +#include "libavutil/imgutils.h" +#include "libavutil/mem.h" +#include "libavutil/fifo.h" +#include "libavutil/common.h" +#include "libavutil/avassert.h" +#include "libavutil/pixfmt.h" +#include "avfilter.h" +#include "framequeue.h" +#include "filters.h" +#include "transform.h" +#include "formats.h" +#include "internal.h" +#include "opencl.h" +#include "opencl_source.h" +#include "video.h" + +/* +This filter matches feature points between frames (dealing with outliers) and then +uses the matches to estimate an affine transform between frames. This transform is +decomposed into various values (translation, scale, rotation) and the values are +summed relative to the start of the video to obtain on absolute camera position +for each frame. This "camera path" is then smoothed via a gaussian filter, resulting +in a new path that is turned back into an affine transform and applied to each +frame to render it. + +High-level overview: + +All of the work to extract motion data from frames occurs in queue_frame. Motion data +is buffered in a smoothing window, so queue_frame simply computes the absolute camera +positions and places them in ringbuffers. + +filter_frame is responsible for looking at the absolute camera positions currently +in the ringbuffers, applying the gaussian filter, and then transforming the frames. +*/ + +// Number of bits for BRIEF descriptors +#define BREIFN 512 +// Size of the patch from which a BRIEF descriptor is extracted +// This is the size used in OpenCV +#define BRIEF_PATCH_SIZE 31 +#define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2) + +#define MATCHES_CONTIG_SIZE 2000 + +#define ROUNDED_UP_DIV(a, b) ((a + (b - 1)) / b) + +typedef struct PointPair { + // Previous frame + cl_float2 p1; + // Current frame + cl_float2 p2; +} PointPair; + +typedef struct MotionVector { + PointPair p; + // Used to mark vectors as potential outliers + cl_int should_consider; +} MotionVector; + +// Denotes the indices for the different types of motion in the ringbuffers array +enum RingbufferIndices { + RingbufX, + RingbufY, + RingbufRot, + RingbufScaleX, + RingbufScaleY, + + // Should always be last + RingbufCount +}; + +// Struct that holds data for drawing point match debug data +typedef struct DebugMatches { + MotionVector *matches; + // The points used to calculate the affine transform for a frame + MotionVector model_matches[3]; + + int num_matches; + // For cases where we couldn't calculate a model + int num_model_matches; +} DebugMatches; + +// Groups together the ringbuffers that store absolute distortion / position values +// for each frame +typedef struct AbsoluteFrameMotion { + // Array with the various ringbuffers, indexed via the RingbufferIndices enum + AVFifoBuffer *ringbuffers[RingbufCount]; + + // Offset to get to the current frame being processed + // (not in bytes) + int curr_frame_offset; + // Keeps track of where the start and end of contiguous motion data is (to + // deal with cases where no motion data is found between two frames) + int data_start_offset; + int data_end_offset; + + AVFifoBuffer *debug_matches; +} AbsoluteFrameMotion; + +// Takes care of freeing the arrays within the DebugMatches inside of the +// debug_matches ringbuffer and then freeing the buffer itself. +static void free_debug_matches(AbsoluteFrameMotion *afm) { + DebugMatches dm; + + if (!afm->debug_matches) { + return; + } + + while (av_fifo_size(afm->debug_matches) > 0) { + av_fifo_generic_read( + afm->debug_matches, + &dm, + sizeof(DebugMatches), + NULL + ); + + av_freep(&dm.matches); + } + + av_fifo_freep(&afm->debug_matches); +} + +// Stores the translation, scale, rotation, and skew deltas between two frames +typedef struct FrameDelta { + cl_float2 translation; + float rotation; + cl_float2 scale; + cl_float2 skew; +} FrameDelta; + +typedef struct SimilarityMatrix { + // The 2x3 similarity matrix + double matrix[6]; +} SimilarityMatrix; + +typedef struct CropInfo { + // The top left corner of the bounding box for the crop + cl_float2 top_left; + // The bottom right corner of the bounding box for the crop + cl_float2 bottom_right; +} CropInfo; + +// Returned from function that determines start and end values for iteration +// around the current frame in a ringbuffer +typedef struct IterIndices { + int start; + int end; +} IterIndices; + +typedef struct DeshakeOpenCLContext { + OpenCLFilterContext ocf; + // Whether or not the above `OpenCLFilterContext` has been initialized + int initialized; + + // These variables are used in the activate callback + int64_t duration; + bool eof; + + // State for random number generation + AVLFG alfg; + + // FIFO frame queue used to buffer future frames for processing + FFFrameQueue fq; + // Ringbuffers for frame positions + AbsoluteFrameMotion abs_motion; + + // The number of frames' motion to consider before and after the frame we are + // smoothing + int smooth_window; + // The number of the frame we are currently processing + int curr_frame; + + // Stores a 1d array of normalised gaussian kernel values for convolution + float *gauss_kernel; + + // Buffer for error values used in RANSAC code + float *ransac_err; + + // Information regarding how to crop the smoothed luminance (or RGB) planes + CropInfo crop_y; + // Information regarding how to crop the smoothed chroma planes + CropInfo crop_uv; + + // Whether or not we are processing YUV input (as oppposed to RGB) + bool is_yuv; + // The underlying format of the hardware surfaces + int sw_format; + + // Buffer to copy `matches` into for the CPU to work with + MotionVector *matches_host; + MotionVector *matches_contig_host; + + MotionVector *inliers; + + cl_command_queue command_queue; + cl_kernel kernel_grayscale; + cl_kernel kernel_harris_response; + cl_kernel kernel_refine_features; + cl_kernel kernel_brief_descriptors; + cl_kernel kernel_match_descriptors; + cl_kernel kernel_transform; + cl_kernel kernel_crop_upscale; + + // Stores a frame converted to grayscale + cl_mem grayscale; + // Stores the harris response for a frame (measure of "cornerness" for each pixel) + cl_mem harris_buf; + + // Detected features after non-maximum suppression and sub-pixel refinement + cl_mem refined_features; + // Saved from the previous frame + cl_mem prev_refined_features; + + // BRIEF sampling pattern that is randomly initialized + cl_mem brief_pattern; + // Feature point descriptors for the current frame + cl_mem descriptors; + // Feature point descriptors for the previous frame + cl_mem prev_descriptors; + // Vectors between points in current and previous frame + cl_mem matches; + cl_mem matches_contig; + // Holds the matrix to transform luminance (or RGB) with + cl_mem transform_y; + // Holds the matrix to transform chroma with + cl_mem transform_uv; + + // Configurable options + + int tripod_mode; + int debug_on; + int should_crop; + + // Whether or not feature points should be refined at a sub-pixel level + cl_int refine_features; + // If the user sets a value other than the default, 0, this percentage is + // translated into a sigma value ranging from 0.5 to 40.0 + float smooth_percent; + // This number is multiplied by the video frame rate to determine the size + // of the smooth window + float smooth_window_multiplier; + + // Debug stuff + + cl_kernel kernel_draw_debug_info; + cl_mem debug_matches; + cl_mem debug_model_matches; + + // These store the total time spent executing the different kernels in nanoseconds + unsigned long long grayscale_time; + unsigned long long harris_response_time; + unsigned long long refine_features_time; + unsigned long long brief_descriptors_time; + unsigned long long match_descriptors_time; + unsigned long long transform_time; + unsigned long long crop_upscale_time; + + // Time spent copying matched features from the device to the host + unsigned long long read_buf_time; +} DeshakeOpenCLContext; + +// Returns a random uniformly-distributed number in [low, high] +static int rand_in(int low, int high, AVLFG *alfg) { + return (av_lfg_get(alfg) % (high - low)) + low; +} + +// Returns the average execution time for an event given the total time and the +// number of frames processed. +static double averaged_event_time_ms(unsigned long long total_time, int num_frames) { + return (double)total_time / (double)num_frames / 1000000.0; +} + +// The following code is loosely ported from OpenCV + +// Estimates affine transform from 3 point pairs +// model is a 2x3 matrix: +// a b c +// d e f +static void run_estimate_kernel(const MotionVector *point_pairs, double *model) +{ + // src points + double x1 = point_pairs[0].p.p1.s[0]; + double y1 = point_pairs[0].p.p1.s[1]; + double x2 = point_pairs[1].p.p1.s[0]; + double y2 = point_pairs[1].p.p1.s[1]; + double x3 = point_pairs[2].p.p1.s[0]; + double y3 = point_pairs[2].p.p1.s[1]; + + // dest points + double X1 = point_pairs[0].p.p2.s[0]; + double Y1 = point_pairs[0].p.p2.s[1]; + double X2 = point_pairs[1].p.p2.s[0]; + double Y2 = point_pairs[1].p.p2.s[1]; + double X3 = point_pairs[2].p.p2.s[0]; + double Y3 = point_pairs[2].p.p2.s[1]; + + double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) ); + + model[0] = d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) ); + model[1] = d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) ); + model[2] = d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) ); + + model[3] = d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) ); + model[4] = d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) ); + model[5] = d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) ); +} + +// Checks that the 3 points in the given array are not collinear +static bool points_not_collinear(const cl_float2 **points) +{ + int j, k, i = 2; + + for (j = 0; j < i; j++) { + double dx1 = points[j]->s[0] - points[i]->s[0]; + double dy1 = points[j]->s[1] - points[i]->s[1]; + + for (k = 0; k < j; k++) { + double dx2 = points[k]->s[0] - points[i]->s[0]; + double dy2 = points[k]->s[1] - points[i]->s[1]; + + // Assuming a 3840 x 2160 video with a point at (0, 0) and one at + // (3839, 2159), this prevents a third point from being within roughly + // 0.5 of a pixel of the line connecting the two on both axes + if (fabs(dx2*dy1 - dy2*dx1) <= 1.0) { + return false; + } + } + } + + return true; +} + +// Checks a subset of 3 point pairs to make sure that the points are not collinear +// and not too close to each other +static bool check_subset(const MotionVector *pairs_subset) +{ + const cl_float2 *prev_points[] = { + &pairs_subset[0].p.p1, + &pairs_subset[1].p.p1, + &pairs_subset[2].p.p1 + }; + + const cl_float2 *curr_points[] = { + &pairs_subset[0].p.p2, + &pairs_subset[1].p.p2, + &pairs_subset[2].p.p2 + }; + + return points_not_collinear(prev_points) && points_not_collinear(curr_points); +} + +// Selects a random subset of 3 points from point_pairs and places them in pairs_subset +static bool get_subset( + AVLFG *alfg, + const MotionVector *point_pairs, + const int num_point_pairs, + MotionVector *pairs_subset, + int max_attempts +) { + int idx[3]; + int i = 0, j, iters = 0; + + for (; iters < max_attempts; iters++) { + for (i = 0; i < 3 && iters < max_attempts;) { + int idx_i = 0; + + for (;;) { + idx_i = idx[i] = rand_in(0, num_point_pairs, alfg); + + for (j = 0; j < i; j++) { + if (idx_i == idx[j]) { + break; + } + } + + if (j == i) { + break; + } + } + + pairs_subset[i] = point_pairs[idx[i]]; + i++; + } + + if (i == 3 && !check_subset(pairs_subset)) { + continue; + } + break; + } + + return i == 3 && iters < max_attempts; +} + +// Computes the error for each of the given points based on the given model. +static void compute_error( + const MotionVector *point_pairs, + const int num_point_pairs, + const double *model, + float *err +) { + double F0 = model[0], F1 = model[1], F2 = model[2]; + double F3 = model[3], F4 = model[4], F5 = model[5]; + + for (int i = 0; i < num_point_pairs; i++) { + const cl_float2 *f = &point_pairs[i].p.p1; + const cl_float2 *t = &point_pairs[i].p.p2; + + double a = F0*f->s[0] + F1*f->s[1] + F2 - t->s[0]; + double b = F3*f->s[0] + F4*f->s[1] + F5 - t->s[1]; + + err[i] = a*a + b*b; + } +} + +// Determines which of the given point matches are inliers for the given model +// based on the specified threshold. +// +// err must be an array of num_point_pairs length +static int find_inliers( + MotionVector *point_pairs, + const int num_point_pairs, + const double *model, + float *err, + double thresh +) { + float t = (float)(thresh * thresh); + int i, n = num_point_pairs, num_inliers = 0; + + compute_error(point_pairs, num_point_pairs, model, err); + + for (i = 0; i < n; i++) { + if (err[i] <= t) { + // This is an inlier + point_pairs[i].should_consider = true; + num_inliers += 1; + } else { + point_pairs[i].should_consider = false; + } + } + + return num_inliers; +} + +// Determines the number of iterations required to achieve the desired confidence level. +// +// The equation used to determine the number of iterations to do is: +// 1 - confidence = (1 - inlier_probability^num_points)^num_iters +// +// Solving for num_iters: +// +// num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points) +// +// A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus +// under the 'Parameters' heading +static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters) +{ + double num, denom; + + confidence = av_clipd(confidence, 0.0, 1.0); + num_outliers = av_clipd(num_outliers, 0.0, 1.0); + + // avoid inf's & nan's + num = FFMAX(1.0 - confidence, DBL_MIN); + denom = 1.0 - pow(1.0 - num_outliers, 3); + if (denom < DBL_MIN) { + return 0; + } + + num = log(num); + denom = log(denom); + + return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom); +} + +// Estimates an affine transform between the given pairs of points using RANdom +// SAmple Consensus +static bool estimate_affine_2d( + DeshakeOpenCLContext *deshake_ctx, + MotionVector *point_pairs, + DebugMatches *debug_matches, + const int num_point_pairs, + double *model_out, + const double threshold, + const int max_iters, + const double confidence +) { + bool result = false; + double best_model[6], model[6]; + MotionVector pairs_subset[3], best_pairs[3]; + + int iter, niters = FFMAX(max_iters, 1); + int good_count, max_good_count = 0; + + // We need at least 3 points to build a model from + if (num_point_pairs < 3) { + return false; + } else if (num_point_pairs == 3) { + // There are only 3 points, so RANSAC doesn't apply here + run_estimate_kernel(point_pairs, model_out); + + for (int i = 0; i < 3; ++i) { + point_pairs[i].should_consider = true; + } + + return true; + } + + for (iter = 0; iter < niters; ++iter) { + bool found = get_subset(&deshake_ctx->alfg, point_pairs, num_point_pairs, pairs_subset, 10000); + + if (!found) { + if (iter == 0) { + return false; + } + + break; + } + + run_estimate_kernel(pairs_subset, model); + good_count = find_inliers(point_pairs, num_point_pairs, model, deshake_ctx->ransac_err, threshold); + + if (good_count > FFMAX(max_good_count, 2)) { + for (int mi = 0; mi < 6; ++mi) { + best_model[mi] = model[mi]; + } + + for (int pi = 0; pi < 3; pi++) { + best_pairs[pi] = pairs_subset[pi]; + } + + max_good_count = good_count; + niters = ransac_update_num_iters( + confidence, + (double)(num_point_pairs - good_count) / num_point_pairs, + niters + ); + } + } + + if (max_good_count > 0) { + for (int mi = 0; mi < 6; ++mi) { + model_out[mi] = best_model[mi]; + } + + for (int pi = 0; pi < 3; ++pi) { + debug_matches->model_matches[pi] = best_pairs[pi]; + } + debug_matches->num_model_matches = 3; + + // Find the inliers again for the best model for debugging + find_inliers(point_pairs, num_point_pairs, best_model, deshake_ctx->ransac_err, threshold); + result = true; + } + + return result; +} + +// "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the +// total error +static void optimize_model( + DeshakeOpenCLContext *deshake_ctx, + MotionVector *best_pairs, + MotionVector *inliers, + const int num_inliers, + float best_err, + double *model_out +) { + float move_x_val = 0.01; + float move_y_val = 0.01; + bool move_x = true; + float old_move_x_val = 0; + double model[6]; + int last_changed = 0; + + for (int iters = 0; iters < 200; iters++) { + float total_err = 0; + + if (move_x) { + best_pairs[0].p.p2.s[0] += move_x_val; + } else { + best_pairs[0].p.p2.s[0] += move_y_val; + } + + run_estimate_kernel(best_pairs, model); + compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err); + + for (int j = 0; j < num_inliers; j++) { + total_err += deshake_ctx->ransac_err[j]; + } + + if (total_err < best_err) { + for (int mi = 0; mi < 6; ++mi) { + model_out[mi] = model[mi]; + } + + best_err = total_err; + last_changed = iters; + } else { + // Undo the change + if (move_x) { + best_pairs[0].p.p2.s[0] -= move_x_val; + } else { + best_pairs[0].p.p2.s[0] -= move_y_val; + } + + if (iters - last_changed > 4) { + // We've already improved the model as much as we can + break; + } + + old_move_x_val = move_x_val; + + if (move_x) { + move_x_val *= -1; + } else { + move_y_val *= -1; + } + + if (old_move_x_val < 0) { + move_x = false; + } else { + move_x = true; + } + } + } +} + +// Uses a process similar to that of RANSAC to find a transform that minimizes +// the total error for a set of point matches determined to be inliers +// +// (Pick random subsets, compute model, find total error, iterate until error +// is minimized.) +static bool minimize_error( + DeshakeOpenCLContext *deshake_ctx, + MotionVector *inliers, + DebugMatches *debug_matches, + const int num_inliers, + double *model_out, + const int max_iters +) { + bool result = false; + float best_err = FLT_MAX; + double best_model[6], model[6]; + MotionVector pairs_subset[3], best_pairs[3]; + + for (int i = 0; i < max_iters; i++) { + float total_err = 0; + bool found = get_subset(&deshake_ctx->alfg, inliers, num_inliers, pairs_subset, 10000); + + if (!found) { + if (i == 0) { + return false; + } + + break; + } + + run_estimate_kernel(pairs_subset, model); + compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err); + + for (int j = 0; j < num_inliers; j++) { + total_err += deshake_ctx->ransac_err[j]; + } + + if (total_err < best_err) { + for (int mi = 0; mi < 6; ++mi) { + best_model[mi] = model[mi]; + } + + for (int pi = 0; pi < 3; pi++) { + best_pairs[pi] = pairs_subset[pi]; + } + + best_err = total_err; + } + } + + for (int mi = 0; mi < 6; ++mi) { + model_out[mi] = best_model[mi]; + } + + for (int pi = 0; pi < 3; ++pi) { + debug_matches->model_matches[pi] = best_pairs[pi]; + } + debug_matches->num_model_matches = 3; + result = true; + + optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out); + return result; +} + +// End code from OpenCV + +// Decomposes a similarity matrix into translation, rotation, scale, and skew +// +// See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html +static FrameDelta decompose_transform(double *model) +{ + FrameDelta ret; + + double a = model[0]; + double c = model[1]; + double e = model[2]; + double b = model[3]; + double d = model[4]; + double f = model[5]; + double delta = a * d - b * c; + + ret.translation.s[0] = e; + ret.translation.s[1] = f; + + // This is the QR method + if (a != 0 || b != 0) { + double r = hypot(a, b); + + ret.rotation = FFSIGN(b) * acos(a / r); + ret.scale.s[0] = r; + ret.scale.s[1] = delta / r; + ret.skew.s[0] = atan((a * c + b * d) / (r * r)); + ret.skew.s[1] = 0; + } else if (c != 0 || d != 0) { + double s = sqrt(c * c + d * d); + + ret.rotation = M_PI / 2 - FFSIGN(d) * acos(-c / s); + ret.scale.s[0] = delta / s; + ret.scale.s[1] = s; + ret.skew.s[0] = 0; + ret.skew.s[1] = atan((a * c + b * d) / (s * s)); + } // otherwise there is only translation + + return ret; +} + +// Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous +static int make_vectors_contig( + DeshakeOpenCLContext *deshake_ctx, + int size_y, + int size_x +) { + int num_vectors = 0; + + for (int i = 0; i < size_y; ++i) { + for (int j = 0; j < size_x; ++j) { + MotionVector v = deshake_ctx->matches_host[j + i * size_x]; + + if (v.should_consider) { + deshake_ctx->matches_contig_host[num_vectors] = v; + ++num_vectors; + } + + // Make sure we do not exceed the amount of space we allocated for these vectors + if (num_vectors == MATCHES_CONTIG_SIZE - 1) { + return num_vectors; + } + } + } + return num_vectors; +} + +// Returns the gaussian kernel value for the given x coordinate and sigma value +static float gaussian_for(int x, float sigma) { + return 1.0f / expf(((float)x * (float)x) / (2.0f * sigma * sigma)); +} + +// Makes a normalized gaussian kernel of the given length for the given sigma +// and places it in gauss_kernel +static void make_gauss_kernel(float *gauss_kernel, float length, float sigma) +{ + float gauss_sum = 0; + int window_half = length / 2; + + for (int i = 0; i < length; ++i) { + float val = gaussian_for(i - window_half, sigma); + + gauss_sum += val; + gauss_kernel[i] = val; + } + + // Normalize the gaussian values + for (int i = 0; i < length; ++i) { + gauss_kernel[i] /= gauss_sum; + } +} + +// Returns indices to start and end iteration at in order to iterate over a window +// of length size centered at the current frame in a ringbuffer +// +// Always returns numbers that result in a window of length size, even if that +// means specifying negative indices or indices past the end of the values in the +// ringbuffers. Make sure you clip indices appropriately within your loop. +static IterIndices start_end_for(DeshakeOpenCLContext *deshake_ctx, int length) { + IterIndices indices; + + indices.start = deshake_ctx->abs_motion.curr_frame_offset - (length / 2); + indices.end = deshake_ctx->abs_motion.curr_frame_offset + (length / 2) + (length % 2); + + return indices; +} + +// Sets val to the value in the given ringbuffer at the given offset, taking care of +// clipping the offset into the appropriate range +static void ringbuf_float_at( + DeshakeOpenCLContext *deshake_ctx, + AVFifoBuffer *values, + float *val, + int offset +) { + int clip_start, clip_end, offset_clipped; + if (deshake_ctx->abs_motion.data_end_offset != -1) { + clip_end = deshake_ctx->abs_motion.data_end_offset; + } else { + // This expression represents the last valid index in the buffer, + // which we use repeatedly at the end of the video. + clip_end = deshake_ctx->smooth_window - (av_fifo_space(values) / sizeof(float)) - 1; + } + + if (deshake_ctx->abs_motion.data_start_offset != -1) { + clip_start = deshake_ctx->abs_motion.data_start_offset; + } else { + // Negative indices will occur at the start of the video, and we want + // them to be clipped to 0 in order to repeatedly use the position of + // the first frame. + clip_start = 0; + } + + offset_clipped = av_clip( + offset, + clip_start, + clip_end + ); + + av_fifo_generic_peek_at( + values, + val, + offset_clipped * sizeof(float), + sizeof(float), + NULL + ); +} + +// Returns smoothed current frame value of the given buffer of floats based on the +// given Gaussian kernel and its length (also the window length, centered around the +// current frame) and the "maximum value" of the motion. +// +// This "maximum value" should be the width / height of the image in the case of +// translation and an empirically chosen constant for rotation / scale. +// +// The sigma chosen to generate the final gaussian kernel with used to smooth the +// camera path is either hardcoded (set by user, deshake_ctx->smooth_percent) or +// adaptively chosen. +static float smooth( + DeshakeOpenCLContext *deshake_ctx, + float *gauss_kernel, + int length, + float max_val, + AVFifoBuffer *values +) { + float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between, + percent_of_max, inverted_percent; + IterIndices indices = start_end_for(deshake_ctx, length); + float large_sigma = 40.0f; + float small_sigma = 2.0f; + float best_sigma; + + if (deshake_ctx->smooth_percent) { + best_sigma = (large_sigma - 0.5f) * deshake_ctx->smooth_percent + 0.5f; + } else { + // Strategy to adaptively smooth trajectory: + // + // 1. Smooth path with large and small sigma values + // 2. Take the absolute value of the difference between them + // 3. Get a percentage by putting the difference over the "max value" + // 4, Invert the percentage + // 5. Calculate a new sigma value weighted towards the larger sigma value + // 6. Determine final smoothed trajectory value using that sigma + + make_gauss_kernel(gauss_kernel, length, large_sigma); + for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) { + ringbuf_float_at(deshake_ctx, values, &old, i); + new_large_s += old * gauss_kernel[j]; + } + + make_gauss_kernel(gauss_kernel, length, small_sigma); + for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) { + ringbuf_float_at(deshake_ctx, values, &old, i); + new_small_s += old * gauss_kernel[j]; + } + + diff_between = fabsf(new_large_s - new_small_s); + percent_of_max = diff_between / max_val; + inverted_percent = 1 - percent_of_max; + best_sigma = large_sigma * powf(inverted_percent, 40); + } + + make_gauss_kernel(gauss_kernel, length, best_sigma); + for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) { + ringbuf_float_at(deshake_ctx, values, &old, i); + new_best += old * gauss_kernel[j]; + } + + return new_best; +} + +// Returns the position of the given point after the transform is applied +static cl_float2 transformed_point(float x, float y, float *transform) { + cl_float2 ret; + + ret.s[0] = x * transform[0] + y * transform[1] + transform[2]; + ret.s[1] = x * transform[3] + y * transform[4] + transform[5]; + + return ret; +} + +// Creates an affine transform that scales from the center of a frame +static void transform_center_scale( + float x_shift, + float y_shift, + float angle, + float scale_x, + float scale_y, + float center_w, + float center_h, + float *matrix +) { + cl_float2 center_s; + float center_s_w, center_s_h; + + ff_get_matrix( + 0, + 0, + 0, + scale_x, + scale_y, + matrix + ); + + center_s = transformed_point(center_w, center_h, matrix); + center_s_w = center_w - center_s.s[0]; + center_s_h = center_h - center_s.s[1]; + + ff_get_matrix( + x_shift + center_s_w, + y_shift + center_s_h, + angle, + scale_x, + scale_y, + matrix + ); +} + +// Determines the crop necessary to eliminate black borders from a smoothed frame +// and updates target crop accordingly +static void update_needed_crop( + CropInfo* crop, + float *transform, + float frame_width, + float frame_height +) { + float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y; + + cl_float2 top_left = transformed_point(0, 0, transform); + cl_float2 top_right = transformed_point(frame_width, 0, transform); + cl_float2 bottom_left = transformed_point(0, frame_height, transform); + cl_float2 bottom_right = transformed_point(frame_width, frame_height, transform); + float ar_h = frame_height / frame_width; + float ar_w = frame_width / frame_height; + + if (crop->bottom_right.s[0] == 0) { + // The crop hasn't been set to the original size of the plane + crop->bottom_right.s[0] = frame_width; + crop->bottom_right.s[1] = frame_height; + } + + crop->top_left.s[0] = FFMAX3( + crop->top_left.s[0], + top_left.s[0], + bottom_left.s[0] + ); + + crop->top_left.s[1] = FFMAX3( + crop->top_left.s[1], + top_left.s[1], + top_right.s[1] + ); + + crop->bottom_right.s[0] = FFMIN3( + crop->bottom_right.s[0], + bottom_right.s[0], + top_right.s[0] + ); + + crop->bottom_right.s[1] = FFMIN3( + crop->bottom_right.s[1], + bottom_right.s[1], + bottom_left.s[1] + ); + + // Make sure our potentially new bounding box has the same aspect ratio + new_height = crop->bottom_right.s[1] - crop->top_left.s[1]; + new_width = crop->bottom_right.s[0] - crop->top_left.s[0]; + + adjusted_width = new_height * ar_w; + adjusted_x = crop->bottom_right.s[0] - adjusted_width; + + if (adjusted_x >= crop->top_left.s[0]) { + crop->top_left.s[0] = adjusted_x; + } else { + adjusted_height = new_width * ar_h; + adjusted_y = crop->bottom_right.s[1] - adjusted_height; + crop->top_left.s[1] = adjusted_y; + } +} + +static av_cold void deshake_opencl_uninit(AVFilterContext *avctx) +{ + DeshakeOpenCLContext *ctx = avctx->priv; + cl_int cle; + + for (int i = 0; i < RingbufCount; i++) + av_fifo_freep(&ctx->abs_motion.ringbuffers[i]); + + if (ctx->debug_on) + free_debug_matches(&ctx->abs_motion); + + if (ctx->gauss_kernel) + av_freep(&ctx->gauss_kernel); + + if (ctx->ransac_err) + av_freep(&ctx->ransac_err); + + if (ctx->matches_host) + av_freep(&ctx->matches_host); + + if (ctx->matches_contig_host) + av_freep(&ctx->matches_contig_host); + + if (ctx->inliers) + av_freep(&ctx->inliers); + + ff_framequeue_free(&ctx->fq); + + CL_RELEASE_KERNEL(ctx->kernel_grayscale); + CL_RELEASE_KERNEL(ctx->kernel_harris_response); + CL_RELEASE_KERNEL(ctx->kernel_refine_features); + CL_RELEASE_KERNEL(ctx->kernel_brief_descriptors); + CL_RELEASE_KERNEL(ctx->kernel_match_descriptors); + CL_RELEASE_KERNEL(ctx->kernel_crop_upscale); + if (ctx->debug_on) + CL_RELEASE_KERNEL(ctx->kernel_draw_debug_info); + + CL_RELEASE_QUEUE(ctx->command_queue); + + if (!ctx->is_yuv) + CL_RELEASE_MEMORY(ctx->grayscale); + CL_RELEASE_MEMORY(ctx->harris_buf); + CL_RELEASE_MEMORY(ctx->refined_features); + CL_RELEASE_MEMORY(ctx->prev_refined_features); + CL_RELEASE_MEMORY(ctx->brief_pattern); + CL_RELEASE_MEMORY(ctx->descriptors); + CL_RELEASE_MEMORY(ctx->prev_descriptors); + CL_RELEASE_MEMORY(ctx->matches); + CL_RELEASE_MEMORY(ctx->matches_contig); + CL_RELEASE_MEMORY(ctx->transform_y); + CL_RELEASE_MEMORY(ctx->transform_uv); + if (ctx->debug_on) { + CL_RELEASE_MEMORY(ctx->debug_matches); + CL_RELEASE_MEMORY(ctx->debug_model_matches); + } + + ff_opencl_filter_uninit(avctx); +} + +static int deshake_opencl_init(AVFilterContext *avctx) +{ + DeshakeOpenCLContext *ctx = avctx->priv; + AVFilterLink *outlink = avctx->outputs[0]; + AVFilterLink *inlink = avctx->inputs[0]; + // Pointer to the host-side pattern buffer to be initialized and then copied + // to the GPU + PointPair *pattern_host; + cl_int cle; + int err; + cl_ulong8 zeroed_ulong8; + FFFrameQueueGlobal fqg; + cl_image_format grayscale_format; + cl_image_desc grayscale_desc; + cl_command_queue_properties queue_props; + + const enum AVPixelFormat disallowed_formats[14] = { + AV_PIX_FMT_GBRP, + AV_PIX_FMT_GBRP9BE, + AV_PIX_FMT_GBRP9LE, + AV_PIX_FMT_GBRP10BE, + AV_PIX_FMT_GBRP10LE, + AV_PIX_FMT_GBRP16BE, + AV_PIX_FMT_GBRP16LE, + AV_PIX_FMT_GBRAP, + AV_PIX_FMT_GBRAP16BE, + AV_PIX_FMT_GBRAP16LE, + AV_PIX_FMT_GBRAP12BE, + AV_PIX_FMT_GBRAP12LE, + AV_PIX_FMT_GBRAP10BE, + AV_PIX_FMT_GBRAP10LE + }; + + // Number of elements for an array + const int image_grid_32 = ROUNDED_UP_DIV(outlink->h, 32) * ROUNDED_UP_DIV(outlink->w, 32); + + const int descriptor_buf_size = image_grid_32 * (BREIFN / 8); + const int features_buf_size = image_grid_32 * sizeof(cl_float2); + + const AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hw_frames_ctx->sw_format); + + av_assert0(hw_frames_ctx); + av_assert0(desc); + + ff_framequeue_global_init(&fqg); + ff_framequeue_init(&ctx->fq, &fqg); + ctx->eof = false; + ctx->smooth_window = (int)(av_q2d(avctx->inputs[0]->frame_rate) * ctx->smooth_window_multiplier); + ctx->curr_frame = 0; + + memset(&zeroed_ulong8, 0, sizeof(cl_ulong8)); + + ctx->gauss_kernel = av_malloc_array(ctx->smooth_window, sizeof(float)); + if (!ctx->gauss_kernel) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->ransac_err = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(float)); + if (!ctx->ransac_err) { + err = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < RingbufCount; i++) { + ctx->abs_motion.ringbuffers[i] = av_fifo_alloc_array( + ctx->smooth_window, + sizeof(float) + ); + + if (!ctx->abs_motion.ringbuffers[i]) { + err = AVERROR(ENOMEM); + goto fail; + } + } + + if (ctx->debug_on) { + ctx->abs_motion.debug_matches = av_fifo_alloc_array( + ctx->smooth_window / 2, + sizeof(DebugMatches) + ); + + if (!ctx->abs_motion.debug_matches) { + err = AVERROR(ENOMEM); + goto fail; + } + } + + ctx->abs_motion.curr_frame_offset = 0; + ctx->abs_motion.data_start_offset = -1; + ctx->abs_motion.data_end_offset = -1; + + pattern_host = av_malloc_array(BREIFN, sizeof(PointPair)); + if (!pattern_host) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->matches_host = av_malloc_array(image_grid_32, sizeof(MotionVector)); + if (!ctx->matches_host) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->matches_contig_host = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector)); + if (!ctx->matches_contig_host) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->inliers = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector)); + if (!ctx->inliers) { + err = AVERROR(ENOMEM); + goto fail; + } + + // Initializing the patch pattern for building BREIF descriptors with + av_lfg_init(&ctx->alfg, 234342424); + for (int i = 0; i < BREIFN; ++i) { + PointPair pair; + + for (int j = 0; j < 2; ++j) { + pair.p1.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg); + pair.p2.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg); + } + + pattern_host[i] = pair; + } + + for (int i = 0; i < 14; i++) { + if (ctx->sw_format == disallowed_formats[i]) { + av_log(avctx, AV_LOG_ERROR, "unsupported format in deshake_opencl.\n"); + err = AVERROR(ENOSYS); + goto fail; + } + } + + if (desc->flags & AV_PIX_FMT_FLAG_RGB) { + ctx->is_yuv = false; + } else { + ctx->is_yuv = true; + } + ctx->sw_format = hw_frames_ctx->sw_format; + + err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_deshake, 1); + if (err < 0) + goto fail; + + if (ctx->debug_on) { + queue_props = CL_QUEUE_PROFILING_ENABLE; + } else { + queue_props = 0; + } + ctx->command_queue = clCreateCommandQueue( + ctx->ocf.hwctx->context, + ctx->ocf.hwctx->device_id, + queue_props, + &cle + ); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL command queue %d.\n", cle); + + CL_CREATE_KERNEL(ctx, grayscale); + CL_CREATE_KERNEL(ctx, harris_response); + CL_CREATE_KERNEL(ctx, refine_features); + CL_CREATE_KERNEL(ctx, brief_descriptors); + CL_CREATE_KERNEL(ctx, match_descriptors); + CL_CREATE_KERNEL(ctx, transform); + CL_CREATE_KERNEL(ctx, crop_upscale); + if (ctx->debug_on) + CL_CREATE_KERNEL(ctx, draw_debug_info); + + if (!ctx->is_yuv) { + grayscale_format.image_channel_order = CL_R; + grayscale_format.image_channel_data_type = CL_FLOAT; + + grayscale_desc = (cl_image_desc) { + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = outlink->w, + .image_height = outlink->h, + .image_depth = 0, + .image_array_size = 0, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + .buffer = NULL, + }; + + ctx->grayscale = clCreateImage( + ctx->ocf.hwctx->context, + 0, + &grayscale_format, + &grayscale_desc, + NULL, + &cle + ); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create grayscale image: %d.\n", cle); + } + + CL_CREATE_BUFFER(ctx, harris_buf, outlink->h * outlink->w * sizeof(float)); + CL_CREATE_BUFFER(ctx, refined_features, features_buf_size); + CL_CREATE_BUFFER(ctx, prev_refined_features, features_buf_size); + CL_CREATE_BUFFER_FLAGS( + ctx, + brief_pattern, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + BREIFN * sizeof(PointPair), + pattern_host + ); + CL_CREATE_BUFFER(ctx, descriptors, descriptor_buf_size); + CL_CREATE_BUFFER(ctx, prev_descriptors, descriptor_buf_size); + CL_CREATE_BUFFER(ctx, matches, image_grid_32 * sizeof(MotionVector)); + CL_CREATE_BUFFER(ctx, matches_contig, MATCHES_CONTIG_SIZE * sizeof(MotionVector)); + CL_CREATE_BUFFER(ctx, transform_y, 9 * sizeof(float)); + CL_CREATE_BUFFER(ctx, transform_uv, 9 * sizeof(float)); + if (ctx->debug_on) { + CL_CREATE_BUFFER(ctx, debug_matches, MATCHES_CONTIG_SIZE * sizeof(MotionVector)); + CL_CREATE_BUFFER(ctx, debug_model_matches, 3 * sizeof(MotionVector)); + } + + ctx->initialized = 1; + av_freep(&pattern_host); + + return 0; + +fail: + if (!pattern_host) + av_freep(&pattern_host); + return err; +} + +// Logs debug information about the transform data +static void transform_debug(AVFilterContext *avctx, float *new_vals, float *old_vals, int curr_frame) { + av_log(avctx, AV_LOG_VERBOSE, + "Frame %d:\n" + "\tframe moved from: %f x, %f y\n" + "\t to: %f x, %f y\n" + "\t rotated from: %f degrees\n" + "\t to: %f degrees\n" + "\t scaled from: %f x, %f y\n" + "\t to: %f x, %f y\n" + "\n" + "\tframe moved by: %f x, %f y\n" + "\t rotated by: %f degrees\n" + "\t scaled by: %f x, %f y\n", + curr_frame, + old_vals[RingbufX], old_vals[RingbufY], + new_vals[RingbufX], new_vals[RingbufY], + old_vals[RingbufRot] * (180.0 / M_PI), + new_vals[RingbufRot] * (180.0 / M_PI), + old_vals[RingbufScaleX], old_vals[RingbufScaleY], + new_vals[RingbufScaleX], new_vals[RingbufScaleY], + old_vals[RingbufX] - new_vals[RingbufX], old_vals[RingbufY] - new_vals[RingbufY], + old_vals[RingbufRot] * (180.0 / M_PI) - new_vals[RingbufRot] * (180.0 / M_PI), + new_vals[RingbufScaleX] / old_vals[RingbufScaleX], new_vals[RingbufScaleY] / old_vals[RingbufScaleY] + ); +} + +// Uses the buffered motion information to determine a transform that smooths the +// given frame and applies it +static int filter_frame(AVFilterLink *link, AVFrame *input_frame) +{ + AVFilterContext *avctx = link->dst; + AVFilterLink *outlink = avctx->outputs[0]; + DeshakeOpenCLContext *deshake_ctx = avctx->priv; + AVFrame *cropped_frame = NULL, *transformed_frame = NULL; + int err; + cl_int cle; + float new_vals[RingbufCount]; + float old_vals[RingbufCount]; + // Luma (in the case of YUV) transform, or just the transform in the case of RGB + float transform_y[9]; + // Chroma transform + float transform_uv[9]; + // Luma crop transform (or RGB) + float transform_crop_y[9]; + // Chroma crop transform + float transform_crop_uv[9]; + float transform_debug_rgb[9]; + size_t global_work[2]; + int64_t duration; + cl_mem src, transformed, dst; + cl_mem transforms[3]; + CropInfo crops[3]; + cl_event transform_event, crop_upscale_event; + DebugMatches debug_matches; + cl_int num_model_matches; + + const float center_w = (float)input_frame->width / 2; + const float center_h = (float)input_frame->height / 2; + + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(deshake_ctx->sw_format); + const int chroma_width = AV_CEIL_RSHIFT(input_frame->width, desc->log2_chroma_w); + const int chroma_height = AV_CEIL_RSHIFT(input_frame->height, desc->log2_chroma_h); + + const float center_w_chroma = (float)chroma_width / 2; + const float center_h_chroma = (float)chroma_height / 2; + + const float luma_w_over_chroma_w = ((float)input_frame->width / (float)chroma_width); + const float luma_h_over_chroma_h = ((float)input_frame->height / (float)chroma_height); + + if (deshake_ctx->debug_on) { + av_fifo_generic_read( + deshake_ctx->abs_motion.debug_matches, + &debug_matches, + sizeof(DebugMatches), + NULL + ); + } + + if (input_frame->pkt_duration) { + duration = input_frame->pkt_duration; + } else { + duration = av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base); + } + deshake_ctx->duration = input_frame->pts + duration; + + // Get the absolute transform data for this frame + for (int i = 0; i < RingbufCount; i++) { + av_fifo_generic_peek_at( + deshake_ctx->abs_motion.ringbuffers[i], + &old_vals[i], + deshake_ctx->abs_motion.curr_frame_offset * sizeof(float), + sizeof(float), + NULL + ); + } + + if (deshake_ctx->tripod_mode) { + // If tripod mode is turned on we simply undo all motion relative to the + // first frame + + new_vals[RingbufX] = 0.0f; + new_vals[RingbufY] = 0.0f; + new_vals[RingbufRot] = 0.0f; + new_vals[RingbufScaleX] = 1.0f; + new_vals[RingbufScaleY] = 1.0f; + } else { + // Tripod mode is off and we need to smooth a moving camera + + new_vals[RingbufX] = smooth( + deshake_ctx, + deshake_ctx->gauss_kernel, + deshake_ctx->smooth_window, + input_frame->width, + deshake_ctx->abs_motion.ringbuffers[RingbufX] + ); + new_vals[RingbufY] = smooth( + deshake_ctx, + deshake_ctx->gauss_kernel, + deshake_ctx->smooth_window, + input_frame->height, + deshake_ctx->abs_motion.ringbuffers[RingbufY] + ); + new_vals[RingbufRot] = smooth( + deshake_ctx, + deshake_ctx->gauss_kernel, + deshake_ctx->smooth_window, + M_PI / 4, + deshake_ctx->abs_motion.ringbuffers[RingbufRot] + ); + new_vals[RingbufScaleX] = smooth( + deshake_ctx, + deshake_ctx->gauss_kernel, + deshake_ctx->smooth_window, + 2.0f, + deshake_ctx->abs_motion.ringbuffers[RingbufScaleX] + ); + new_vals[RingbufScaleY] = smooth( + deshake_ctx, + deshake_ctx->gauss_kernel, + deshake_ctx->smooth_window, + 2.0f, + deshake_ctx->abs_motion.ringbuffers[RingbufScaleY] + ); + } + + transform_center_scale( + old_vals[RingbufX] - new_vals[RingbufX], + old_vals[RingbufY] - new_vals[RingbufY], + old_vals[RingbufRot] - new_vals[RingbufRot], + new_vals[RingbufScaleX] / old_vals[RingbufScaleX], + new_vals[RingbufScaleY] / old_vals[RingbufScaleY], + center_w, + center_h, + transform_y + ); + + transform_center_scale( + (old_vals[RingbufX] - new_vals[RingbufX]) / luma_w_over_chroma_w, + (old_vals[RingbufY] - new_vals[RingbufY]) / luma_h_over_chroma_h, + old_vals[RingbufRot] - new_vals[RingbufRot], + new_vals[RingbufScaleX] / old_vals[RingbufScaleX], + new_vals[RingbufScaleY] / old_vals[RingbufScaleY], + center_w_chroma, + center_h_chroma, + transform_uv + ); + + CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_y, NULL); + CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_uv, 9 * sizeof(float), transform_uv, NULL); + + if (deshake_ctx->debug_on) + transform_debug(avctx, new_vals, old_vals, deshake_ctx->curr_frame); + + cropped_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!cropped_frame) { + err = AVERROR(ENOMEM); + goto fail; + } + + transformed_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!transformed_frame) { + err = AVERROR(ENOMEM); + goto fail; + } + + transforms[0] = deshake_ctx->transform_y; + transforms[1] = transforms[2] = deshake_ctx->transform_uv; + + for (int p = 0; p < FF_ARRAY_ELEMS(transformed_frame->data); p++) { + // Transform all of the planes appropriately + src = (cl_mem)input_frame->data[p]; + transformed = (cl_mem)transformed_frame->data[p]; + + if (!transformed) + break; + + err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0); + if (err < 0) + goto fail; + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_transform, + global_work, + NULL, + &transform_event, + { sizeof(cl_mem), &src }, + { sizeof(cl_mem), &transformed }, + { sizeof(cl_mem), &transforms[p] }, + ); + } + + if (deshake_ctx->debug_on && !deshake_ctx->is_yuv && debug_matches.num_matches > 0) { + CL_BLOCKING_WRITE_BUFFER( + deshake_ctx->command_queue, + deshake_ctx->debug_matches, + debug_matches.num_matches * sizeof(MotionVector), + debug_matches.matches, + NULL + ); + + CL_BLOCKING_WRITE_BUFFER( + deshake_ctx->command_queue, + deshake_ctx->debug_model_matches, + debug_matches.num_model_matches * sizeof(MotionVector), + debug_matches.model_matches, + NULL + ); + + num_model_matches = debug_matches.num_model_matches; + + // Invert the transform + transform_center_scale( + new_vals[RingbufX] - old_vals[RingbufX], + new_vals[RingbufY] - old_vals[RingbufY], + new_vals[RingbufRot] - old_vals[RingbufRot], + old_vals[RingbufScaleX] / new_vals[RingbufScaleX], + old_vals[RingbufScaleY] / new_vals[RingbufScaleY], + center_w, + center_h, + transform_debug_rgb + ); + + CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_debug_rgb, NULL); + + transformed = (cl_mem)transformed_frame->data[0]; + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_draw_debug_info, + (size_t[]){ debug_matches.num_matches }, + NULL, + NULL, + { sizeof(cl_mem), &transformed }, + { sizeof(cl_mem), &deshake_ctx->debug_matches }, + { sizeof(cl_mem), &deshake_ctx->debug_model_matches }, + { sizeof(cl_int), &num_model_matches }, + { sizeof(cl_mem), &deshake_ctx->transform_y } + ); + } + + if (deshake_ctx->should_crop) { + // Generate transforms for cropping + transform_center_scale( + (old_vals[RingbufX] - new_vals[RingbufX]) / 5, + (old_vals[RingbufY] - new_vals[RingbufY]) / 5, + (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5, + new_vals[RingbufScaleX] / old_vals[RingbufScaleX], + new_vals[RingbufScaleY] / old_vals[RingbufScaleY], + center_w, + center_h, + transform_crop_y + ); + update_needed_crop(&deshake_ctx->crop_y, transform_crop_y, input_frame->width, input_frame->height); + + transform_center_scale( + (old_vals[RingbufX] - new_vals[RingbufX]) / (5 * luma_w_over_chroma_w), + (old_vals[RingbufY] - new_vals[RingbufY]) / (5 * luma_h_over_chroma_h), + (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5, + new_vals[RingbufScaleX] / old_vals[RingbufScaleX], + new_vals[RingbufScaleY] / old_vals[RingbufScaleY], + center_w_chroma, + center_h_chroma, + transform_crop_uv + ); + update_needed_crop(&deshake_ctx->crop_uv, transform_crop_uv, chroma_width, chroma_height); + + crops[0] = deshake_ctx->crop_y; + crops[1] = crops[2] = deshake_ctx->crop_uv; + + for (int p = 0; p < FF_ARRAY_ELEMS(cropped_frame->data); p++) { + // Crop all of the planes appropriately + dst = (cl_mem)cropped_frame->data[p]; + transformed = (cl_mem)transformed_frame->data[p]; + + if (!dst) + break; + + err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0); + if (err < 0) + goto fail; + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_crop_upscale, + global_work, + NULL, + &crop_upscale_event, + { sizeof(cl_mem), &transformed }, + { sizeof(cl_mem), &dst }, + { sizeof(cl_float2), &crops[p].top_left }, + { sizeof(cl_float2), &crops[p].bottom_right }, + ); + } + } + + if (deshake_ctx->curr_frame < deshake_ctx->smooth_window / 2) { + // This means we are somewhere at the start of the video. We need to + // increment the current frame offset until it reaches the center of + // the ringbuffers (as the current frame will be located there for + // the rest of the video). + // + // The end of the video is taken care of by draining motion data + // one-by-one out of the buffer, causing the (at that point fixed) + // offset to move towards later frames' data. + ++deshake_ctx->abs_motion.curr_frame_offset; + } + + if (deshake_ctx->abs_motion.data_end_offset != -1) { + // Keep the end offset in sync with the frame it's supposed to be + // positioned at + --deshake_ctx->abs_motion.data_end_offset; + + if (deshake_ctx->abs_motion.data_end_offset == deshake_ctx->abs_motion.curr_frame_offset - 1) { + // The end offset would be the start of the new video sequence; flip to + // start offset + deshake_ctx->abs_motion.data_end_offset = -1; + deshake_ctx->abs_motion.data_start_offset = deshake_ctx->abs_motion.curr_frame_offset; + } + } else if (deshake_ctx->abs_motion.data_start_offset != -1) { + // Keep the start offset in sync with the frame it's supposed to be + // positioned at + --deshake_ctx->abs_motion.data_start_offset; + } + + if (deshake_ctx->debug_on) { + deshake_ctx->transform_time += ff_opencl_get_event_time(transform_event); + if (deshake_ctx->should_crop) { + deshake_ctx->crop_upscale_time += ff_opencl_get_event_time(crop_upscale_event); + } + } + + ++deshake_ctx->curr_frame; + + if (deshake_ctx->debug_on) + av_freep(&debug_matches.matches); + + if (deshake_ctx->should_crop) { + err = av_frame_copy_props(cropped_frame, input_frame); + if (err < 0) + goto fail; + + av_frame_free(&transformed_frame); + av_frame_free(&input_frame); + return ff_filter_frame(outlink, cropped_frame); + + } else { + err = av_frame_copy_props(transformed_frame, input_frame); + if (err < 0) + goto fail; + + av_frame_free(&cropped_frame); + av_frame_free(&input_frame); + return ff_filter_frame(outlink, transformed_frame); + } + +fail: + clFinish(deshake_ctx->command_queue); + + if (deshake_ctx->debug_on) + if (debug_matches.matches) + av_freep(&debug_matches.matches); + + av_frame_free(&input_frame); + av_frame_free(&transformed_frame); + av_frame_free(&cropped_frame); + return err; +} + +// Add the given frame to the frame queue to eventually be processed. +// +// Also determines the motion from the previous frame and updates the stored +// motion information accordingly. +static int queue_frame(AVFilterLink *link, AVFrame *input_frame) +{ + AVFilterContext *avctx = link->dst; + DeshakeOpenCLContext *deshake_ctx = avctx->priv; + int err; + int num_vectors; + int num_inliers = 0; + cl_int cle; + FrameDelta relative; + SimilarityMatrix model; + size_t global_work[2]; + size_t harris_global_work[2]; + size_t grid_32_global_work[2]; + int grid_32_h, grid_32_w; + size_t local_work[2]; + cl_mem src, temp; + float prev_vals[5]; + float new_vals[5]; + cl_event grayscale_event, harris_response_event, refine_features_event, + brief_event, match_descriptors_event, read_buf_event; + DebugMatches debug_matches; + + num_vectors = 0; + + local_work[0] = 8; + local_work[1] = 8; + + err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, 0, 0); + if (err < 0) + goto fail; + + err = ff_opencl_filter_work_size_from_image(avctx, harris_global_work, input_frame, 0, 8); + if (err < 0) + goto fail; + + err = ff_opencl_filter_work_size_from_image(avctx, grid_32_global_work, input_frame, 0, 32); + if (err < 0) + goto fail; + + // We want a single work-item for each 32x32 block of pixels in the input frame + grid_32_global_work[0] /= 32; + grid_32_global_work[1] /= 32; + + grid_32_h = ROUNDED_UP_DIV(input_frame->height, 32); + grid_32_w = ROUNDED_UP_DIV(input_frame->width, 32); + + if (deshake_ctx->is_yuv) { + deshake_ctx->grayscale = (cl_mem)input_frame->data[0]; + } else { + src = (cl_mem)input_frame->data[0]; + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_grayscale, + global_work, + NULL, + &grayscale_event, + { sizeof(cl_mem), &src }, + { sizeof(cl_mem), &deshake_ctx->grayscale } + ); + } + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_harris_response, + harris_global_work, + local_work, + &harris_response_event, + { sizeof(cl_mem), &deshake_ctx->grayscale }, + { sizeof(cl_mem), &deshake_ctx->harris_buf } + ); + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_refine_features, + grid_32_global_work, + NULL, + &refine_features_event, + { sizeof(cl_mem), &deshake_ctx->grayscale }, + { sizeof(cl_mem), &deshake_ctx->harris_buf }, + { sizeof(cl_mem), &deshake_ctx->refined_features }, + { sizeof(cl_int), &deshake_ctx->refine_features } + ); + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_brief_descriptors, + grid_32_global_work, + NULL, + &brief_event, + { sizeof(cl_mem), &deshake_ctx->grayscale }, + { sizeof(cl_mem), &deshake_ctx->refined_features }, + { sizeof(cl_mem), &deshake_ctx->descriptors }, + { sizeof(cl_mem), &deshake_ctx->brief_pattern} + ); + + if (av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) { + // This is the first frame we've been given to queue, meaning there is + // no previous frame to match descriptors to + + goto no_motion_data; + } + + CL_RUN_KERNEL_WITH_ARGS( + deshake_ctx->command_queue, + deshake_ctx->kernel_match_descriptors, + grid_32_global_work, + NULL, + &match_descriptors_event, + { sizeof(cl_mem), &deshake_ctx->prev_refined_features }, + { sizeof(cl_mem), &deshake_ctx->refined_features }, + { sizeof(cl_mem), &deshake_ctx->descriptors }, + { sizeof(cl_mem), &deshake_ctx->prev_descriptors }, + { sizeof(cl_mem), &deshake_ctx->matches } + ); + + cle = clEnqueueReadBuffer( + deshake_ctx->command_queue, + deshake_ctx->matches, + CL_TRUE, + 0, + grid_32_h * grid_32_w * sizeof(MotionVector), + deshake_ctx->matches_host, + 0, + NULL, + &read_buf_event + ); + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read matches to host: %d.\n", cle); + + num_vectors = make_vectors_contig(deshake_ctx, grid_32_h, grid_32_w); + + if (num_vectors < 10) { + // Not enough matches to get reliable motion data for this frame + // + // From this point on all data is relative to this frame rather than the + // original frame. We have to make sure that we don't mix values that were + // relative to the original frame with the new values relative to this + // frame when doing the gaussian smoothing. We keep track of where the old + // values end using this data_end_offset field in order to accomplish + // that goal. + // + // If no motion data is present for multiple frames in a short window of + // time, we leave the end where it was to avoid mixing 0s in with the + // old data (and just treat them all as part of the new values) + if (deshake_ctx->abs_motion.data_end_offset == -1) { + deshake_ctx->abs_motion.data_end_offset = + av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) / sizeof(float) - 1; + } + + goto no_motion_data; + } + + if (!estimate_affine_2d( + deshake_ctx, + deshake_ctx->matches_contig_host, + &debug_matches, + num_vectors, + model.matrix, + 10.0, + 3000, + 0.999999999999 + )) { + goto no_motion_data; + } + + for (int i = 0; i < num_vectors; i++) { + if (deshake_ctx->matches_contig_host[i].should_consider) { + deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[i]; + num_inliers++; + } + } + + if (!minimize_error( + deshake_ctx, + deshake_ctx->inliers, + &debug_matches, + num_inliers, + model.matrix, + 400 + )) { + goto no_motion_data; + } + + + relative = decompose_transform(model.matrix); + + // Get the absolute transform data for the previous frame + for (int i = 0; i < RingbufCount; i++) { + av_fifo_generic_peek_at( + deshake_ctx->abs_motion.ringbuffers[i], + &prev_vals[i], + av_fifo_size(deshake_ctx->abs_motion.ringbuffers[i]) - sizeof(float), + sizeof(float), + NULL + ); + } + + new_vals[RingbufX] = prev_vals[RingbufX] + relative.translation.s[0]; + new_vals[RingbufY] = prev_vals[RingbufY] + relative.translation.s[1]; + new_vals[RingbufRot] = prev_vals[RingbufRot] + relative.rotation; + new_vals[RingbufScaleX] = prev_vals[RingbufScaleX] / relative.scale.s[0]; + new_vals[RingbufScaleY] = prev_vals[RingbufScaleY] / relative.scale.s[1]; + + if (deshake_ctx->debug_on) { + if (!deshake_ctx->is_yuv) { + deshake_ctx->grayscale_time += ff_opencl_get_event_time(grayscale_event); + } + deshake_ctx->harris_response_time += ff_opencl_get_event_time(harris_response_event); + deshake_ctx->refine_features_time += ff_opencl_get_event_time(refine_features_event); + deshake_ctx->brief_descriptors_time += ff_opencl_get_event_time(brief_event); + deshake_ctx->match_descriptors_time += ff_opencl_get_event_time(match_descriptors_event); + deshake_ctx->read_buf_time += ff_opencl_get_event_time(read_buf_event); + } + + goto end; + +no_motion_data: + new_vals[RingbufX] = 0.0f; + new_vals[RingbufY] = 0.0f; + new_vals[RingbufRot] = 0.0f; + new_vals[RingbufScaleX] = 1.0f; + new_vals[RingbufScaleY] = 1.0f; + + for (int i = 0; i < num_vectors; i++) { + deshake_ctx->matches_contig_host[i].should_consider = false; + } + debug_matches.num_model_matches = 0; + + if (deshake_ctx->debug_on) { + av_log(avctx, AV_LOG_VERBOSE, + "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n" + ); + } + + goto end; + +end: + // Swap the descriptor buffers (we don't need the previous frame's descriptors + // again so we will use that space for the next frame's descriptors) + temp = deshake_ctx->prev_descriptors; + deshake_ctx->prev_descriptors = deshake_ctx->descriptors; + deshake_ctx->descriptors = temp; + + // Same for the refined features + temp = deshake_ctx->prev_refined_features; + deshake_ctx->prev_refined_features = deshake_ctx->refined_features; + deshake_ctx->refined_features = temp; + + if (deshake_ctx->debug_on) { + if (num_vectors == 0) { + debug_matches.matches = NULL; + } else { + debug_matches.matches = av_malloc_array(num_vectors, sizeof(MotionVector)); + + if (!debug_matches.matches) { + err = AVERROR(ENOMEM); + goto fail; + } + } + + for (int i = 0; i < num_vectors; i++) { + debug_matches.matches[i] = deshake_ctx->matches_contig_host[i]; + } + debug_matches.num_matches = num_vectors; + + av_fifo_generic_write( + deshake_ctx->abs_motion.debug_matches, + &debug_matches, + sizeof(DebugMatches), + NULL + ); + } + + for (int i = 0; i < RingbufCount; i++) { + av_fifo_generic_write( + deshake_ctx->abs_motion.ringbuffers[i], + &new_vals[i], + sizeof(float), + NULL + ); + } + + return ff_framequeue_add(&deshake_ctx->fq, input_frame); + +fail: + clFinish(deshake_ctx->command_queue); + av_frame_free(&input_frame); + return err; +} + +static int activate(AVFilterContext *ctx) +{ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + DeshakeOpenCLContext *deshake_ctx = ctx->priv; + AVFrame *frame = NULL; + int ret, status; + int64_t pts; + + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); + + if (!deshake_ctx->eof) { + ret = ff_inlink_consume_frame(inlink, &frame); + if (ret < 0) + return ret; + if (ret > 0) { + if (!frame->hw_frames_ctx) + return AVERROR(EINVAL); + + if (!deshake_ctx->initialized) { + ret = deshake_opencl_init(ctx); + if (ret < 0) + return ret; + } + + // If there is no more space in the ringbuffers, remove the oldest + // values to make room for the new ones + if (av_fifo_space(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) { + for (int i = 0; i < RingbufCount; i++) { + av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float)); + } + } + ret = queue_frame(inlink, frame); + if (ret < 0) + return ret; + if (ret >= 0) { + // See if we have enough buffered frames to process one + // + // "enough" is half the smooth window of queued frames into the future + if (ff_framequeue_queued_frames(&deshake_ctx->fq) >= deshake_ctx->smooth_window / 2) { + return filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq)); + } + } + } + } + + if (!deshake_ctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (status == AVERROR_EOF) { + deshake_ctx->eof = true; + } + } + + if (deshake_ctx->eof) { + // Finish processing the rest of the frames in the queue. + while(ff_framequeue_queued_frames(&deshake_ctx->fq) != 0) { + for (int i = 0; i < RingbufCount; i++) { + av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float)); + } + + ret = filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq)); + if (ret < 0) { + return ret; + } + } + + if (deshake_ctx->debug_on) { + av_log(ctx, AV_LOG_VERBOSE, + "Average kernel execution times:\n" + "\t grayscale: %0.3f ms\n" + "\t harris_response: %0.3f ms\n" + "\t refine_features: %0.3f ms\n" + "\tbrief_descriptors: %0.3f ms\n" + "\tmatch_descriptors: %0.3f ms\n" + "\t transform: %0.3f ms\n" + "\t crop_upscale: %0.3f ms\n" + "Average buffer read times:\n" + "\t features buf: %0.3f ms\n", + averaged_event_time_ms(deshake_ctx->grayscale_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->harris_response_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->refine_features_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->brief_descriptors_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->match_descriptors_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->transform_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->crop_upscale_time, deshake_ctx->curr_frame), + averaged_event_time_ms(deshake_ctx->read_buf_time, deshake_ctx->curr_frame) + ); + } + + ff_outlink_set_status(outlink, AVERROR_EOF, deshake_ctx->duration); + return 0; + } + + if (!deshake_ctx->eof) { + FF_FILTER_FORWARD_WANTED(outlink, inlink); + } + + return FFERROR_NOT_READY; +} + +static const AVFilterPad deshake_opencl_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = &ff_opencl_filter_config_input, + }, + { NULL } +}; + +static const AVFilterPad deshake_opencl_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = &ff_opencl_filter_config_output, + }, + { NULL } +}; + +#define OFFSET(x) offsetof(DeshakeOpenCLContext, x) +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM + +static const AVOption deshake_opencl_options[] = { + { + "tripod", "simulates a tripod by preventing any camera movement whatsoever " + "from the original frame", + OFFSET(tripod_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS + }, + { + "debug", "turn on additional debugging information", + OFFSET(debug_on), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS + }, + { + "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content", + OFFSET(should_crop), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS + }, + { + "refine_features", "refine feature point locations at a sub-pixel level", + OFFSET(refine_features), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS + }, + { + "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)", + OFFSET(smooth_percent), AV_OPT_TYPE_FLOAT, {.dbl = 0.0f}, 0.0f, 1.0f, FLAGS + }, + { + "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data", + OFFSET(smooth_window_multiplier), AV_OPT_TYPE_FLOAT, {.dbl = 2.0}, 0.1, 10.0, FLAGS + }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(deshake_opencl); + +AVFilter ff_vf_deshake_opencl = { + .name = "deshake_opencl", + .description = NULL_IF_CONFIG_SMALL("Feature-point based video stabilization filter"), + .priv_size = sizeof(DeshakeOpenCLContext), + .priv_class = &deshake_opencl_class, + .init = &ff_opencl_filter_init, + .uninit = &deshake_opencl_uninit, + .query_formats = &ff_opencl_filter_query_formats, + .activate = activate, + .inputs = deshake_opencl_inputs, + .outputs = deshake_opencl_outputs, + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE +};