diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c index 590ddd586c..73b42c32b1 100644 --- a/libavfilter/dnn/dnn_backend_openvino.c +++ b/libavfilter/dnn/dnn_backend_openvino.c @@ -253,9 +253,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request) ov_shape_free(&input_shape); return ov2_map_error(status, NULL); } - input.height = dims[1]; - input.width = dims[2]; - input.channels = dims[3]; + for (int i = 0; i < input_shape.rank; i++) + input.dims[i] = dims[i]; + input.layout = DL_NHWC; input.dt = precision_to_datatype(precision); #else status = ie_infer_request_get_blob(request->infer_request, task->input_name, &input_blob); @@ -278,9 +278,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request) av_log(ctx, AV_LOG_ERROR, "Failed to get input blob buffer\n"); return DNN_GENERIC_ERROR; } - input.height = dims.dims[2]; - input.width = dims.dims[3]; - input.channels = dims.dims[1]; + for (int i = 0; i < input_shape.rank; i++) + input.dims[i] = dims[i]; + input.layout = DL_NCHW; input.data = blob_buffer.buffer; input.dt = precision_to_datatype(precision); #endif @@ -339,8 +339,8 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request) av_assert0(!"should not reach here"); break; } - input.data = (uint8_t *)input.data - + input.width * input.height * input.channels * get_datatype_size(input.dt); + input.data = (uint8_t *)input.data + + input.dims[1] * input.dims[2] * input.dims[3] * get_datatype_size(input.dt); } #if HAVE_OPENVINO2 ov_tensor_free(tensor); @@ -403,10 +403,11 @@ static void infer_completion_callback(void *args) goto end; } outputs[i].dt = precision_to_datatype(precision); - - outputs[i].channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1; - outputs[i].height = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1; - outputs[i].width = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1; + outputs[i].layout = DL_NCHW; + outputs[i].dims[0] = 1; + outputs[i].dims[1] = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1; + outputs[i].dims[2] = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1; + outputs[i].dims[3] = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1; av_assert0(request->lltask_count <= dims[0]); outputs[i].layout = ctx->options.layout; outputs[i].scale = ctx->options.scale; @@ -445,9 +446,9 @@ static void infer_completion_callback(void *args) return; } output.data = blob_buffer.buffer; - output.channels = dims.dims[1]; - output.height = dims.dims[2]; - output.width = dims.dims[3]; + output.layout = DL_NCHW; + for (int i = 0; i < 4; i++) + output.dims[i] = dims.dims[i]; av_assert0(request->lltask_count <= dims.dims[0]); output.dt = precision_to_datatype(precision); output.layout = ctx->options.layout; @@ -469,8 +470,10 @@ static void infer_completion_callback(void *args) ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx); } } else { - task->out_frame->width = outputs[0].width; - task->out_frame->height = outputs[0].height; + task->out_frame->width = + outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)]; + task->out_frame->height = + outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)]; } break; case DFT_ANALYTICS_DETECT: @@ -501,7 +504,8 @@ static void infer_completion_callback(void *args) av_freep(&request->lltasks[i]); for (int i = 0; i < ov_model->nb_outputs; i++) outputs[i].data = (uint8_t *)outputs[i].data + - outputs[i].width * outputs[i].height * outputs[i].channels * get_datatype_size(outputs[i].dt); + outputs[i].dims[1] * outputs[i].dims[2] * outputs[i].dims[3] * + get_datatype_size(outputs[i].dt); } end: #if HAVE_OPENVINO2 @@ -1085,7 +1089,6 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name) #if HAVE_OPENVINO2 ov_shape_t input_shape = {0}; ov_element_type_e precision; - int64_t* dims; ov_status_e status; if (input_name) status = ov_model_const_input_by_name(ov_model->ov_model, input_name, &ov_model->input_port); @@ -1105,16 +1108,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name) av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n"); return ov2_map_error(status, NULL); } - dims = input_shape.dims; - if (dims[1] <= 3) { // NCHW - input->channels = dims[1]; - input->height = input_resizable ? -1 : dims[2]; - input->width = input_resizable ? -1 : dims[3]; - } else { // NHWC - input->height = input_resizable ? -1 : dims[1]; - input->width = input_resizable ? -1 : dims[2]; - input->channels = dims[3]; + for (int i = 0; i < 4; i++) + input->dims[i] = input_shape.dims[i]; + if (input_resizable) { + input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1; + input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1; } + + if (input_shape.dims[1] <= 3) // NCHW + input->layout = DL_NCHW; + else // NHWC + input->layout = DL_NHWC; + input->dt = precision_to_datatype(precision); ov_shape_free(&input_shape); return 0; @@ -1144,15 +1149,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name) return DNN_GENERIC_ERROR; } - if (dims[1] <= 3) { // NCHW - input->channels = dims[1]; - input->height = input_resizable ? -1 : dims[2]; - input->width = input_resizable ? -1 : dims[3]; - } else { // NHWC - input->height = input_resizable ? -1 : dims[1]; - input->width = input_resizable ? -1 : dims[2]; - input->channels = dims[3]; + for (int i = 0; i < 4; i++) + input->dims[i] = input_shape.dims[i]; + if (input_resizable) { + input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1; + input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1; } + + if (input_shape.dims[1] <= 3) // NCHW + input->layout = DL_NCHW; + else // NHWC + input->layout = DL_NHWC; + input->dt = precision_to_datatype(precision); return 0; } diff --git a/libavfilter/dnn/dnn_backend_tf.c b/libavfilter/dnn/dnn_backend_tf.c index 25046b58d9..27c5178bb5 100644 --- a/libavfilter/dnn/dnn_backend_tf.c +++ b/libavfilter/dnn/dnn_backend_tf.c @@ -251,7 +251,12 @@ static TF_Tensor *allocate_input_tensor(const DNNData *input) { TF_DataType dt; size_t size; - int64_t input_dims[] = {1, input->height, input->width, input->channels}; + int64_t input_dims[4] = { 0 }; + + input_dims[0] = 1; + input_dims[1] = input->dims[dnn_get_height_idx_by_layout(input->layout)]; + input_dims[2] = input->dims[dnn_get_width_idx_by_layout(input->layout)]; + input_dims[3] = input->dims[dnn_get_channel_idx_by_layout(input->layout)]; switch (input->dt) { case DNN_FLOAT: dt = TF_FLOAT; @@ -310,9 +315,9 @@ static int get_input_tf(void *model, DNNData *input, const char *input_name) // currently only NHWC is supported av_assert0(dims[0] == 1 || dims[0] == -1); - input->height = dims[1]; - input->width = dims[2]; - input->channels = dims[3]; + for (int i = 0; i < 4; i++) + input->dims[i] = dims[i]; + input->layout = DL_NHWC; return 0; } @@ -640,8 +645,8 @@ static int fill_model_input_tf(TFModel *tf_model, TFRequestItem *request) { } infer_request = request->infer_request; - input.height = task->in_frame->height; - input.width = task->in_frame->width; + input.dims[1] = task->in_frame->height; + input.dims[2] = task->in_frame->width; infer_request->tf_input = av_malloc(sizeof(TF_Output)); if (!infer_request->tf_input) { @@ -731,9 +736,12 @@ static void infer_completion_callback(void *args) { } for (uint32_t i = 0; i < task->nb_output; ++i) { - outputs[i].height = TF_Dim(infer_request->output_tensors[i], 1); - outputs[i].width = TF_Dim(infer_request->output_tensors[i], 2); - outputs[i].channels = TF_Dim(infer_request->output_tensors[i], 3); + outputs[i].dims[dnn_get_height_idx_by_layout(outputs[i].layout)] = + TF_Dim(infer_request->output_tensors[i], 1); + outputs[i].dims[dnn_get_width_idx_by_layout(outputs[i].layout)] = + TF_Dim(infer_request->output_tensors[i], 2); + outputs[i].dims[dnn_get_channel_idx_by_layout(outputs[i].layout)] = + TF_Dim(infer_request->output_tensors[i], 3); outputs[i].data = TF_TensorData(infer_request->output_tensors[i]); outputs[i].dt = (DNNDataType)TF_TensorType(infer_request->output_tensors[i]); } @@ -747,8 +755,10 @@ static void infer_completion_callback(void *args) { ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx); } } else { - task->out_frame->width = outputs[0].width; - task->out_frame->height = outputs[0].height; + task->out_frame->width = + outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)]; + task->out_frame->height = + outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)]; } break; case DFT_ANALYTICS_DETECT: diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c index ab656e8ed7..e5d6edb301 100644 --- a/libavfilter/dnn/dnn_io_proc.c +++ b/libavfilter/dnn/dnn_io_proc.c @@ -70,7 +70,7 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) dst_data = (void **)frame->data; linesize[0] = frame->linesize[0]; if (output->layout == DL_NCHW) { - middle_data = av_malloc(plane_size * output->channels); + middle_data = av_malloc(plane_size * output->dims[1]); if (!middle_data) { ret = AVERROR(ENOMEM); goto err; @@ -209,7 +209,7 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) src_data = (void **)frame->data; linesize[0] = frame->linesize[0]; if (input->layout == DL_NCHW) { - middle_data = av_malloc(plane_size * input->channels); + middle_data = av_malloc(plane_size * input->dims[1]); if (!middle_data) { ret = AVERROR(ENOMEM); goto err; @@ -346,6 +346,7 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index int ret = 0; enum AVPixelFormat fmt; int left, top, width, height; + int width_idx, height_idx; const AVDetectionBBoxHeader *header; const AVDetectionBBox *bbox; AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); @@ -364,6 +365,9 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index return AVERROR(ENOSYS); } + width_idx = dnn_get_width_idx_by_layout(input->layout); + height_idx = dnn_get_height_idx_by_layout(input->layout); + header = (const AVDetectionBBoxHeader *)sd->data; bbox = av_get_detection_bbox(header, bbox_index); @@ -374,17 +378,20 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index fmt = get_pixel_format(input); sws_ctx = sws_getContext(width, height, frame->format, - input->width, input->height, fmt, + input->dims[width_idx], + input->dims[height_idx], fmt, SWS_FAST_BILINEAR, NULL, NULL, NULL); if (!sws_ctx) { av_log(log_ctx, AV_LOG_ERROR, "Failed to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(frame->format), width, height, - av_get_pix_fmt_name(fmt), input->width, input->height); + av_get_pix_fmt_name(fmt), + input->dims[width_idx], + input->dims[height_idx]); return AVERROR(EINVAL); } - ret = av_image_fill_linesizes(linesizes, fmt, input->width); + ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]); if (ret < 0) { av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes"); sws_freeContext(sws_ctx); @@ -414,7 +421,7 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx) { struct SwsContext *sws_ctx; int linesizes[4]; - int ret = 0; + int ret = 0, width_idx, height_idx; enum AVPixelFormat fmt = get_pixel_format(input); /* (scale != 1 and scale != 0) or mean != 0 */ @@ -430,18 +437,23 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx) return AVERROR(ENOSYS); } + width_idx = dnn_get_width_idx_by_layout(input->layout); + height_idx = dnn_get_height_idx_by_layout(input->layout); + sws_ctx = sws_getContext(frame->width, frame->height, frame->format, - input->width, input->height, fmt, + input->dims[width_idx], + input->dims[height_idx], fmt, SWS_FAST_BILINEAR, NULL, NULL, NULL); if (!sws_ctx) { av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(frame->format), frame->width, frame->height, - av_get_pix_fmt_name(fmt), input->width, input->height); + av_get_pix_fmt_name(fmt), input->dims[width_idx], + input->dims[height_idx]); return AVERROR(EINVAL); } - ret = av_image_fill_linesizes(linesizes, fmt, input->width); + ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]); if (ret < 0) { av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes"); sws_freeContext(sws_ctx); diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h index 183d8418b2..852d88baa8 100644 --- a/libavfilter/dnn_interface.h +++ b/libavfilter/dnn_interface.h @@ -64,7 +64,7 @@ typedef enum { typedef struct DNNData{ void *data; - int width, height, channels; + int dims[4]; // dt and order together decide the color format DNNDataType dt; DNNColorOrder order; @@ -134,4 +134,19 @@ typedef struct DNNModule{ // Initializes DNNModule depending on chosen backend. const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx); +static inline int dnn_get_width_idx_by_layout(DNNLayout layout) +{ + return layout == DL_NHWC ? 2 : 3; +} + +static inline int dnn_get_height_idx_by_layout(DNNLayout layout) +{ + return layout == DL_NHWC ? 1 : 2; +} + +static inline int dnn_get_channel_idx_by_layout(DNNLayout layout) +{ + return layout == DL_NHWC ? 3 : 1; +} + #endif diff --git a/libavfilter/vf_dnn_classify.c b/libavfilter/vf_dnn_classify.c index e88e59d09c..d180c3b461 100644 --- a/libavfilter/vf_dnn_classify.c +++ b/libavfilter/vf_dnn_classify.c @@ -68,8 +68,8 @@ static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox uint32_t label_id; float confidence; AVFrameSideData *sd; - - if (output->channels <= 0) { + int output_size = output->dims[3] * output->dims[2] * output->dims[1]; + if (output_size <= 0) { return -1; } @@ -88,7 +88,7 @@ static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox classifications = output->data; label_id = 0; confidence= classifications[0]; - for (int i = 1; i < output->channels; i++) { + for (int i = 1; i < output_size; i++) { if (classifications[i] > confidence) { label_id = i; confidence= classifications[i]; diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c index 249cbba0f7..caccbf7a12 100644 --- a/libavfilter/vf_dnn_detect.c +++ b/libavfilter/vf_dnn_detect.c @@ -166,14 +166,14 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out scale_w = cell_w; scale_h = cell_h; } else { - if (output[output_index].height != output[output_index].width && - output[output_index].height == output[output_index].channels) { + if (output[output_index].dims[2] != output[output_index].dims[3] && + output[output_index].dims[2] == output[output_index].dims[1]) { is_NHWC = 1; - cell_w = output[output_index].height; - cell_h = output[output_index].channels; + cell_w = output[output_index].dims[2]; + cell_h = output[output_index].dims[1]; } else { - cell_w = output[output_index].width; - cell_h = output[output_index].height; + cell_w = output[output_index].dims[3]; + cell_h = output[output_index].dims[2]; } scale_w = ctx->scale_width; scale_h = ctx->scale_height; @@ -205,14 +205,14 @@ static int dnn_detect_parse_yolo_output(AVFrame *frame, DNNData *output, int out return AVERROR(EINVAL); } - if (output[output_index].channels * output[output_index].width * - output[output_index].height % (box_size * cell_w * cell_h)) { + if (output[output_index].dims[1] * output[output_index].dims[2] * + output[output_index].dims[3] % (box_size * cell_w * cell_h)) { av_log(filter_ctx, AV_LOG_ERROR, "wrong cell_w, cell_h or nb_classes\n"); return AVERROR(EINVAL); } - detection_boxes = output[output_index].channels * - output[output_index].height * - output[output_index].width / box_size / cell_w / cell_h; + detection_boxes = output[output_index].dims[1] * + output[output_index].dims[2] * + output[output_index].dims[3] / box_size / cell_w / cell_h; anchors = anchors + (detection_boxes * output_index * 2); /** @@ -373,18 +373,18 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outp int scale_w = ctx->scale_width; int scale_h = ctx->scale_height; - if (nb_outputs == 1 && output->width == 7) { - proposal_count = output->height; - detect_size = output->width; + if (nb_outputs == 1 && output->dims[3] == 7) { + proposal_count = output->dims[2]; + detect_size = output->dims[3]; detections = output->data; - } else if (nb_outputs == 2 && output[0].width == 5) { - proposal_count = output[0].height; - detect_size = output[0].width; + } else if (nb_outputs == 2 && output[0].dims[3] == 5) { + proposal_count = output[0].dims[2]; + detect_size = output[0].dims[3]; detections = output[0].data; labels = output[1].data; - } else if (nb_outputs == 2 && output[1].width == 5) { - proposal_count = output[1].height; - detect_size = output[1].width; + } else if (nb_outputs == 2 && output[1].dims[3] == 5) { + proposal_count = output[1].dims[2]; + detect_size = output[1].dims[3]; detections = output[1].data; labels = output[0].data; } else { @@ -821,15 +821,19 @@ static int config_input(AVFilterLink *inlink) AVFilterContext *context = inlink->dst; DnnDetectContext *ctx = context->priv; DNNData model_input; - int ret; + int ret, width_idx, height_idx; ret = ff_dnn_get_input(&ctx->dnnctx, &model_input); if (ret != 0) { av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n"); return ret; } - ctx->scale_width = model_input.width == -1 ? inlink->w : model_input.width; - ctx->scale_height = model_input.height == -1 ? inlink->h : model_input.height; + width_idx = dnn_get_width_idx_by_layout(model_input.layout); + height_idx = dnn_get_height_idx_by_layout(model_input.layout); + ctx->scale_width = model_input.dims[width_idx] == -1 ? inlink->w : + model_input.dims[width_idx]; + ctx->scale_height = model_input.dims[height_idx] == -1 ? inlink->h : + model_input.dims[height_idx]; return 0; } diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c index 6829e94585..0b70c8e024 100644 --- a/libavfilter/vf_dnn_processing.c +++ b/libavfilter/vf_dnn_processing.c @@ -77,22 +77,29 @@ static const enum AVPixelFormat pix_fmts[] = { "the frame's format %s does not match " \ "the model input channel %d\n", \ av_get_pix_fmt_name(fmt), \ - model_input->channels); + model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)]); static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLink *inlink) { AVFilterContext *ctx = inlink->dst; enum AVPixelFormat fmt = inlink->format; + int width_idx, height_idx; + width_idx = dnn_get_width_idx_by_layout(model_input->layout); + height_idx = dnn_get_height_idx_by_layout(model_input->layout); // the design is to add explicit scale filter before this filter - if (model_input->height != -1 && model_input->height != inlink->h) { + if (model_input->dims[height_idx] != -1 && + model_input->dims[height_idx] != inlink->h) { av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n", - model_input->height, inlink->h); + model_input->dims[height_idx], + inlink->h); return AVERROR(EIO); } - if (model_input->width != -1 && model_input->width != inlink->w) { + if (model_input->dims[width_idx] != -1 && + model_input->dims[width_idx] != inlink->w) { av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n", - model_input->width, inlink->w); + model_input->dims[width_idx], + inlink->w); return AVERROR(EIO); } if (model_input->dt != DNN_FLOAT) { @@ -103,7 +110,7 @@ static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLin switch (fmt) { case AV_PIX_FMT_RGB24: case AV_PIX_FMT_BGR24: - if (model_input->channels != 3) { + if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 3) { LOG_FORMAT_CHANNEL_MISMATCH(); return AVERROR(EIO); } @@ -116,7 +123,7 @@ static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLin case AV_PIX_FMT_YUV410P: case AV_PIX_FMT_YUV411P: case AV_PIX_FMT_NV12: - if (model_input->channels != 1) { + if (model_input->dims[dnn_get_channel_idx_by_layout(model_input->layout)] != 1) { LOG_FORMAT_CHANNEL_MISMATCH(); return AVERROR(EIO); }