//
// Created by ubuntu on 1/24/23.
//
#ifndef SEGMENT_SIMPLE_YOLOV8_SEG_HPP
#define SEGMENT_SIMPLE_YOLOV8_SEG_HPP
#include <fstream>
#include "common.hpp"
#include "NvInferPlugin.h"

using namespace seg;

class YOLOv8_seg
{
public:
	explicit YOLOv8_seg(const std::string& engine_file_path);
	~YOLOv8_seg();

	void make_pipe(bool warmup = true);
	void copy_from_Mat(const cv::Mat& image);
	void copy_from_Mat(const cv::Mat& image, cv::Size& size);
	void letterbox(
		const cv::Mat& image,
		cv::Mat& out,
		cv::Size& size
	);
	void infer();
	void postprocess(
		std::vector<Object>& objs,
		float score_thres = 0.25f,
		float iou_thres = 0.65f,
		int topk = 100,
		int seg_channels = 32,
		int seg_h = 160,
		int seg_w = 160
	);
	static void draw_objects(
		const cv::Mat& image,
		cv::Mat& res,
		const std::vector<Object>& objs,
		const std::vector<std::string>& CLASS_NAMES,
		const std::vector<std::vector<unsigned int>>& COLORS,
		const std::vector<std::vector<unsigned int>>& MASK_COLORS
	);
	int num_bindings;
	int num_inputs = 0;
	int num_outputs = 0;
	std::vector<Binding> input_bindings;
	std::vector<Binding> output_bindings;
	std::vector<void*> host_ptrs;
	std::vector<void*> device_ptrs;

	PreParam pparam;
private:
	nvinfer1::ICudaEngine* engine = nullptr;
	nvinfer1::IRuntime* runtime = nullptr;
	nvinfer1::IExecutionContext* context = nullptr;
	cudaStream_t stream = nullptr;
	Logger gLogger{ nvinfer1::ILogger::Severity::kERROR };

};

YOLOv8_seg::YOLOv8_seg(const std::string& engine_file_path)
{
	std::ifstream file(engine_file_path, std::ios::binary);
	assert(file.good());
	file.seekg(0, std::ios::end);
	auto size = file.tellg();
	file.seekg(0, std::ios::beg);
	char* trtModelStream = new char[size];
	assert(trtModelStream);
	file.read(trtModelStream, size);
	file.close();
	initLibNvInferPlugins(&this->gLogger, "");
	this->runtime = nvinfer1::createInferRuntime(this->gLogger);
	assert(this->runtime != nullptr);

	this->engine = this->runtime->deserializeCudaEngine(trtModelStream, size);
	assert(this->engine != nullptr);

	this->context = this->engine->createExecutionContext();

	assert(this->context != nullptr);
	cudaStreamCreate(&this->stream);
	this->num_bindings = this->engine->getNbBindings();

	for (int i = 0; i < this->num_bindings; ++i)
	{
		Binding binding;
		nvinfer1::Dims dims;
		nvinfer1::DataType dtype = this->engine->getBindingDataType(i);
		std::string name = this->engine->getBindingName(i);
		binding.name = name;
		binding.dsize = type_to_size(dtype);

		bool IsInput = engine->bindingIsInput(i);
		if (IsInput)
		{
			this->num_inputs += 1;
			dims = this->engine->getProfileDimensions(
				i,
				0,
				nvinfer1::OptProfileSelector::kMAX);
			binding.size = get_size_by_dims(dims);
			binding.dims = dims;
			this->input_bindings.push_back(binding);
			// set max opt shape
			this->context->setBindingDimensions(i, dims);

		}
		else
		{
			dims = this->context->getBindingDimensions(i);
			binding.size = get_size_by_dims(dims);
			binding.dims = dims;
			this->output_bindings.push_back(binding);
			this->num_outputs += 1;
		}
	}

}

YOLOv8_seg::~YOLOv8_seg()
{
	this->context->destroy();
	this->engine->destroy();
	this->runtime->destroy();
	cudaStreamDestroy(this->stream);
	for (auto& ptr : this->device_ptrs)
	{
		CHECK(cudaFree(ptr));
	}

	for (auto& ptr : this->host_ptrs)
	{
		CHECK(cudaFreeHost(ptr));
	}
}

void YOLOv8_seg::make_pipe(bool warmup)
{

	for (auto& bindings : this->input_bindings)
	{
		void* d_ptr;
		CHECK(cudaMallocAsync(
			&d_ptr,
			bindings.size * bindings.dsize,
			this->stream)
		);
		this->device_ptrs.push_back(d_ptr);
	}

	for (auto& bindings : this->output_bindings)
	{
		void* d_ptr, * h_ptr;
		size_t size = bindings.size * bindings.dsize;
		CHECK(cudaMallocAsync(
			&d_ptr,
			size,
			this->stream)
		);
		CHECK(cudaHostAlloc(
			&h_ptr,
			size,
			0)
		);
		this->device_ptrs.push_back(d_ptr);
		this->host_ptrs.push_back(h_ptr);
	}

	if (warmup)
	{
		for (int i = 0; i < 10; i++)
		{
			for (auto& bindings : this->input_bindings)
			{
				size_t size = bindings.size * bindings.dsize;
				void* h_ptr = malloc(size);
				memset(h_ptr, 0, size);
				CHECK(cudaMemcpyAsync(
					this->device_ptrs[0],
					h_ptr,
					size,
					cudaMemcpyHostToDevice,
					this->stream)
				);
				free(h_ptr);
			}
			this->infer();
		}
		printf("model warmup 10 times\n");

	}
}

void YOLOv8_seg::letterbox(
	const cv::Mat& image,
	cv::Mat& out,
	cv::Size& size
)
{
	const float inp_h = size.height;
	const float inp_w = size.width;
	float height = image.rows;
	float width = image.cols;

	float r = std::min(inp_h / height, inp_w / width);
	int padw = std::round(width * r);
	int padh = std::round(height * r);

	cv::Mat tmp;
	if ((int)width != padw || (int)height != padh)
	{
		cv::resize(
			image,
			tmp,
			cv::Size(padw, padh)
		);
	}
	else
	{
		tmp = image.clone();
	}

	float dw = inp_w - padw;
	float dh = inp_h - padh;

	dw /= 2.0f;
	dh /= 2.0f;
	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::copyMakeBorder(
		tmp,
		tmp,
		top,
		bottom,
		left,
		right,
		cv::BORDER_CONSTANT,
		{ 114, 114, 114 }
	);

	cv::dnn::blobFromImage(tmp,
		out,
		1 / 255.f,
		cv::Size(),
		cv::Scalar(0, 0, 0),
		true,
		false,
		CV_32F
	);
	this->pparam.ratio = 1 / r;
	this->pparam.dw = dw;
	this->pparam.dh = dh;
	this->pparam.height = height;
	this->pparam.width = width;;
}

void YOLOv8_seg::copy_from_Mat(const cv::Mat& image)
{
	cv::Mat nchw;
	auto& in_binding = this->input_bindings[0];
	auto width = in_binding.dims.d[3];
	auto height = in_binding.dims.d[2];
	cv::Size size{ width, height };
	this->letterbox(
		image,
		nchw,
		size
	);

	this->context->setBindingDimensions(
		0,
		nvinfer1::Dims
			{
				4,
				{ 1, 3, height, width }
			}
	);

	CHECK(cudaMemcpyAsync(
		this->device_ptrs[0],
		nchw.ptr<float>(),
		nchw.total() * nchw.elemSize(),
		cudaMemcpyHostToDevice,
		this->stream)
	);
}

void YOLOv8_seg::copy_from_Mat(const cv::Mat& image, cv::Size& size)
{
	cv::Mat nchw;
	this->letterbox(
		image,
		nchw,
		size
	);
	this->context->setBindingDimensions(
		0,
		nvinfer1::Dims
			{ 4,
			  { 1, 3, size.height, size.width }
			}
	);
	CHECK(cudaMemcpyAsync(
		this->device_ptrs[0],
		nchw.ptr<float>(),
		nchw.total() * nchw.elemSize(),
		cudaMemcpyHostToDevice,
		this->stream)
	);
}

void YOLOv8_seg::infer()
{

	this->context->enqueueV2(
		this->device_ptrs.data(),
		this->stream,
		nullptr
	);
	for (int i = 0; i < this->num_outputs; i++)
	{
		size_t osize = this->output_bindings[i].size * this->output_bindings[i].dsize;
		CHECK(cudaMemcpyAsync(this->host_ptrs[i],
			this->device_ptrs[i + this->num_inputs],
			osize,
			cudaMemcpyDeviceToHost,
			this->stream)
		);

	}
	cudaStreamSynchronize(this->stream);

}

void YOLOv8_seg::postprocess(std::vector<Object>& objs,
	float score_thres,
	float iou_thres,
	int topk,
	int seg_channels,
	int seg_h,
	int seg_w
)
{
	objs.clear();
	auto input_h = this->input_bindings[0].dims.d[2];
	auto input_w = this->input_bindings[0].dims.d[3];
	auto num_anchors = this->output_bindings[0].dims.d[1];
	auto num_channels = this->output_bindings[0].dims.d[2];

	auto& dw = this->pparam.dw;
	auto& dh = this->pparam.dh;
	auto& width = this->pparam.width;
	auto& height = this->pparam.height;
	auto& ratio = this->pparam.ratio;

	auto* output = static_cast<float*>(this->host_ptrs[0]);
	cv::Mat protos = cv::Mat(seg_channels, seg_h * seg_w, CV_32F,
		static_cast<float*>(this->host_ptrs[1]));

	std::vector<int> labels;
	std::vector<float> scores;
	std::vector<cv::Rect> bboxes;
	std::vector<cv::Mat> mask_confs;
	std::vector<int> indices;

	for (int i = 0; i < num_anchors; i++)
	{
		float* ptr = output + i * num_channels;
		float score = *(ptr + 4);
		if (score > score_thres)
		{
			float x0 = *ptr++ - dw;
			float y0 = *ptr++ - dh;
			float x1 = *ptr++ - dw;
			float y1 = *ptr++ - dh;

			x0 = clamp(x0 * ratio, 0.f, width);
			y0 = clamp(y0 * ratio, 0.f, height);
			x1 = clamp(x1 * ratio, 0.f, width);
			y1 = clamp(y1 * ratio, 0.f, height);

			int label = *(++ptr);
			cv::Mat mask_conf = cv::Mat(1, seg_channels, CV_32F, ++ptr);
			mask_confs.push_back(mask_conf);
			labels.push_back(label);
			scores.push_back(score);
			bboxes.push_back(cv::Rect_<float>(x0, y0, x1 - x0, y1 - y0));

		}
	}

#if defined(BATCHED_NMS)
	cv::dnn::NMSBoxesBatched(
		bboxes,
		scores,
		labels,
		score_thres,
		iou_thres,
		indices
	);
#else
	cv::dnn::NMSBoxes(
		bboxes,
		scores,
		score_thres,
		iou_thres,
		indices
	);
#endif

	cv::Mat masks;
	int cnt = 0;
	for (auto& i : indices)
	{
		if (cnt >= topk)
		{
			break;
		}
		cv::Rect tmp = bboxes[i];
		Object obj;
		obj.label = labels[i];
		obj.rect = tmp;
		obj.prob = scores[i];
		masks.push_back(mask_confs[i]);
		objs.push_back(obj);
		cnt += 1;
	}

	cv::Mat matmulRes = (masks * protos).t();
	cv::Mat maskMat = matmulRes.reshape(indices.size(), { seg_w, seg_h });

	std::vector<cv::Mat> maskChannels;
	cv::split(maskMat, maskChannels);
	int scale_dw = dw / input_w * seg_w;
	int scale_dh = dh / input_h * seg_h;

	cv::Rect roi(
		scale_dw,
		scale_dh,
		seg_w - 2 * scale_dw,
		seg_h - 2 * scale_dh);

	for (int i = 0; i < indices.size(); i++)
	{
		cv::Mat dest, mask;
		cv::exp(-maskChannels[i], dest);
		dest = 1.0 / (1.0 + dest);
		dest = dest(roi);
		cv::resize(
			dest,
			mask,
			cv::Size((int)width, (int)height),
			cv::INTER_LINEAR
		);
		objs[i].boxMask = mask(objs[i].rect) > 0.5f;
	}

}

void YOLOv8_seg::draw_objects(const cv::Mat& image,
	cv::Mat& res,
	const std::vector<Object>& objs,
	const std::vector<std::string>& CLASS_NAMES,
	const std::vector<std::vector<unsigned int>>& COLORS,
	const std::vector<std::vector<unsigned int>>& MASK_COLORS
)
{
	res = image.clone();
	cv::Mat mask = image.clone();
	for (auto& obj : objs)
	{
		int idx = obj.label;
		cv::Scalar color = cv::Scalar(
			COLORS[idx][0],
			COLORS[idx][1],
			COLORS[idx][2]
		);
		cv::Scalar mask_color = cv::Scalar(
			MASK_COLORS[idx % 20][0],
			MASK_COLORS[idx % 20][1],
			MASK_COLORS[idx % 20][2]
		);
		cv::rectangle(
			res,
			obj.rect,
			color,
			2
		);

		char text[256];
		sprintf(
			text,
			"%s %.1f%%",
			CLASS_NAMES[idx].c_str(),
			obj.prob * 100
		);
		mask(obj.rect).setTo(mask_color, obj.boxMask);

		int baseLine = 0;
		cv::Size label_size = cv::getTextSize(
			text,
			cv::FONT_HERSHEY_SIMPLEX,
			0.4,
			1,
			&baseLine
		);

		int x = (int)obj.rect.x;
		int y = (int)obj.rect.y + 1;

		if (y > res.rows)
			y = res.rows;

		cv::rectangle(
			res,
			cv::Rect(x, y, label_size.width, label_size.height + baseLine),
			{ 0, 0, 255 },
			-1
		);

		cv::putText(
			res,
			text,
			cv::Point(x, y + label_size.height),
			cv::FONT_HERSHEY_SIMPLEX,
			0.4,
			{ 255, 255, 255 },
			1
		);
	}
	cv::addWeighted(
		res,
		0.5,
		mask,
		0.8,
		1,
		res
	);
}
#endif //SEGMENT_SIMPLE_YOLOV8_SEG_HPP