opencv/samples/dnn/scene_text_spotting.cpp

#include <iostream>
#include <fstream>

#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

std::string keys =
        "{ help  h                          | | Print help message. }"
        "{ inputImage i                     | | Path to an input image. Skip this argument to capture frames from a camera. }"
        "{ detModelPath dmp                 | | Path to a binary .onnx model for detection. "
            "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
        "{ recModelPath rmp                 | | Path to a binary .onnx model for recognition. "
            "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
        "{ inputHeight ih                   |736| image height of the model input. It should be multiple by 32.}"
        "{ inputWidth iw                    |736| image width of the model input. It should be multiple by 32.}"
        "{ RGBInput rgb                     |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"
        "{ binaryThreshold bt               |0.3| Confidence threshold of the binary map. }"
        "{ polygonThreshold pt              |0.5| Confidence threshold of polygons. }"
        "{ maxCandidate max                 |200| Max candidates of polygons. }"
        "{ unclipRatio ratio                |2.0| unclip ratio. }"
        "{ vocabularyPath vp                | alphabet_36.txt | Path to benchmarks for evaluation. "
            "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";

void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
bool sortPts(const Point& p1, const Point& p2);

int main(int argc, char** argv)
{
    // Parse arguments
    CommandLineParser parser(argc, argv, keys);
    parser.about("Use this script to run an end-to-end inference sample of textDetectionModel and textRecognitionModel APIs\n"
                 "Use -h for more information");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    float binThresh = parser.get<float>("binaryThreshold");
    float polyThresh = parser.get<float>("polygonThreshold");
    uint maxCandidates = parser.get<uint>("maxCandidate");
    String detModelPath = parser.get<String>("detModelPath");
    String recModelPath = parser.get<String>("recModelPath");
    String vocPath = parser.get<String>("vocabularyPath");
    double unclipRatio = parser.get<double>("unclipRatio");
    int height = parser.get<int>("inputHeight");
    int width = parser.get<int>("inputWidth");
    int imreadRGB = parser.get<int>("RGBInput");

    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }

    // Load networks
    CV_Assert(!detModelPath.empty());
    TextDetectionModel_DB detector(detModelPath);
    detector.setBinaryThreshold(binThresh)
            .setPolygonThreshold(polyThresh)
            .setUnclipRatio(unclipRatio)
            .setMaxCandidates(maxCandidates);

    CV_Assert(!recModelPath.empty());
    TextRecognitionModel recognizer(recModelPath);

    // Load vocabulary
    CV_Assert(!vocPath.empty());
    std::ifstream vocFile;
    vocFile.open(samples::findFile(vocPath));
    CV_Assert(vocFile.is_open());
    String vocLine;
    std::vector<String> vocabulary;
    while (std::getline(vocFile, vocLine)) {
        vocabulary.push_back(vocLine);
    }
    recognizer.setVocabulary(vocabulary);
    recognizer.setDecodeType("CTC-greedy");

    // Parameters for Detection
    double detScale = 1.0 / 255.0;
    Size detInputSize = Size(width, height);
    Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);
    detector.setInputParams(detScale, detInputSize, detMean);

    // Parameters for Recognition
    double recScale = 1.0 / 127.5;
    Scalar recMean = Scalar(127.5);
    Size recInputSize = Size(100, 32);
    recognizer.setInputParams(recScale, recInputSize, recMean);

    // Create a window
    static const std::string winName = "Text_Spotting";

    // Input data
    Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
    std::cout << frame.size << std::endl;

    // Inference
    std::vector< std::vector<Point> > detResults;
    detector.detect(frame, detResults);

    if (detResults.size() > 0) {
        // Text Recognition
        Mat recInput;
        if (!imreadRGB) {
            cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
        } else {
            recInput = frame;
        }
        std::vector< std::vector<Point> > contours;
        for (uint i = 0; i < detResults.size(); i++)
        {
            const auto& quadrangle = detResults[i];
            CV_CheckEQ(quadrangle.size(), (size_t)4, "");

            contours.emplace_back(quadrangle);

            std::vector<Point2f> quadrangle_2f;
            for (int j = 0; j < 4; j++)
                quadrangle_2f.emplace_back(quadrangle[j]);

            // Transform and Crop
            Mat cropped;
            fourPointsTransform(recInput, &quadrangle_2f[0], cropped);

            std::string recognitionResult = recognizer.recognize(cropped);
            std::cout << i << ": '" << recognitionResult << "'" << std::endl;

            putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
        }
        polylines(frame, contours, true, Scalar(0, 255, 0), 2);
    } else {
        std::cout << "No Text Detected." << std::endl;
    }
    imshow(winName, frame);
    waitKey();

    return 0;
}

void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
{
    const Size outputSize = Size(100, 32);

    Point2f targetVertices[4] = {
        Point(0, outputSize.height - 1),
        Point(0, 0),
        Point(outputSize.width - 1, 0),
        Point(outputSize.width - 1, outputSize.height - 1)
    };
    Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);

    warpPerspective(frame, result, rotationMatrix, outputSize);

#if 0
    imshow("roi", result);
    waitKey();
#endif
}

bool sortPts(const Point& p1, const Point& p2)
{
    return p1.x < p2.x;
}
Merge pull request #17570 from HannibalAPE:text_det_recog_demo [GSoC] High Level API and Samples for Scene Text Detection and Recognition * APIs and samples for scene text detection and recognition * update APIs and tutorial for Text Detection and Recognition * API updates: (1) put decodeType into struct Voc (2) optimize the post-processing of DB * sample update: (1) add transformation into scene_text_spotting.cpp (2) modify text_detection.cpp with API update * update tutorial * simplify text recognition API update tutorial * update impl usage in recognize() and detect() * dnn: refactoring public API of TextRecognitionModel/TextDetectionModel * update provided models update opencv.bib * dnn: adjust text rectangle angle * remove points ordering operation in model.cpp * update gts of DB test in test_model.cpp * dnn: ensure to keep text rectangle angle - avoid 90/180 degree turns * dnn(text): use quadrangle result in TextDetectionModel API * dnn: update Text Detection API (1) keep points' order consistent with (bl, tl, tr, br) in unclip (2) update contourScore with boundingRect 4 years ago			`#include <iostream>`
			`#include <fstream>`

			`#include <opencv2/imgproc.hpp>`
			`#include <opencv2/highgui.hpp>`
			`#include <opencv2/dnn/dnn.hpp>`

			`using namespace cv;`
			`using namespace cv::dnn;`

			`std::string keys =`
			`"{ help h \| \| Print help message. }"`
			`"{ inputImage i \| \| Path to an input image. Skip this argument to capture frames from a camera. }"`
			`"{ detModelPath dmp \| \| Path to a binary .onnx model for detection. "`
			`"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"`
			`"{ recModelPath rmp \| \| Path to a binary .onnx model for recognition. "`
			`"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"`
			`"{ inputHeight ih \|736\| image height of the model input. It should be multiple by 32.}"`
			`"{ inputWidth iw \|736\| image width of the model input. It should be multiple by 32.}"`
			`"{ RGBInput rgb \|0\| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }"`
			`"{ binaryThreshold bt \|0.3\| Confidence threshold of the binary map. }"`
			`"{ polygonThreshold pt \|0.5\| Confidence threshold of polygons. }"`
			`"{ maxCandidate max \|200\| Max candidates of polygons. }"`
			`"{ unclipRatio ratio \|2.0\| unclip ratio. }"`
			`"{ vocabularyPath vp \| alphabet_36.txt \| Path to benchmarks for evaluation. "`
			`"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";`

			`void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);`
			`bool sortPts(const Point& p1, const Point& p2);`

			`int main(int argc, char** argv)`
			`{`
			`// Parse arguments`
			`CommandLineParser parser(argc, argv, keys);`
			`parser.about("Use this script to run an end-to-end inference sample of textDetectionModel and textRecognitionModel APIs\n"`
			`"Use -h for more information");`
			`if (argc == 1 \|\| parser.has("help"))`
			`{`
			`parser.printMessage();`
			`return 0;`
			`}`

			`float binThresh = parser.get<float>("binaryThreshold");`
			`float polyThresh = parser.get<float>("polygonThreshold");`
			`uint maxCandidates = parser.get<uint>("maxCandidate");`
			`String detModelPath = parser.get<String>("detModelPath");`
			`String recModelPath = parser.get<String>("recModelPath");`
			`String vocPath = parser.get<String>("vocabularyPath");`
			`double unclipRatio = parser.get<double>("unclipRatio");`
			`int height = parser.get<int>("inputHeight");`
			`int width = parser.get<int>("inputWidth");`
			`int imreadRGB = parser.get<int>("RGBInput");`

			`if (!parser.check())`
			`{`
			`parser.printErrors();`
			`return 1;`
			`}`

			`// Load networks`
			`CV_Assert(!detModelPath.empty());`
			`TextDetectionModel_DB detector(detModelPath);`
			`detector.setBinaryThreshold(binThresh)`
			`.setPolygonThreshold(polyThresh)`
			`.setUnclipRatio(unclipRatio)`
			`.setMaxCandidates(maxCandidates);`

			`CV_Assert(!recModelPath.empty());`
			`TextRecognitionModel recognizer(recModelPath);`

			`// Load vocabulary`
			`CV_Assert(!vocPath.empty());`
			`std::ifstream vocFile;`
			`vocFile.open(samples::findFile(vocPath));`
			`CV_Assert(vocFile.is_open());`
			`String vocLine;`
			`std::vector<String> vocabulary;`
			`while (std::getline(vocFile, vocLine)) {`
			`vocabulary.push_back(vocLine);`
			`}`
			`recognizer.setVocabulary(vocabulary);`
			`recognizer.setDecodeType("CTC-greedy");`

			`// Parameters for Detection`
			`double detScale = 1.0 / 255.0;`
			`Size detInputSize = Size(width, height);`
			`Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);`
			`detector.setInputParams(detScale, detInputSize, detMean);`

			`// Parameters for Recognition`
			`double recScale = 1.0 / 127.5;`
			`Scalar recMean = Scalar(127.5);`
			`Size recInputSize = Size(100, 32);`
			`recognizer.setInputParams(recScale, recInputSize, recMean);`

			`// Create a window`
			`static const std::string winName = "Text_Spotting";`

			`// Input data`
			`Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));`
			`std::cout << frame.size << std::endl;`

			`// Inference`
			`std::vector< std::vector<Point> > detResults;`
			`detector.detect(frame, detResults);`

			`if (detResults.size() > 0) {`
			`// Text Recognition`
			`Mat recInput;`
			`if (!imreadRGB) {`
			`cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);`
			`} else {`
			`recInput = frame;`
			`}`
			`std::vector< std::vector<Point> > contours;`
			`for (uint i = 0; i < detResults.size(); i++)`
			`{`
			`const auto& quadrangle = detResults[i];`
			`CV_CheckEQ(quadrangle.size(), (size_t)4, "");`

			`contours.emplace_back(quadrangle);`

			`std::vector<Point2f> quadrangle_2f;`
			`for (int j = 0; j < 4; j++)`
			`quadrangle_2f.emplace_back(quadrangle[j]);`

			`// Transform and Crop`
			`Mat cropped;`
			`fourPointsTransform(recInput, &quadrangle_2f[0], cropped);`

			`std::string recognitionResult = recognizer.recognize(cropped);`
			`std::cout << i << ": '" << recognitionResult << "'" << std::endl;`

			`putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);`
			`}`
			`polylines(frame, contours, true, Scalar(0, 255, 0), 2);`
			`} else {`
			`std::cout << "No Text Detected." << std::endl;`
			`}`
			`imshow(winName, frame);`
			`waitKey();`

			`return 0;`
			`}`

			`void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)`
			`{`
			`const Size outputSize = Size(100, 32);`

			`Point2f targetVertices[4] = {`
			`Point(0, outputSize.height - 1),`
			`Point(0, 0),`
			`Point(outputSize.width - 1, 0),`
			`Point(outputSize.width - 1, outputSize.height - 1)`
			`};`
			`Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);`

			`warpPerspective(frame, result, rotationMatrix, outputSize);`

			`#if 0`
			`imshow("roi", result);`
			`waitKey();`
			`#endif`
			`}`

			`bool sortPts(const Point& p1, const Point& p2)`
			`{`
			`return p1.x < p2.x;`
			`}`