text: improve DL-based samples

7 years ago · fd2e37da56
parent 27961cd8cc
commit fd2e37da56
4 changed files with 81 additions and 58 deletions
--- a/modules/text/include/opencv2/text/textDetector.hpp
+++ b/modules/text/include/opencv2/text/textDetector.hpp
@ -54,9 +54,15 @@ public:

    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
-    @param detectMultiscale if true, multiple scales of the input image will be used as network input
+    @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
+    recommended in @cite LiaoSBWL17 to achieve the best quality.
    */
-    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale = false);
+    static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
+                                               std::vector<Size> detectionSizes);
+    /**
+      @overload
+    */
+    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
 };

 //! @}
--- a/modules/text/samples/text_recognition_cnn.cpp
+++ b/modules/text/samples/text_recognition_cnn.cpp
@ -1,6 +1,7 @@
 #include <opencv2/text.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>

 #include  <iostream>
 #include  <fstream>
@ -29,22 +30,27 @@ bool fileExists (const string& filename)
    return f.good();
 }

-void textbox_draw(Mat src, vector<Rect>& groups, vector<float>& probs, float thres)
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
 {
-    for (size_t i = 0; i < groups.size(); i++)
+    for (size_t i = 0; i < indexes.size(); i++)
    {
-        if(probs[i] > thres)
+        if (src.type() == CV_8UC3)
        {
-            if (src.type() == CV_8UC3)
-            {
-                rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA);
-                String label = format("%.2f", probs[i]);
-                cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n";
-                putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA);
-            }
-            else
-                rectangle(src, groups[i], Scalar( 255 ), 3, 8 );
+            Rect currrentBox = groups[indexes[i]];
+            rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+            String label = format("%.2f", probs[indexes[i]]);
+            std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n";
+
+            int baseLine = 0;
+            Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+            int yLeftBottom = std::max(currrentBox.y, labelSize.height);
+            rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height),
+                      Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+            putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
        }
+        else
+            rectangle(src, groups[indexes[i]], Scalar( 255 ), 3, 8 ); // i walks indexes (kept boxes), so map it back into groups
    }
 }

@ -73,33 +79,41 @@ int main(int argc, const char * argv[])

    cout << "Starting Text Box Demo" << endl;
    Ptr<text::TextDetectorCNN> textSpotter =
-            text::TextDetectorCNN::create(modelArch, moddelWeights, false);
+            text::TextDetectorCNN::create(modelArch, moddelWeights);

    vector<Rect> bbox;
    vector<float> outProbabillities;
    textSpotter->detect(image, bbox, outProbabillities);
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);

-    float prob_threshold = 0.6f;
    Mat image_copy = image.clone();
-    textbox_draw(image_copy, bbox, outProbabillities, prob_threshold);
+    textbox_draw(image_copy, bbox, outProbabillities, indexes);
    imshow("Text detection", image_copy);
    image_copy = image.clone();

    Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
            text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");

-    for(size_t i = 0; i < bbox.size(); i++)
+    for(size_t i = 0; i < indexes.size(); i++)
    {
-        if(outProbabillities[i] > prob_threshold)
-        {
-            Mat wordImg;
-            cvtColor(image(bbox[i]), wordImg, COLOR_BGR2GRAY);
-            string word;
-            vector<float> confs;
-            wordSpotter->run(wordImg, word, NULL, NULL, &confs);
-            rectangle(image_copy, bbox[i], Scalar(0, 255, 255), 1, LINE_AA);
-            putText(image_copy, word, bbox[i].tl(), FONT_HERSHEY_PLAIN, 1, Scalar(0, 0, 255), 1, LINE_AA);
-        }
+        Mat wordImg;
+        cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY);
+        string word;
+        vector<float> confs;
+        wordSpotter->run(wordImg, word, NULL, NULL, &confs);
+
+        Rect currrentBox = bbox[indexes[i]];
+        rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+
+        int baseLine = 0;
+        Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currrentBox.y, labelSize.height);
+        rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height),
+                  Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+
    }
    imshow("Text recognition", image_copy);
    cout << "Recognition finished. Press any key to exit.\n";
--- a/modules/text/samples/textbox_demo.cpp
+++ b/modules/text/samples/textbox_demo.cpp
@ -1,6 +1,7 @@
 #include <opencv2/text.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>

 #include  <sstream>
 #include  <iostream>
@ -27,22 +28,27 @@ bool fileExists (const std::string& filename)
    return f.good();
 }

-void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, float thres)
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
 {
-    for (size_t i = 0; i < groups.size(); i++)
+    for (size_t i = 0; i < indexes.size(); i++)
    {
-        if(probs[i] > thres)
+        if (src.type() == CV_8UC3)
        {
-            if (src.type() == CV_8UC3)
-            {
-                rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA);
-                String label = format("%.2f", probs[i]);
-                std::cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n";
-                putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA);
-            }
-            else
-                rectangle(src, groups[i], Scalar( 255 ), 3, 8 );
+            Rect currrentBox = groups[indexes[i]];
+            rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+            String label = format("%.2f", probs[indexes[i]]);
+            std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n";
+
+            int baseLine = 0;
+            Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+            int yLeftBottom = std::max(currrentBox.y, labelSize.height);
+            rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height),
+                      Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+            putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
        }
+        else
+            rectangle(src, groups[indexes[i]], Scalar( 255 ), 3, 8 ); // i walks indexes (kept boxes), so map it back into groups
    }
 }

@ -62,7 +68,7 @@ int main(int argc, const char * argv[])

    if (!fileExists(modelArch) || !fileExists(moddelWeights))
    {
-        std::cout<<getHelpStr(argv[0]);
+        std::cout << getHelpStr(argv[0]);
        std::cout << "Model files not found in the current directory. Aborting!" << std::endl;
        exit(1);
    }
@ -71,13 +77,16 @@ int main(int argc, const char * argv[])

    std::cout << "Starting Text Box Demo" << std::endl;
    Ptr<text::TextDetectorCNN> textSpotter =
-            text::TextDetectorCNN::create(modelArch, moddelWeights, false);
+            text::TextDetectorCNN::create(modelArch, moddelWeights);

    std::vector<Rect> bbox;
    std::vector<float> outProbabillities;
    textSpotter->detect(image, bbox, outProbabillities);

-    textbox_draw(image, bbox, outProbabillities, 0.5f);
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes);
+
+    textbox_draw(image, bbox, outProbabillities, indexes);

    imshow("TextBox Demo",image);
    std::cout << "Done!" << std::endl << std::endl;
--- a/modules/text/src/text_detectorCNN.cpp
+++ b/modules/text/src/text_detectorCNN.cpp
@ -23,8 +23,6 @@ protected:
    Net net_;
    std::vector<Size> sizes_;
    int inputChannelCount_;
-    bool detectMultiscale_;
-

    void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,
                               std::vector<Rect>& Bbox, std::vector<float>& confidence, Size inputShape)
@ -54,21 +52,12 @@ protected:
    }

 public:
-    TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale) :
-        detectMultiscale_(detectMultiscale)
+    TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector<Size> detectionSizes) :
+        sizes_(detectionSizes)
    {
        net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename);
        CV_Assert(!net_.empty());
        inputChannelCount_ = 3;
-        sizes_.push_back(Size(700, 700));
-
-        if(detectMultiscale_)
-        {
-            sizes_.push_back(Size(300, 300));
-            sizes_.push_back(Size(700,500));
-            sizes_.push_back(Size(700,300));
-            sizes_.push_back(Size(1600,1600));
-        }
    }

    void detect(InputArray inputImage_, std::vector<Rect>& Bbox, std::vector<float>& confidence)
@ -92,9 +81,14 @@ public:
     }
 };

-Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, bool detectMultiscale)
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector<Size> detectionSizes)
+{
+    return makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectionSizes); // the constructor stores detectionSizes in sizes_
+}
+
+Ptr<TextDetectorCNN> TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename)
 {
-    return makePtr<TextDetectorCNNImpl>(modelArchFilename, modelWeightsFilename, detectMultiscale);
+    return create(modelArchFilename, modelWeightsFilename, std::vector<Size>(1, Size(300, 300))); // default: a single 300x300 detection size
 }
 } //namespace text
 } //namespace cv