From 42fc4892d50f4e8356f8284ebae1004f7a994f82 Mon Sep 17 00:00:00 2001 From: Kumataro Date: Tue, 21 Mar 2023 13:59:29 +0900 Subject: [PATCH] text: change default char_whitelist parameter. --- modules/text/include/opencv2/text/ocr.hpp | 6 ++++-- modules/text/src/ocr_tesseract.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index c1e0d1719..bae9642a7 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -153,14 +153,16 @@ public: @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the system's default directory. @param language an ISO 639-3 code or NULL will default to "eng". - @param char_whitelist specifies the list of characters used for recognition. NULL defaults to - "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". + @param char_whitelist specifies the list of characters used for recognition. NULL defaults to "" + (All characters will be used for recognition). @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by default tesseract::OEM_DEFAULT is used. See the tesseract-ocr API documentation for other possible values. @param psmode tesseract-ocr offers different Page Segmentation Modes (PSM) tesseract::PSM_AUTO (fully automatic layout analysis) is used. See the tesseract-ocr API documentation for other possible values. + + @note The char_whitelist default is changed after OpenCV 4.7.0/3.4.19 from "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" to "". 
*/ CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, const char* char_whitelist=NULL, int oem=OEM_DEFAULT, int psmode=PSM_AUTO); diff --git a/modules/text/src/ocr_tesseract.cpp b/modules/text/src/ocr_tesseract.cpp index d04c698e1..90a29ea42 100644 --- a/modules/text/src/ocr_tesseract.cpp +++ b/modules/text/src/ocr_tesseract.cpp @@ -163,10 +163,12 @@ public: tesseract::PageSegMode pagesegmode = (tesseract::PageSegMode)psmode; tess.SetPageSegMode(pagesegmode); + // tessedit_char_whitelist default changes from [0-9a-zA-Z] to "". + // See https://github.com/opencv/opencv_contrib/issues/3457 if(char_whitelist != NULL) tess.SetVariable("tessedit_char_whitelist", char_whitelist); else - tess.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); + tess.SetVariable("tessedit_char_whitelist", ""); tess.SetVariable("save_best_choices", "T"); #else